diff --git a/.gitattributes b/.gitattributes
index 1a3015ec..45c80a14 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -7,3 +7,4 @@
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.avi filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d03bb411..6032b423 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,17 +3,31 @@ repos:
     rev: 4.0.0
     hooks:
       - id: flake8
-        exclude: thirdparty/|examples/
+        exclude: |
+          (?x)^(
+          thirdparty/|
+          examples/|
+          modelscope/utils/ast_index_file.py
+          )$
   - repo: https://github.com/PyCQA/isort.git
     rev: 4.3.21
     hooks:
       - id: isort
-        exclude: examples
+        exclude: |
+          (?x)^(
+          examples/|
+          modelscope/utils/ast_index_file.py
+          )$
   - repo: https://github.com/pre-commit/mirrors-yapf.git
     rev: v0.30.0
     hooks:
      - id: yapf
-        exclude: thirdparty/|examples/
+        exclude: |
+          (?x)^(
+          thirdparty/|
+          examples/|
+          modelscope/utils/ast_index_file.py
+          )$
   - repo: https://github.com/pre-commit/pre-commit-hooks.git
     rev: v3.1.0
     hooks:
diff --git a/.pre-commit-config_local.yaml b/.pre-commit-config_local.yaml
index 0b2e2f39..d1c02b08 100644
--- a/.pre-commit-config_local.yaml
+++ b/.pre-commit-config_local.yaml
@@ -3,17 +3,31 @@ repos:
     rev: 4.0.0
     hooks:
      - id: flake8
-        exclude: thirdparty/|examples/
+        exclude: |
+          (?x)^(
+          thirdparty/|
+          examples/|
+          modelscope/utils/ast_index_file.py
+          )$
   - repo: /home/admin/pre-commit/isort
     rev: 4.3.21
     hooks:
      - id: isort
-        exclude: examples
+        exclude: |
+          (?x)^(
+          examples/|
+          modelscope/utils/ast_index_file.py
+          )$
   - repo: /home/admin/pre-commit/mirrors-yapf
     rev: v0.30.0
     hooks:
      - id: yapf
-        exclude: thirdparty/|examples/
+        exclude: |
+          (?x)^(
+          thirdparty/|
+          examples/|
+          modelscope/utils/ast_index_file.py
+          )$
   - repo: /home/admin/pre-commit/pre-commit-hooks
     rev: v3.1.0
     hooks:
diff --git a/MANIFEST.in b/MANIFEST.in
index 665d7e90..3cd79b03 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1 @@
-recursive-include modelscope/configs *.py
+recursive-include modelscope/configs *.py *.cu *.h *.cpp
diff --git a/data/test/audios/mix_speech.wav b/data/test/audios/mix_speech.wav
new file mode 100644
index 00000000..b200e668
--- /dev/null
+++ b/data/test/audios/mix_speech.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34c2f1867f7882614b7087f2fd2acb722d0f520a2ec50b2d116d5b3f0c05f84b
+size 141134
diff --git a/data/test/audios/s1_speech.wav b/data/test/audios/s1_speech.wav
new file mode 100644
index 00000000..3901fade
--- /dev/null
+++ b/data/test/audios/s1_speech.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:437b1064a0e38219a9043e25e4761c9f1161c0431636dcea159b44524e0f34eb
+size 141134
diff --git a/data/test/audios/s2_speech.wav b/data/test/audios/s2_speech.wav
new file mode 100644
index 00000000..bd1601d3
--- /dev/null
+++ b/data/test/audios/s2_speech.wav
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1eb51be6751b35aa521866ef0cd1caa64e39451cd7f4b22dee5c1cb7e3e43d5
+size 141134
diff --git a/data/test/images/audrey_hepburn.jpg b/data/test/images/audrey_hepburn.jpg
new file mode 100644
index 00000000..a5afb854
--- /dev/null
+++ b/data/test/images/audrey_hepburn.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da5cf2f3318e61cd38193af374b21a2dec0e90f2aa0e25b3b1825488eadbdc9d
+size 97191
diff --git a/data/test/images/banana.jpg b/data/test/images/banana.jpg
new file mode 100644
index 00000000..ab830689
---
/dev/null +++ b/data/test/images/banana.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63cafb24e856c58dd01797333e1e2b895815bc48836cb2dfce937ad10222600b +size 31191 diff --git a/data/test/images/blurry.jpg b/data/test/images/blurry.jpg new file mode 100644 index 00000000..d53c1394 --- /dev/null +++ b/data/test/images/blurry.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71cf8e7d24ab067920473a4ce0b5e440c12bb0dd9c2ef6373e2474c796678e2e +size 48850 diff --git a/data/test/images/face_liveness_ir.jpg b/data/test/images/face_liveness_ir.jpg new file mode 100644 index 00000000..067bea11 --- /dev/null +++ b/data/test/images/face_liveness_ir.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cbd55923de4bbe90f5d098f607f2cd966db45892be016198609cafe268fde49 +size 46551 diff --git a/data/test/images/face_liveness_rgb.png b/data/test/images/face_liveness_rgb.png new file mode 100644 index 00000000..eb08b60e --- /dev/null +++ b/data/test/images/face_liveness_rgb.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e729ff6b0c0cd95091d9b9df2a50536ea3175ea26427a996090a3f7cc188a2 +size 22792 diff --git a/data/test/images/facefusion_template.jpg b/data/test/images/facefusion_template.jpg new file mode 100644 index 00000000..60a8a6cb --- /dev/null +++ b/data/test/images/facefusion_template.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e11e3558040246fc6d84bf87afdb016228172893f475f843dedbdcda5092a3d +size 181713 diff --git a/data/test/images/facefusion_user.jpg b/data/test/images/facefusion_user.jpg new file mode 100644 index 00000000..b8d0b5df --- /dev/null +++ b/data/test/images/facefusion_user.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82e688d2eb2755ceb0b0051d7129f6e94e6e5fe57f68727e41cd0c1e909b89c +size 11143 diff --git a/data/test/images/image_detection/annotations/coco_sample.json b/data/test/images/image_detection/annotations/coco_sample.json new file mode 100644 index 00000000..bad7d807 --- /dev/null +++ b/data/test/images/image_detection/annotations/coco_sample.json @@ -0,0 +1 @@ +{"categories": [{"supercategory": "person", "id": 1, "name": "person"}, {"supercategory": "vehicle", "id": 2, "name": "bicycle"}, {"supercategory": "vehicle", "id": 3, "name": "car"}, {"supercategory": "vehicle", "id": 4, "name": "motorcycle"}, {"supercategory": "vehicle", "id": 5, "name": "airplane"}, {"supercategory": "vehicle", "id": 6, "name": "bus"}, {"supercategory": "vehicle", "id": 7, "name": "train"}, {"supercategory": "vehicle", "id": 8, "name": "truck"}, {"supercategory": "vehicle", "id": 9, "name": "boat"}, {"supercategory": "outdoor", "id": 10, "name": "traffic light"}, {"supercategory": "outdoor", "id": 11, "name": "fire hydrant"}, {"supercategory": "outdoor", "id": 13, "name": "stop sign"}, {"supercategory": "outdoor", "id": 14, "name": "parking meter"}, {"supercategory": "outdoor", "id": 15, "name": "bench"}, {"supercategory": "animal", "id": 16, "name": "bird"}, {"supercategory": "animal", "id": 17, "name": "cat"}, {"supercategory": "animal", "id": 18, "name": "dog"}, {"supercategory": "animal", "id": 19, "name": "horse"}, {"supercategory": "animal", "id": 20, "name": "sheep"}, {"supercategory": "animal", "id": 21, "name": "cow"}, {"supercategory": "animal", "id": 22, "name": "elephant"}, {"supercategory": "animal", "id": 23, "name": "bear"}, {"supercategory": "animal", "id": 24, "name": "zebra"}, {"supercategory": "animal", "id": 25, 
"name": "giraffe"}, {"supercategory": "accessory", "id": 27, "name": "backpack"}, {"supercategory": "accessory", "id": 28, "name": "umbrella"}, {"supercategory": "accessory", "id": 31, "name": "handbag"}, {"supercategory": "accessory", "id": 32, "name": "tie"}, {"supercategory": "accessory", "id": 33, "name": "suitcase"}, {"supercategory": "sports", "id": 34, "name": "frisbee"}, {"supercategory": "sports", "id": 35, "name": "skis"}, {"supercategory": "sports", "id": 36, "name": "snowboard"}, {"supercategory": "sports", "id": 37, "name": "sports ball"}, {"supercategory": "sports", "id": 38, "name": "kite"}, {"supercategory": "sports", "id": 39, "name": "baseball bat"}, {"supercategory": "sports", "id": 40, "name": "baseball glove"}, {"supercategory": "sports", "id": 41, "name": "skateboard"}, {"supercategory": "sports", "id": 42, "name": "surfboard"}, {"supercategory": "sports", "id": 43, "name": "tennis racket"}, {"supercategory": "kitchen", "id": 44, "name": "bottle"}, {"supercategory": "kitchen", "id": 46, "name": "wine glass"}, {"supercategory": "kitchen", "id": 47, "name": "cup"}, {"supercategory": "kitchen", "id": 48, "name": "fork"}, {"supercategory": "kitchen", "id": 49, "name": "knife"}, {"supercategory": "kitchen", "id": 50, "name": "spoon"}, {"supercategory": "kitchen", "id": 51, "name": "bowl"}, {"supercategory": "food", "id": 52, "name": "banana"}, {"supercategory": "food", "id": 53, "name": "apple"}, {"supercategory": "food", "id": 54, "name": "sandwich"}, {"supercategory": "food", "id": 55, "name": "orange"}, {"supercategory": "food", "id": 56, "name": "broccoli"}, {"supercategory": "food", "id": 57, "name": "carrot"}, {"supercategory": "food", "id": 58, "name": "hot dog"}, {"supercategory": "food", "id": 59, "name": "pizza"}, {"supercategory": "food", "id": 60, "name": "donut"}, {"supercategory": "food", "id": 61, "name": "cake"}, {"supercategory": "furniture", "id": 62, "name": "chair"}, {"supercategory": "furniture", "id": 63, "name": "couch"}, {"supercategory": "furniture", "id": 64, "name": "potted plant"}, {"supercategory": "furniture", "id": 65, "name": "bed"}, {"supercategory": "furniture", "id": 67, "name": "dining table"}, {"supercategory": "furniture", "id": 70, "name": "toilet"}, {"supercategory": "electronic", "id": 72, "name": "tv"}, {"supercategory": "electronic", "id": 73, "name": "laptop"}, {"supercategory": "electronic", "id": 74, "name": "mouse"}, {"supercategory": "electronic", "id": 75, "name": "remote"}, {"supercategory": "electronic", "id": 76, "name": "keyboard"}, {"supercategory": "electronic", "id": 77, "name": "cell phone"}, {"supercategory": "appliance", "id": 78, "name": "microwave"}, {"supercategory": "appliance", "id": 79, "name": "oven"}, {"supercategory": "appliance", "id": 80, "name": "toaster"}, {"supercategory": "appliance", "id": 81, "name": "sink"}, {"supercategory": "appliance", "id": 82, "name": "refrigerator"}, {"supercategory": "indoor", "id": 84, "name": "book"}, {"supercategory": "indoor", "id": 85, "name": "clock"}, {"supercategory": "indoor", "id": 86, "name": "vase"}, {"supercategory": "indoor", "id": 87, "name": "scissors"}, {"supercategory": "indoor", "id": 88, "name": "teddy bear"}, {"supercategory": "indoor", "id": 89, "name": "hair drier"}, {"supercategory": "indoor", "id": 90, "name": "toothbrush"}], "images": [{"license": 1, "file_name": "000000425226.jpg", "coco_url": "http://images.cocodataset.org/val2017/000000425226.jpg", "height": 640, "width": 480, "date_captured": "2013-11-14 21:48:51", "flickr_url": 
"http://farm5.staticflickr.com/4055/4546463824_bc40e0752b_z.jpg", "id": 1}], "annotations": [{"image_id": 1, "category_id": 1, "segmentation": [], "area": 47803.279549999985, "iscrowd": 0, "bbox": [73.35, 206.02, 300.58, 372.5], "id": 1}]} diff --git a/data/test/images/image_detection/images/000000425226.jpg b/data/test/images/image_detection/images/000000425226.jpg new file mode 100755 index 00000000..0b6a0537 --- /dev/null +++ b/data/test/images/image_detection/images/000000425226.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feadc69a8190787088fda0ac12971d91badc93dbe06057645050fdbec1ce6911 +size 204232 diff --git a/data/test/images/image_matching1.jpg b/data/test/images/image_matching1.jpg new file mode 100644 index 00000000..b61efcbf --- /dev/null +++ b/data/test/images/image_matching1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05ad1e66d7fee2f9e11766160522ad823f1fcc0ab8a5740a6c89b1765228ea32 +size 334048 diff --git a/data/test/images/image_matching2.jpg b/data/test/images/image_matching2.jpg new file mode 100644 index 00000000..11f51edc --- /dev/null +++ b/data/test/images/image_matching2.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed3a68939b922bc2362b1d8051c24d2ca03be6a431fcc7c423e157012debd5a +size 424584 diff --git a/data/test/images/image_safetyhat.jpg b/data/test/images/image_safetyhat.jpg new file mode 100644 index 00000000..e70b9061 --- /dev/null +++ b/data/test/images/image_safetyhat.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dca477e8a0e25bccb4966ddaebad75d7c770deb1c5e55b9b5e9f39078ea84c2 +size 168454 diff --git a/data/test/images/image_smoke.jpg b/data/test/images/image_smoke.jpg new file mode 100644 index 00000000..364bc16e --- /dev/null +++ b/data/test/images/image_smoke.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dbbcaa0bb6b2c64b1c360f03913b7ab5386a846cc81c34825c115c41c4d672a +size 23345 diff --git a/data/test/images/image_voc2007_000001.jpg b/data/test/images/image_voc2007_000001.jpg new file mode 100644 index 00000000..c60f921e --- /dev/null +++ b/data/test/images/image_voc2007_000001.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f0bdad67d01aa452929683b74a124a2926b6bce534c85f3ee0f00e20eeacab0 +size 78771 diff --git a/data/test/images/indoor_layout_estimation.png b/data/test/images/indoor_layout_estimation.png new file mode 100644 index 00000000..3e9d0a61 --- /dev/null +++ b/data/test/images/indoor_layout_estimation.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cd49527b050b704355ea422f3cb927cf77f9537f2e1e2eae533becb06a7dc45 +size 358204 diff --git a/data/test/images/ocr_detection_vlpt.jpg b/data/test/images/ocr_detection_vlpt.jpg new file mode 100644 index 00000000..e6e14e28 --- /dev/null +++ b/data/test/images/ocr_detection_vlpt.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f24570355f178d2a8226112d1443d735837e59573545cfff12458dd791ae341 +size 308158 diff --git a/data/test/images/panorama_depth_estimation.jpg b/data/test/images/panorama_depth_estimation.jpg new file mode 100644 index 00000000..1c519732 --- /dev/null +++ b/data/test/images/panorama_depth_estimation.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df5f2de59df6b55d8ee5d414cc2f98d714e14b518c159d4085ad2ac65d36627 +size 137606 diff --git a/data/test/images/vision_middleware_test1.jpg b/data/test/images/vision_middleware_test1.jpg new 
file mode 100644 index 00000000..46cc2c9a --- /dev/null +++ b/data/test/images/vision_middleware_test1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb0fa302f94560ac8b04057c85a632036e4dc6c6a201ead4c59eb439831b55e9 +size 109305 diff --git a/data/test/pointclouds/flyingthings_pcd1.npy b/data/test/pointclouds/flyingthings_pcd1.npy new file mode 100644 index 00000000..e4debece --- /dev/null +++ b/data/test/pointclouds/flyingthings_pcd1.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa9f5c8a49d457a7b6f4239e438699e60541e7602e8b3b66da9f7b6d55096ab +size 1735856 diff --git a/data/test/pointclouds/flyingthings_pcd2.npy b/data/test/pointclouds/flyingthings_pcd2.npy new file mode 100644 index 00000000..9aad7431 --- /dev/null +++ b/data/test/pointclouds/flyingthings_pcd2.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86618feded6ae9fbcc772b9a7da17bad7d8b9c68ae0d505a239d110a3a0a7bf4 +size 1735856 diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index 469a13f9..686a39d0 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc16ad72e753f751360dab82878ec0a31190fb5125632d8f4698f6537fae79cb -size 40819 +oid sha256:e168377ec5ca88452ae9a782674bb0c90f666597a9f198fadbc8ec4ce55776a0 +size 40633 diff --git a/data/test/regression/tinynas_obj_detection.bin b/data/test/regression/tinynas_obj_detection.bin new file mode 100644 index 00000000..1d958222 --- /dev/null +++ b/data/test/regression/tinynas_obj_detection.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:753728b02574958ac9018b235609b87fc99ee23d2dbbe579b98a9b12d7443cc4 +size 118048 diff --git a/data/test/regression/vit_base_image_classification.bin b/data/test/regression/vit_base_image_classification.bin new file mode 100644 index 00000000..768ddcc6 --- /dev/null +++ b/data/test/regression/vit_base_image_classification.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba58d77303a90ca0b971c9312928182c5f779465a0b12661be8b7c88bf2ff015 +size 44817 diff --git a/data/test/videos/000.mp4 b/data/test/videos/000.mp4 new file mode 100644 index 00000000..50fa10d1 --- /dev/null +++ b/data/test/videos/000.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f90f00a210e3a0b24df80439bf6b956c83b2b841eba83f534a7c58d38a49d72c +size 1009531 diff --git a/data/test/videos/047.mp4 b/data/test/videos/047.mp4 new file mode 100644 index 00000000..e7b311c0 --- /dev/null +++ b/data/test/videos/047.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11504fef9f9bf4ed281ed30ee3aa24f3c155231c235eb61d57bb9fb8287b5699 +size 3448945 diff --git a/data/test/videos/MOT17-03-partial.mp4 b/data/test/videos/MOT17-03-partial.mp4 new file mode 100644 index 00000000..ed1ee7ba --- /dev/null +++ b/data/test/videos/MOT17-03-partial.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca26615762e3f4ccca53a020efe73c3cf3598edc68bb68b5555c24e815718336 +size 3151767 diff --git a/data/test/videos/video_caption_and_qa_test.mp4 b/data/test/videos/video_caption_and_qa_test.mp4 new file mode 100644 index 00000000..125783af --- /dev/null +++ b/data/test/videos/video_caption_and_qa_test.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c822c66fcf04de28016b224ef372cb1c93b7f13f2cba4e11f53a37fec8e769e +size 828272 diff --git 
a/data/test/videos/video_depth_estimation.mp4 b/data/test/videos/video_depth_estimation.mp4
new file mode 100644
index 00000000..e8bd2e4c
--- /dev/null
+++ b/data/test/videos/video_depth_estimation.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31a504f9527622dc91322dc66acafc5b673afdddf59afd513dc435d93e4e6ca2
+size 7202711
diff --git a/data/test/videos/video_frame_interpolation_test.mp4 b/data/test/videos/video_frame_interpolation_test.mp4
new file mode 100644
index 00000000..4085a88f
--- /dev/null
+++ b/data/test/videos/video_frame_interpolation_test.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e97ff88d0af12f7dd3ef04ce50b87b51ffbb9a57dce81d2d518df4abd2fdb826
+size 3231793
diff --git a/data/test/videos/video_stabilization_test_video.avi b/data/test/videos/video_stabilization_test_video.avi
new file mode 100644
index 00000000..c1674f09
--- /dev/null
+++ b/data/test/videos/video_stabilization_test_video.avi
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfd5ada9f0dae56c826623cb73295a04358a5538effeb7f54134bfd0a4322f00
+size 3700682
diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index 160e2604..aa28d26b 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -9,9 +9,10 @@ SHELL ["/bin/bash", "-c"]
 COPY docker/rcfiles /tmp/resources
 COPY docker/jupyter_plugins /tmp/resources/jupyter_plugins
 RUN apt-get update && apt-get install -y --reinstall ca-certificates && \
+    apt-get clean && \
     cp /tmp/resources/ubuntu20.04_sources.tuna /etc/apt/sources.list && \
     apt-get update && \
-    apt-get install -y locales wget git vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \
+    apt-get install -y locales wget git strace gdb vim ffmpeg libsm6 tzdata language-pack-zh-hans ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \
     wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \
     dpkg -i ./git-lfs_3.2.0_amd64.deb && \
     rm -f ./git-lfs_3.2.0_amd64.deb && \
@@ -72,6 +73,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r /var/modelscope/multi-modal.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip install --no-cache-dir -r /var/modelscope/science.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
+    pip install --no-cache-dir -r /var/modelscope/tests.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \
     pip cache purge

 # default shell bash
@@ -99,10 +101,17 @@ RUN if [ "$USE_GPU" = "True" ] ; then \
       echo 'cpu unsupport uniford'; \
     fi

-RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext deepspeed
+RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 numpy==1.18.5 https://pypi.tuna.tsinghua.edu.cn/packages/70/ad/06f8a06cef819606cb1a521bcc144288daee5c7e73c5d722492866cb1b92/wenetruntime-1.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl ipykernel fairseq fasttext deepspeed
 COPY docker/scripts/install_apex.sh /tmp/install_apex.sh
 RUN if [ "$USE_GPU" = "True" ] ; then \
     bash /tmp/install_apex.sh; \
   else \
-    echo 'cpu unsupport uniford'; \
+    echo 'cpu unsupport apex'; \
+  fi
+RUN apt-get update && apt-get install -y sox && \
+    apt-get clean
+RUN if [ "$USE_GPU" = "True" ] ; then \
+    pip install --no-cache-dir git+https://github.com/gxd1994/Pointnet2.PyTorch.git@master#subdirectory=pointnet2; \
+  else \
+    echo 'cpu unsupport Pointnet2'; \
 fi
diff --git a/docker/scripts/install_apex.sh b/docker/scripts/install_apex.sh
index 5a734243..f78e849e 100644
--- a/docker/scripts/install_apex.sh
+++ b/docker/scripts/install_apex.sh
@@ -1,3 +1,4 @@
+export MAX_JOBS=16
 git clone https://github.com/NVIDIA/apex
 cd apex
 TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6" pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
diff --git a/docs/make.bat b/docs/make.bat
index 3d64bb3a..9534b018 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -7,7 +7,7 @@ REM Command file for Sphinx documentation
 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
 )
-set SOURCEDIR=.
+set SOURCEDIR=source
 set BUILDDIR=build

 if "%1" == "" goto help
diff --git a/docs/source/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst
new file mode 100644
index 00000000..b9aade44
--- /dev/null
+++ b/docs/source/_templates/autosummary/class.rst
@@ -0,0 +1,10 @@
+.. currentmodule:: {{ module }}
+
+
+{{ name | underline}}
+
+.. autoclass:: {{ name }}
+   :inherited-members:
+   :members:
+
+.. autogenerated from source/_templates/autosummary/class.rst
diff --git a/docs/source/_templates/classtemplate.rst b/docs/source/_templates/classtemplate.rst
new file mode 100644
index 00000000..c547bf79
--- /dev/null
+++ b/docs/source/_templates/classtemplate.rst
@@ -0,0 +1,12 @@
+.. currentmodule:: {{ module }}
+
+
+{{ name | underline}}
+
+.. autoclass:: {{ name }}
+   :members:
+
+
+..
+   autogenerated from source/_templates/classtemplate.rst
+   note it does not have :inherited-members:
diff --git a/docs/source/_templates/sobolengine.rst b/docs/source/_templates/sobolengine.rst
new file mode 100644
index 00000000..e732eecc
--- /dev/null
+++ b/docs/source/_templates/sobolengine.rst
@@ -0,0 +1,14 @@
+.. currentmodule:: {{ module }}
+
+
+{{ name | underline}}
+
+.. autoclass:: {{ name }}
+   :members:
+   :exclude-members: MAXBIT, MAXDIM
+   :undoc-members:
+
+
+..
+   autogenerated from source/_templates/sobolengine.rst
+   note it has specific options
diff --git a/docs/source/api/modelscope.fileio.format.rst b/docs/source/api/modelscope.fileio.format.rst
deleted file mode 100644
index 2c7b11de..00000000
--- a/docs/source/api/modelscope.fileio.format.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-modelscope.fileio.format package
-================================
-
-.. automodule:: modelscope.fileio.format
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-modelscope.fileio.format.base module
-------------------------------------
-
-.. automodule:: modelscope.fileio.format.base
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-modelscope.fileio.format.json module
-------------------------------------
-
-.. automodule:: modelscope.fileio.format.json
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-modelscope.fileio.format.yaml module
-------------------------------------
-
-..
automodule:: modelscope.fileio.format.yaml - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.fileio.rst b/docs/source/api/modelscope.fileio.rst deleted file mode 100644 index 3f4ae1ca..00000000 --- a/docs/source/api/modelscope.fileio.rst +++ /dev/null @@ -1,34 +0,0 @@ -modelscope.fileio package -========================= - -.. automodule:: modelscope.fileio - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - modelscope.fileio.format - -Submodules ----------- - -modelscope.fileio.file module ------------------------------ - -.. automodule:: modelscope.fileio.file - :members: - :undoc-members: - :show-inheritance: - -modelscope.fileio.io module ---------------------------- - -.. automodule:: modelscope.fileio.io - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.hub.rst b/docs/source/api/modelscope.hub.rst index 47d210c2..ac249081 100644 --- a/docs/source/api/modelscope.hub.rst +++ b/docs/source/api/modelscope.hub.rst @@ -1,50 +1,17 @@ -modelscope.hub package -========================= +modelscope.hub +============== .. automodule:: modelscope.hub - :members: - :undoc-members: - :show-inheritance: -Subpackages ------------ +.. currentmodule:: modelscope.hub -.. toctree:: - :maxdepth: 4 +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst - modelscope.hub.utils - -Submodules ----------- - -modelscope.hub.api module ------------------------------ - -.. automodule:: modelscope.hub.api - :members: - :undoc-members: - :show-inheritance: - -modelscope.hub.git module ---------------------------- - -.. automodule:: modelscope.hub.git - :members: - :undoc-members: - :show-inheritance: - -modelscope.hub.file_download module ---------------------------- - -.. automodule:: modelscope.hub.file_download - :members: - :undoc-members: - :show-inheritance: - -modelscope.hub.snapshot_download module ---------------------------- - -.. automodule:: modelscope.hub.snapshot_download - :members: - :undoc-members: - :show-inheritance: + api.HubApi + repository.Repository + deploy.ServiceDeployer + snapshot_download.snapshot_download + file_download.model_file_download diff --git a/docs/source/api/modelscope.hub.utils.rst b/docs/source/api/modelscope.hub.utils.rst deleted file mode 100644 index 74d8ae96..00000000 --- a/docs/source/api/modelscope.hub.utils.rst +++ /dev/null @@ -1,26 +0,0 @@ -modelscope.hub.utils package -=============================== - -.. automodule:: modelscope.hub.utils - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.hub.utils.caching module -------------------------------------------------------- - -.. automodule:: modelscope.hub.utils.caching - :members: - :undoc-members: - :show-inheritance: - -modelscope.pipelines.cv.image\_matting\_pipeline module -------------------------------------------------------- - -.. automodule:: modelscope.hub.utils.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.models.base.rst b/docs/source/api/modelscope.models.base.rst new file mode 100644 index 00000000..8d49188c --- /dev/null +++ b/docs/source/api/modelscope.models.base.rst @@ -0,0 +1,17 @@ +modelscope.models.base +====================== + +.. automodule:: modelscope.models.base + +.. currentmodule:: modelscope.models.base + + +.. 
autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + Model + TorchModel + Head + TorchHead diff --git a/docs/source/api/modelscope.models.builder.rst b/docs/source/api/modelscope.models.builder.rst new file mode 100644 index 00000000..ed738f81 --- /dev/null +++ b/docs/source/api/modelscope.models.builder.rst @@ -0,0 +1,16 @@ +modelscope.models.builder +========================= + +.. automodule:: modelscope.models.builder + +.. currentmodule:: modelscope.models.builder + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + build_model + build_backbone + build_head diff --git a/docs/source/api/modelscope.models.cv.cartoon.facelib.LK.rst b/docs/source/api/modelscope.models.cv.cartoon.facelib.LK.rst deleted file mode 100644 index 848c7d67..00000000 --- a/docs/source/api/modelscope.models.cv.cartoon.facelib.LK.rst +++ /dev/null @@ -1,18 +0,0 @@ -modelscope.models.cv.cartoon.facelib.LK package -=============================================== - -.. automodule:: modelscope.models.cv.cartoon.facelib.LK - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.models.cv.cartoon.facelib.LK.lk module -------------------------------------------------- - -.. automodule:: modelscope.models.cv.cartoon.facelib.LK.lk - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.cartoon.facelib.rst b/docs/source/api/modelscope.models.cv.cartoon.facelib.rst deleted file mode 100644 index a81536b0..00000000 --- a/docs/source/api/modelscope.models.cv.cartoon.facelib.rst +++ /dev/null @@ -1,50 +0,0 @@ -modelscope.models.cv.cartoon.facelib package -============================================ - -.. automodule:: modelscope.models.cv.cartoon.facelib - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - modelscope.models.cv.cartoon.facelib.LK - -Submodules ----------- - -modelscope.models.cv.cartoon.facelib.config module --------------------------------------------------- - -.. automodule:: modelscope.models.cv.cartoon.facelib.config - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.cv.cartoon.facelib.face\_detector module ----------------------------------------------------------- - -.. automodule:: modelscope.models.cv.cartoon.facelib.face_detector - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.cv.cartoon.facelib.face\_landmark module ----------------------------------------------------------- - -.. automodule:: modelscope.models.cv.cartoon.facelib.face_landmark - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.cv.cartoon.facelib.facer module -------------------------------------------------- - -.. automodule:: modelscope.models.cv.cartoon.facelib.facer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.rst b/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.rst deleted file mode 100644 index b5845af7..00000000 --- a/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.rst +++ /dev/null @@ -1,15 +0,0 @@ -modelscope.models.cv.cartoon.mtcnn\_pytorch package -=================================================== - -.. automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. 
toctree:: - :maxdepth: 4 - - modelscope.models.cv.cartoon.mtcnn_pytorch.src diff --git a/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.src.rst b/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.src.rst deleted file mode 100644 index 715cc292..00000000 --- a/docs/source/api/modelscope.models.cv.cartoon.mtcnn_pytorch.src.rst +++ /dev/null @@ -1,26 +0,0 @@ -modelscope.models.cv.cartoon.mtcnn\_pytorch.src package -======================================================= - -.. automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch.src - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.models.cv.cartoon.mtcnn\_pytorch.src.align\_trans module -------------------------------------------------------------------- - -.. automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch.src.align_trans - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.cv.cartoon.mtcnn\_pytorch.src.matlab\_cp2tform module ------------------------------------------------------------------------ - -.. automodule:: modelscope.models.cv.cartoon.mtcnn_pytorch.src.matlab_cp2tform - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.cartoon.rst b/docs/source/api/modelscope.models.cv.cartoon.rst deleted file mode 100644 index 5a262e03..00000000 --- a/docs/source/api/modelscope.models.cv.cartoon.rst +++ /dev/null @@ -1,27 +0,0 @@ -modelscope.models.cv.cartoon package -==================================== - -.. automodule:: modelscope.models.cv.cartoon - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - modelscope.models.cv.cartoon.facelib - modelscope.models.cv.cartoon.mtcnn_pytorch - -Submodules ----------- - -modelscope.models.cv.cartoon.utils module ------------------------------------------ - -.. automodule:: modelscope.models.cv.cartoon.utils - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.models.cv.rst b/docs/source/api/modelscope.models.cv.rst index 47ce3916..c4704112 100644 --- a/docs/source/api/modelscope.models.cv.rst +++ b/docs/source/api/modelscope.models.cv.rst @@ -1,15 +1,14 @@ -modelscope.models.cv package -============================ +modelscope.models.cv +==================== .. automodule:: modelscope.models.cv - :members: - :undoc-members: - :show-inheritance: -Subpackages ------------ +.. currentmodule:: modelscope.models.cv -.. toctree:: - :maxdepth: 4 - modelscope.models.cv.cartoon +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + easycv_base.EasyCVBaseModel diff --git a/docs/source/api/modelscope.models.nlp.rst b/docs/source/api/modelscope.models.nlp.rst deleted file mode 100644 index 6cc411d4..00000000 --- a/docs/source/api/modelscope.models.nlp.rst +++ /dev/null @@ -1,90 +0,0 @@ -modelscope.models.nlp package -============================= - -.. automodule:: modelscope.models.nlp - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.models.nlp.bert\_for\_sequence\_classification module ------------------------------------------------------------- - -.. automodule:: modelscope.models.nlp.bert_for_sequence_classification - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.palm\_for\_text\_generation module ----------------------------------------------------- - -.. 
automodule:: modelscope.models.nlp.palm_for_text_generation - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.csanmt\_for\_translation module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.palm_for_text_generation - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.masked\_language module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.masked_language - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.sbert\_for\_nil module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.sbert_for_nil - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.sbert\_for\_sentence\_similarity module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.sbert_for_sentence_similarity - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.sbert\_for\_sentiment\_classification module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.sbert_for_sentiment_classification - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.sbert\_for\_sequence\_classification module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.sbert_for_sequence_classification - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.sbert\_for\_token\_classification module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.sbert_for_token_classification - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.nlp.sbert\_for\_zero\_shot\_classification module ----------------------------------------------------- - -.. automodule:: modelscope.models.nlp.sbert_for_zero_shot_classification - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.models.rst b/docs/source/api/modelscope.models.rst index 2eaa1a6b..01bbae3b 100644 --- a/docs/source/api/modelscope.models.rst +++ b/docs/source/api/modelscope.models.rst @@ -1,37 +1,14 @@ -modelscope.models package -========================= +modelscope.models +================= .. automodule:: modelscope.models - :members: - :undoc-members: - :show-inheritance: -Subpackages ------------ +.. currentmodule:: modelscope.models .. toctree:: - :maxdepth: 4 + :maxdepth: 2 + :caption: Model Api - modelscope.models.cv - modelscope.models.nlp - modelscope.models.multi_modal - modelscope.models.audio - -Submodules ----------- - -modelscope.models.base module ------------------------------ - -.. automodule:: modelscope.models.base - :members: - :undoc-members: - :show-inheritance: - -modelscope.models.builder module --------------------------------- - -.. automodule:: modelscope.models.builder - :members: - :undoc-members: - :show-inheritance: + bases + builders + cv diff --git a/docs/source/api/modelscope.msdatasets.cv.rst b/docs/source/api/modelscope.msdatasets.cv.rst new file mode 100644 index 00000000..ef0a8a3b --- /dev/null +++ b/docs/source/api/modelscope.msdatasets.cv.rst @@ -0,0 +1,14 @@ +modelscope.msdatasets.cv +================================ + +.. automodule:: modelscope.msdatasets.cv + +.. currentmodule:: modelscope.msdatasets.cv + +.. 
autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + easycv_base.EasyCVBaseDataset + image_classification.ClsDataset diff --git a/docs/source/api/modelscope.msdatasets.ms_dataset.rst b/docs/source/api/modelscope.msdatasets.ms_dataset.rst new file mode 100644 index 00000000..03cc8d97 --- /dev/null +++ b/docs/source/api/modelscope.msdatasets.ms_dataset.rst @@ -0,0 +1,14 @@ +modelscope.msdatasets.ms_dataset +================================ + +.. automodule:: modelscope.msdatasets.ms_dataset + +.. currentmodule:: modelscope.msdatasets.ms_dataset + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + MsMapDataset + MsDataset diff --git a/docs/source/api/modelscope.msdatasets.rst b/docs/source/api/modelscope.msdatasets.rst index 53b858a8..ecef7951 100644 --- a/docs/source/api/modelscope.msdatasets.rst +++ b/docs/source/api/modelscope.msdatasets.rst @@ -1,18 +1,13 @@ -modelscope.msdatasets package -============================= +modelscope.msdatasets +===================== .. automodule:: modelscope.msdatasets - :members: - :undoc-members: - :show-inheritance: -Submodules ----------- +.. currentmodule:: modelscope.msdatasets -modelscope.msdatasets.ms\_dataset module ----------------------------------------- +.. toctree:: + :maxdepth: 2 + :caption: Dataset Api -.. automodule:: modelscope.msdatasets.ms_dataset - :members: - :undoc-members: - :show-inheritance: + dataset + cv diff --git a/docs/source/api/modelscope.pipelines.audio.rst b/docs/source/api/modelscope.pipelines.audio.rst deleted file mode 100644 index f162893f..00000000 --- a/docs/source/api/modelscope.pipelines.audio.rst +++ /dev/null @@ -1,7 +0,0 @@ -modelscope.pipelines.audio package -================================== - -.. automodule:: modelscope.pipelines.audio - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.base.rst b/docs/source/api/modelscope.pipelines.base.rst new file mode 100644 index 00000000..5cf56cea --- /dev/null +++ b/docs/source/api/modelscope.pipelines.base.rst @@ -0,0 +1,14 @@ +modelscope.pipelines.base +========================= + +.. automodule:: modelscope.pipelines.base + +.. currentmodule:: modelscope.pipelines.base + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + Pipeline + DistributedPipeline diff --git a/docs/source/api/modelscope.pipelines.builder.rst b/docs/source/api/modelscope.pipelines.builder.rst new file mode 100644 index 00000000..45d68bd7 --- /dev/null +++ b/docs/source/api/modelscope.pipelines.builder.rst @@ -0,0 +1,15 @@ +modelscope.pipelines.builder +============================ + +.. automodule:: modelscope.pipelines.builder + +.. currentmodule:: modelscope.pipelines.builder + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + build_pipeline + pipeline diff --git a/docs/source/api/modelscope.pipelines.cv.rst b/docs/source/api/modelscope.pipelines.cv.rst index 3f2da3f4..c695ea69 100644 --- a/docs/source/api/modelscope.pipelines.cv.rst +++ b/docs/source/api/modelscope.pipelines.cv.rst @@ -1,26 +1,14 @@ -modelscope.pipelines.cv package -=============================== +modelscope.pipelines.cv +======================= .. automodule:: modelscope.pipelines.cv - :members: - :undoc-members: - :show-inheritance: -Submodules ----------- +.. currentmodule:: modelscope.pipelines.cv -modelscope.pipelines.cv.image\_cartoon\_pipeline module -------------------------------------------------------- -.. 
automodule:: modelscope.pipelines.cv.image_cartoon_pipeline - :members: - :undoc-members: - :show-inheritance: +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst -modelscope.pipelines.cv.image\_matting\_pipeline module -------------------------------------------------------- - -.. automodule:: modelscope.pipelines.cv.image_matting_pipeline - :members: - :undoc-members: - :show-inheritance: + ActionRecognitionPipeline diff --git a/docs/source/api/modelscope.pipelines.multi_modal.rst b/docs/source/api/modelscope.pipelines.multi_modal.rst deleted file mode 100644 index 4bc3982f..00000000 --- a/docs/source/api/modelscope.pipelines.multi_modal.rst +++ /dev/null @@ -1,42 +0,0 @@ -modelscope.pipelines.multi\_modal package -========================================= - -.. automodule:: modelscope.pipelines.multi_modal - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.pipelines.multi\_modal.image\_captioning\_pipeline module ----------------------------------------------------------- - -.. automodule:: modelscope.pipelines.multi_modal.image_captioning_pipeline - :members: - :undoc-members: - :show-inheritance: - -modelscope.pipelines.multi\_modal.multi\_modal\_embedding\_pipeline module ----------------------------------------------------------- - -.. automodule:: modelscope.pipelines.multi_modal.multi_modal_embedding_pipeline - :members: - :undoc-members: - :show-inheritance: - -modelscope.pipelines.multi\_modal.text\_to\_image\_synthesis\_pipeline module ----------------------------------------------------------- - -.. automodule:: modelscope.pipelines.multi_modal.text_to_image_synthesis_pipeline - :members: - :undoc-members: - :show-inheritance: - -modelscope.pipelines.multi\_modal.visual\_question\_answering\_pipeline module ----------------------------------------------------------- - -.. automodule:: modelscope.pipelines.multi_modal.visual_question_answering_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.nlp.rst b/docs/source/api/modelscope.pipelines.nlp.rst deleted file mode 100644 index 836d914f..00000000 --- a/docs/source/api/modelscope.pipelines.nlp.rst +++ /dev/null @@ -1,26 +0,0 @@ -modelscope.pipelines.nlp package -================================ - -.. automodule:: modelscope.pipelines.nlp - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.pipelines.nlp.sequence\_classification\_pipeline module ------------------------------------------------------------------- - -.. automodule:: modelscope.pipelines.nlp.sequence_classification_pipeline - :members: - :undoc-members: - :show-inheritance: - -modelscope.pipelines.nlp.text\_generation\_pipeline module ----------------------------------------------------------- - -.. automodule:: modelscope.pipelines.nlp.text_generation_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.pipelines.rst b/docs/source/api/modelscope.pipelines.rst index e56a9a87..f0cb433d 100644 --- a/docs/source/api/modelscope.pipelines.rst +++ b/docs/source/api/modelscope.pipelines.rst @@ -1,53 +1,14 @@ -modelscope.pipelines package -============================ +modelscope.pipelines +==================== .. automodule:: modelscope.pipelines - :members: - :undoc-members: - :show-inheritance: -Subpackages ------------ +.. currentmodule:: modelscope.pipelines .. 
toctree:: - :maxdepth: 4 + :maxdepth: 2 + :caption: Pipeline Api - modelscope.pipelines.cv - modelscope.pipelines.nlp - modelscope.pipelines.multi_modal - modelscope.pipelines.audio - -Submodules ----------- - -modelscope.pipelines.builder module ------------------------------------ - -.. automodule:: modelscope.pipelines.builder - :members: - :undoc-members: - :show-inheritance: - -modelscope.pipelines.base module ------------------------------------ - -.. automodule:: modelscope.pipelines.base - :members: - :undoc-members: - :show-inheritance: - -modelscope.outputs module ------------------------------------ - -.. automodule:: modelscope.outputs - :members: - :undoc-members: - :show-inheritance: - -modelscope.pipelines.util module --------------------------------- - -.. automodule:: modelscope.pipelines.util - :members: - :undoc-members: - :show-inheritance: + base + builder + cv diff --git a/docs/source/api/modelscope.preprocessors.base.rst b/docs/source/api/modelscope.preprocessors.base.rst new file mode 100644 index 00000000..478a9612 --- /dev/null +++ b/docs/source/api/modelscope.preprocessors.base.rst @@ -0,0 +1,14 @@ +modelscope.preprocessors.base +====================== + +.. automodule:: modelscope.preprocessors.base + +.. currentmodule:: modelscope.preprocessors.base + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + Preprocessor diff --git a/docs/source/api/modelscope.preprocessors.builder.rst b/docs/source/api/modelscope.preprocessors.builder.rst new file mode 100644 index 00000000..f6a163c4 --- /dev/null +++ b/docs/source/api/modelscope.preprocessors.builder.rst @@ -0,0 +1,14 @@ +modelscope.preprocessors.builder +====================== + +.. automodule:: modelscope.preprocessors.builder + +.. currentmodule:: modelscope.preprocessors.builder + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + build_preprocessor diff --git a/docs/source/api/modelscope.preprocessors.rst b/docs/source/api/modelscope.preprocessors.rst index b555198d..ae5cf0ce 100644 --- a/docs/source/api/modelscope.preprocessors.rst +++ b/docs/source/api/modelscope.preprocessors.rst @@ -1,50 +1,14 @@ -modelscope.preprocessors package -================================ +modelscope.preprocessors +================= .. automodule:: modelscope.preprocessors - :members: - :undoc-members: - :show-inheritance: -Submodules ----------- +.. currentmodule:: modelscope.preprocessors -modelscope.preprocessors.base module ------------------------------------- +.. toctree:: + :maxdepth: 2 + :caption: Preprocessor Api -.. automodule:: modelscope.preprocessors.base - :members: - :undoc-members: - :show-inheritance: - -modelscope.preprocessors.builder module ---------------------------------------- - -.. automodule:: modelscope.preprocessors.builder - :members: - :undoc-members: - :show-inheritance: - -modelscope.preprocessors.common module --------------------------------------- - -.. automodule:: modelscope.preprocessors.common - :members: - :undoc-members: - :show-inheritance: - -modelscope.preprocessors.image module -------------------------------------- - -.. automodule:: modelscope.preprocessors.image - :members: - :undoc-members: - :show-inheritance: - -modelscope.preprocessors.nlp module ------------------------------------ - -.. 
automodule:: modelscope.preprocessors.nlp - :members: - :undoc-members: - :show-inheritance: + base + builders + video diff --git a/docs/source/api/modelscope.preprocessors.video.rst b/docs/source/api/modelscope.preprocessors.video.rst new file mode 100644 index 00000000..3a927b75 --- /dev/null +++ b/docs/source/api/modelscope.preprocessors.video.rst @@ -0,0 +1,20 @@ +modelscope.preprocessors.video +==================== + +.. automodule:: modelscope.preprocessors.video + +.. currentmodule:: modelscope.preprocessors.video + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + ReadVideoData + kinetics400_tranform + _interval_based_sampling + _decode_video_frames_list + _decode_video + KineticsResizedCrop + MovieSceneSegmentationPreprocessor diff --git a/docs/source/api/modelscope.rst b/docs/source/api/modelscope.rst deleted file mode 100644 index d38654a4..00000000 --- a/docs/source/api/modelscope.rst +++ /dev/null @@ -1,33 +0,0 @@ -modelscope package -================== - -.. automodule:: modelscope - :members: - :undoc-members: - :show-inheritance: - -Subpackages ------------ - -.. toctree:: - :maxdepth: 4 - - modelscope.fileio - modelscope.models - modelscope.pipelines - modelscope.preprocessors - modelscope.msdatasets - modelscope.trainers - modelscope.utils - modelscope.hub - -Submodules ----------- - -modelscope.version module -------------------------- - -.. automodule:: modelscope.version - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.trainers.base.rst b/docs/source/api/modelscope.trainers.base.rst new file mode 100644 index 00000000..3a28b75c --- /dev/null +++ b/docs/source/api/modelscope.trainers.base.rst @@ -0,0 +1,14 @@ +modelscope.trainers.base +======================== + +.. automodule:: modelscope.trainers.base + +.. currentmodule:: modelscope.trainers.base + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + BaseTrainer + DummyTrainer diff --git a/docs/source/api/modelscope.trainers.builder.rst b/docs/source/api/modelscope.trainers.builder.rst new file mode 100644 index 00000000..bdfcfe48 --- /dev/null +++ b/docs/source/api/modelscope.trainers.builder.rst @@ -0,0 +1,14 @@ +modelscope.trainers.builder +=========================== + +.. automodule:: modelscope.trainers.builder + +.. currentmodule:: modelscope.trainers.builder + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + build_trainer diff --git a/docs/source/api/modelscope.trainers.cv.rst b/docs/source/api/modelscope.trainers.cv.rst new file mode 100644 index 00000000..e7c4a0d9 --- /dev/null +++ b/docs/source/api/modelscope.trainers.cv.rst @@ -0,0 +1,14 @@ +modelscope.trainers.cv +======================= + +.. automodule:: modelscope.trainers.cv + +.. currentmodule:: modelscope.trainers.cv + + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + ImagePortraitEnhancementTrainer diff --git a/docs/source/api/modelscope.trainers.nlp.rst b/docs/source/api/modelscope.trainers.nlp.rst deleted file mode 100644 index 4bc2f875..00000000 --- a/docs/source/api/modelscope.trainers.nlp.rst +++ /dev/null @@ -1,18 +0,0 @@ -modelscope.trainers.nlp package -=============================== - -.. 
automodule:: modelscope.trainers.nlp - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.trainers.nlp.sequence\_classification\_trainer module ----------------------------------------------------------------- - -.. automodule:: modelscope.trainers.nlp.sequence_classification_trainer - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/modelscope.trainers.rst b/docs/source/api/modelscope.trainers.rst index aac4fb99..32f11c6c 100644 --- a/docs/source/api/modelscope.trainers.rst +++ b/docs/source/api/modelscope.trainers.rst @@ -1,34 +1,15 @@ -modelscope.trainers package -=========================== +modelscope.trainers +=================== .. automodule:: modelscope.trainers - :members: - :undoc-members: - :show-inheritance: -Subpackages ------------ +.. currentmodule:: modelscope.trainers .. toctree:: - :maxdepth: 4 + :maxdepth: 2 + :caption: Trainer Api - modelscope.trainers.nlp - -Submodules ----------- - -modelscope.trainers.base module -------------------------------- - -.. automodule:: modelscope.trainers.base - :members: - :undoc-members: - :show-inheritance: - -modelscope.trainers.builder module ----------------------------------- - -.. automodule:: modelscope.trainers.builder - :members: - :undoc-members: - :show-inheritance: + base + builder + EpochBasedTrainer + cv diff --git a/docs/source/api/modelscope.trainers.trainer.rst b/docs/source/api/modelscope.trainers.trainer.rst new file mode 100644 index 00000000..88942046 --- /dev/null +++ b/docs/source/api/modelscope.trainers.trainer.rst @@ -0,0 +1,13 @@ +modelscope.trainers.trainer +=========================== + +.. automodule:: modelscope.trainers.trainer + +.. currentmodule:: modelscope.trainers.trainer + +.. autosummary:: + :toctree: generated + :nosignatures: + :template: classtemplate.rst + + EpochBasedTrainer diff --git a/docs/source/api/modelscope.utils.rst b/docs/source/api/modelscope.utils.rst deleted file mode 100644 index 3d705cfb..00000000 --- a/docs/source/api/modelscope.utils.rst +++ /dev/null @@ -1,58 +0,0 @@ -modelscope.utils package -======================== - -.. automodule:: modelscope.utils - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -modelscope.utils.config module ------------------------------- - -.. automodule:: modelscope.utils.config - :members: - :undoc-members: - :show-inheritance: - -modelscope.utils.constant module --------------------------------- - -.. automodule:: modelscope.utils.constant - :members: - :undoc-members: - :show-inheritance: - -modelscope.utils.hub module ---------------------------- - -.. automodule:: modelscope.utils.hub - :members: - :undoc-members: - :show-inheritance: - -modelscope.utils.logger module ------------------------------- - -.. automodule:: modelscope.utils.logger - :members: - :undoc-members: - :show-inheritance: - -modelscope.utils.registry module --------------------------------- - -.. automodule:: modelscope.utils.registry - :members: - :undoc-members: - :show-inheritance: - -modelscope.utils.type\_assert module ------------------------------------- - -.. automodule:: modelscope.utils.type_assert - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/conf.py b/docs/source/conf.py index 4371c927..eb9e9955 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -40,17 +40,37 @@ release = version # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', + 'sphinx.ext.autosummary', + 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', - 'myst_parser', 'sphinx_markdown_tables', 'sphinx_copybutton', + 'myst_parser', ] autodoc_mock_imports = [ 'matplotlib', 'pycocotools', 'terminaltables', 'mmcv.ops' ] +# build the templated autosummary files +autosummary_generate = True +numpydoc_show_class_members = False + +# Enable overriding of function signatures in the first line of the docstring. +autodoc_docstring_signature = True + +# Disable docstring inheritance +autodoc_inherit_docstrings = False + +# Show type hints in the description +autodoc_typehints = 'description' + +# Add parameter types if the parameter is documented in the docstring +autodoc_typehints_description_target = 'documented_params' + +autodoc_default_options = { + 'member-order': 'bysource', +} # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -58,27 +78,46 @@ templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', -} +source_suffix = ['.rst', '.md'] # The master toctree document. -master_doc = 'index' +root_doc = 'index' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['build', 'Thumbs.db', '.DS_Store'] - +exclude_patterns = [ + 'build', 'source/.ipynb_checkpoints', 'source/api/generated', 'Thumbs.db', + '.DS_Store' +] +# A list of glob-style patterns [1] that are used to find source files. +# They are matched against the source file names relative to the source directory, +# using slashes as directory separators on all platforms. +# The default is **, meaning that all files are recursively included from the source directory. +# include_patterns = [ +# 'index.rst', +# 'quick_start.md', +# 'develop.md', +# 'faq.md', +# 'change_log.md', +# 'api/modelscope.hub*', +# 'api/modelscope.models.base*', +# 'api/modelscope.models.builder*', +# 'api/modelscope.pipelines.base*', +# 'api/modelscope.pipelines.builder*', +# 'api/modelscope.preprocessors.base*', +# 'api/modelscope.preprocessors.builder*', +# 'api/modelscope.trainers.base*', +# 'api/modelscope.trainers.builder*', +# ] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_book_theme' -html_theme_path = [sphinx_book_theme.get_html_theme_path()] -html_theme_options = {} +# html_theme = 'sphinx_book_theme' +# html_theme_path = [sphinx_book_theme.get_html_theme_path()] +# html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -88,7 +127,7 @@ html_static_path = ['_static'] # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'modelscope_doc' +# htmlhelp_basename = 'modelscope_doc' # -- Extension configuration ------------------------------------------------- # Ignore >>> when copying code @@ -97,8 +136,3 @@ copybutton_prompt_is_regexp = True # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = {'https://docs.python.org/': None} - -autodoc_default_options = { - 'member-order': 'bysource', - 'special-members': '__init__', -} diff --git a/docs/source/faq.md b/docs/source/faq.md deleted file mode 100644 index e1975b9f..00000000 --- a/docs/source/faq.md +++ /dev/null @@ -1,48 +0,0 @@ -# 常见问题 - - - -### 1. macOS环境pip方式安装tokenizers报错 - -对于tokenizers库, pypi上缺乏针对`macOS`环境预编译包,需要搭建源码编译环境后才能正确安装,步骤如下: - -1. 安装rust - ```shell - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh - pip install setuptools_rust - - ``` - -2. 更新rust环境变量 - - ```shell - source $HOME/.cargo/env - ``` -3. 安装tokenizers - ```shell - pip install tokenizers - ``` -reference: [https://huggingface.co/docs/tokenizers/installation#installation-from-sources](https://huggingface.co/docs/tokenizers/installation#installation-from-sources) - -### 2. pip 安装包冲突 - -> ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. - -由于依赖库之间的版本不兼容,可能会存在版本冲突的情况,大部分情况下不影响正常运行。 - -### 3. 安装pytorch出现版本错误 - -> ERROR: Ignored the following versions that require a different python version: 1.1.0 Requires-Python >=3.8; 1.1.0rc1 Requires-Python >=3.8; 1.1.1 Requires-Python >=3.8 -> ERROR: Could not find a version that satisfies the requirement torch==1.8.1+cu111 (from versions: 1.0.0, 1.0.1, 1.0.1.post2, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.4.0, 1.5.0, 1.5.1, 1.6.0, 1.7.0, 1.7.1, 1.8.0, 1.8.1, 1.9.0, 1.9.1, 1.10.0, 1.10.1, 1.10.2, 1.11.0) -> ERROR: No matching distribution found for torch==1.8.1+cu111 - -安装时使用如下命令: - -```shell -pip install -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt -``` -### 4. zsh: no matches found: modelscope-0.2.2-py3-none-any.whl[all] -mac终端的zsh 对于[]需要做转义,执行如下命令 -```shell -pip install modelscope\[all\] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` diff --git a/docs/source/index.rst b/docs/source/index.rst index aba54341..065ea469 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,25 +3,24 @@ contain the root `toctree` directive. ModelScope DOCUMENTATION -======================================= - -ModelScope doc +======================== .. toctree:: :maxdepth: 2 - :caption: USER GUIDE + :caption: DEVELOPER GUIDE - quick_start.md develop.md - faq.md .. toctree:: :maxdepth: 2 - :caption: Tutorials - - tutorials/index - + :caption: API Doc + Hub + Model + Preprocessor + Pipeline + Trainer + MsDataset .. toctree:: :maxdepth: 2 @@ -29,21 +28,6 @@ ModelScope doc change_log.md -.. toctree:: -.. :maxdepth: 10 -.. :caption: API Doc - -.. api/modelscope.preprocessors -.. api/modelscope.models -.. api/modelscope.pipelines -.. api/modelscope.fileio -.. api/modelscope.utils -.. api/modelscope.hub -.. api/modelscope.msdatasets -.. api/modelscope.tools -.. 
api/modelscope.trainers - - Indices and tables ================== * :ref:`genindex` diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md deleted file mode 100644 index 7cefa048..00000000 --- a/docs/source/quick_start.md +++ /dev/null @@ -1,118 +0,0 @@ -ModelScope Library目前支持tensorflow,pytorch深度学习框架进行模型训练、推理, 在Python 3.7+, Pytorch 1.8+, Tensorflow1.15,Tensorflow 2.x上测试可运行。 - -**注: **`**语音相关**`**的功能仅支持 python3.7,tensorflow1.15的**`**linux**`**环境使用。 其他功能可以在windows、mac上安装使用。** - -## python环境配置 - -首先,参考[文档](https://docs.anaconda.com/anaconda/install/) 安装配置Anaconda环境。 -安装完成后,执行如下命令为modelscope library创建对应的python环境。 - -```shell -conda create -n modelscope python=3.7 -conda activate modelscope -``` - -## 安装深度学习框架 - -- 安装pytorch[参考链接](https://pytorch.org/get-started/locally/)。 - -```shell -pip3 install torch torchvision torchaudio -``` - -- 安装Tensorflow[参考链接](https://www.tensorflow.org/install/pip)。 - -```shell -pip install --upgrade tensorflow -``` - -## ModelScope library 安装 - -注: 如果在安装过程中遇到错误,请前往[常见问题](faq.md)查找解决方案。 - -### pip安装 -执行如下命令可以安装所有领域依赖: -```shell -pip install "modelscope[cv,nlp,audio,multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - -如仅需体验`语音功能`,请执行如下命令: -```shell -pip install "modelscope[audio]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - -如仅需体验CV功能,可执行如下命令安装依赖: -```shell -pip install "modelscope[cv]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - -如仅需体验NLP功能,可执行如下命令安装依赖: -```shell -pip install "modelscope[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - -如仅需体验多模态功能,可执行如下命令安装依赖: -```shell -pip install "modelscope[multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` -**注**: - -1. `**语音相关**`**的功能仅支持 python3.7,tensorflow1.15的**`**linux**`**环境使用。 其他功能可以在windows、mac上安装使用。** - -2. 语音领域中一部分模型使用了三方库SoundFile进行wav文件处理,**在Linux系统上用户需要手动安装SoundFile的底层依赖库libsndfile**,在Windows和MacOS上会自动安装不需要用户操作。详细信息可参考[SoundFile官网](https://github.com/bastibe/python-soundfile#installation)。以Ubuntu系统为>例,用户需要执行如下命令: - - ```shell - sudo apt-get update - sudo apt-get install libsndfile1 - ``` - -3. 
**CV功能使用需要安装mmcv-full, 请参考mmcv**[**安装手册**](https://github.com/open-mmlab/mmcv#installation)**进行安装** - -### 使用源码安装 - -适合本地开发调试使用,修改源码后可以直接执行。 -ModelScope的源码可以直接clone到本地: - -```shell -git clone git@github.com:modelscope/modelscope.git -cd modelscope -git fetch origin master -git checkout master - -``` - - -安装依赖 -如需安装所有依赖,请执行如下命令 -```shell -pip install -e ".[cv,nlp,audio,multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - - - -如需体验`语音功能`,请单独执行如下命令: -```shell -pip install -e ".[audio]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - -如仅需体验CV功能,可执行如下命令安装依赖: -```shell -pip install -e ".[cv]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` -如仅需体验NLP功能,可执行如下命令安装依赖: -```shell -pip install -e ".[nlp]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - -如仅需体验多模态功能,可执行如下命令安装依赖: -```shell -pip install -e ".[multi-modal]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -``` - -### 安装验证 - -安装成功后,可以执行如下命令进行验证安装是否正确: - -```shell -python -c "from modelscope.pipelines import pipeline;print(pipeline('word-segmentation')('今天天气不错,适合 出去游玩'))" -``` diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst deleted file mode 100644 index 9d8528c2..00000000 --- a/docs/source/tutorials/index.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. toctree:: - :maxdepth: 2 - :caption: Tutorials - - pipeline.md - trainer.md diff --git a/docs/source/tutorials/pipeline.md b/docs/source/tutorials/pipeline.md deleted file mode 100644 index ebdc06f3..00000000 --- a/docs/source/tutorials/pipeline.md +++ /dev/null @@ -1,61 +0,0 @@ -# Pipeline使用教程 -本文简单介绍如何使用`pipeline`函数加载模型进行推理。`pipeline`函数支持按照任务类型、模型名称从模型仓库拉取模型进行进行推理,包含以下几个方面: -* 使用pipeline()函数进行推理 -* 指定特定预处理、特定模型进行推理 -* 不同场景推理任务示例 -## 环境准备 -详细步骤可以参考 [快速开始](../quick_start.md) -## Pipeline基本用法 -下面以中文分词任务为例,说明pipeline函数的基本用法 - -1. pipeline函数支持指定特定任务名称,加载任务默认模型,创建对应pipeline对象 - 执行如下python代码 - ```python - from modelscope.pipelines import pipeline - word_segmentation = pipeline('word-segmentation') - ``` - -2. 输入文本 - ``` python - input = '今天天气不错,适合出去游玩' - print(word_segmentation(input)) - {'output': '今天 天气 不错 , 适合 出去 游玩'} - ``` - -3. 输入多条样本 - -pipeline对象也支持传入多个样本列表输入,返回对应输出列表,每个元素对应输入样本的返回结果 - - ```python - inputs = ['今天天气不错,适合出去游玩','这本书很好,建议你看看'] - print(word_segmentation(inputs)) - [{'output': '今天 天气 不错 , 适合 出去 游玩'}, {'output': '这 本 书 很 好 , 建议 你 看看'}] - ``` -## 指定预处理、模型进行推理 -pipeline函数支持传入实例化的预处理对象、模型对象,从而支持用户在推理过程中定制化预处理、模型。 - -1. 首先,创建预处理方法和模型 -```python -from modelscope.models import Model -from modelscope.preprocessors import TokenClassificationPreprocessor -model = Model.from_pretrained('damo/nlp_structbert_word-segmentation_chinese-base') -tokenizer = TokenClassificationPreprocessor(model.model_dir) -``` - -2. 
使用tokenizer和模型对象创建pipeline -```python -from modelscope.pipelines import pipeline -word_seg = pipeline('word-segmentation', model=model, preprocessor=tokenizer) -input = '今天天气不错,适合出去游玩' -print(word_seg(input)) -{'output': '今天 天气 不错 , 适合 出去 游玩'} -``` -## 不同场景任务推理示例 -下面以一个图像任务:人像抠图('image-matting')为例,进一步说明pipeline的用法 -```python -import cv2 -from modelscope.pipelines import pipeline -img_matting = pipeline('image-matting') -result = img_matting('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matting.png') -cv2.imwrite('result.png', result['output_png']) -``` diff --git a/docs/source/tutorials/trainer.md b/docs/source/tutorials/trainer.md deleted file mode 100644 index 1dfdb9cf..00000000 --- a/docs/source/tutorials/trainer.md +++ /dev/null @@ -1,54 +0,0 @@ -# Trainer使用教程 -Modelscope提供了众多预训练模型,你可以使用其中任意一个,利用公开数据集或者私有数据集针对特定任务进行模型训练,在本篇文章中将介绍如何使用Modelscope的`Trainer`模块进行Finetuning和评估。 - -## 环境准备 -详细步骤可以参考 [快速开始](../quick_start.md) - -### 准备数据集 - -在开始Finetuning前,需要准备一个数据集用以训练和评估,详细可以参考数据集使用教程。 - -```python -from datasets import Dataset -train_dataset = MsDataset.load'afqmc_small', namespace='modelscope', split='train') -eval_dataset = MsDataset.load('afqmc_small', namespace='modelscope', split='validation') -``` -### 训练 -ModelScope把所有训练相关的配置信息全部放到了模型仓库下的`configuration.json`中,因此我们只需要创建Trainer,加载配置文件,传入数据集即可完成训练。 - -首先,通过工厂方法创建Trainer, 需要传入模型仓库路径, 训练数据集对象,评估数据集对象,训练目录 -```python -kwargs = dict( - model='damo/nlp_structbert_sentiment-classification_chinese-base', - train_dataset=train_dataset, - eval_dataset=eval_dataset, - work_dir='work_dir') - -trainer = build_trainer(default_args=kwargs) -``` - -启动训练。 -```python -trainer.train() -``` - -如果需要调整训练参数,可以在模型仓库页面下载`configuration.json`文件到本地,修改参数后,指定配置文件路径,创建trainer -```python -kwargs = dict( - model='damo/nlp_structbert_sentiment-classification_chinese-base', - train_dataset=train_dataset, - eval_dataset=eval_dataset, - cfg_file='你的配置文件路径' - work_dir='work_dir') - -trainer = build_trainer(default_args=kwargs) -trainer.train() -``` - - -### 评估 -训练过程中会定期使用验证集进行评估测试, Trainer模块也支持指定特定轮次保存的checkpoint路径,进行单次评估。 -```python -eval_results = trainer.evaluate('work_dir/epoch_10.pth') -print(eval_results) -``` diff --git a/examples/pytorch/finetune_image_classification.py b/examples/pytorch/finetune_image_classification.py new file mode 100644 index 00000000..b5c2f651 --- /dev/null +++ b/examples/pytorch/finetune_image_classification.py @@ -0,0 +1,86 @@ +import os + +from modelscope.metainfo import Trainers +from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.trainers.builder import build_trainer +from modelscope.trainers.training_args import (ArgAttr, CliArgumentParser, + training_args) + + +def define_parser(): + training_args.num_classes = ArgAttr( + cfg_node_name=[ + 'model.mm_model.head.num_classes', + 'model.mm_model.train_cfg.augments.0.num_classes', + 'model.mm_model.train_cfg.augments.1.num_classes' + ], + type=int, + help='number of classes') + + training_args.train_batch_size.default = 16 + training_args.train_data_worker.default = 1 + training_args.max_epochs.default = 1 + training_args.optimizer.default = 'AdamW' + training_args.lr.default = 1e-4 + training_args.warmup_iters = ArgAttr( + 'train.lr_config.warmup_iters', + type=int, + default=1, + help='number of warmup epochs') + training_args.topk = ArgAttr( + cfg_node_name=[ + 'train.evaluation.metric_options.topk', + 'evaluation.metric_options.topk' + ], + default=(1, ), + help='evaluation using topk, tuple format, eg (1,), (1,5)') + + training_args.train_data = 
ArgAttr( + type=str, default='tany0699/cats_and_dogs', help='train dataset') + training_args.validation_data = ArgAttr( + type=str, default='tany0699/cats_and_dogs', help='validation dataset') + training_args.model_id = ArgAttr( + type=str, + default='damo/cv_vit-base_image-classification_ImageNet-labels', + help='model name') + + parser = CliArgumentParser(training_args) + return parser + + +def create_dataset(name, split): + namespace, dataset_name = name.split('/') + return MsDataset.load( + dataset_name, namespace=namespace, subset_name='default', split=split) + + +def train(parser): + cfg_dict = parser.get_cfg_dict() + args = parser.args + train_dataset = create_dataset(args.train_data, split='train') + val_dataset = create_dataset(args.validation_data, split='validation') + + def cfg_modify_fn(cfg): + cfg.merge_from_dict(cfg_dict) + return cfg + + kwargs = dict( + model=args.model_id, # model id + train_dataset=train_dataset, # training dataset + eval_dataset=val_dataset, # validation dataset + cfg_modify_fn=cfg_modify_fn # callback to modify configuration + ) + + # in distributed training, specify pytorch launcher + if 'MASTER_ADDR' in os.environ: + kwargs['launcher'] = 'pytorch' + + trainer = build_trainer( + name=Trainers.image_classification, default_args=kwargs) + # start to train + trainer.train() + + +if __name__ == '__main__': + parser = define_parser() + train(parser) diff --git a/examples/pytorch/run_train.sh b/examples/pytorch/run_train.sh new file mode 100644 index 00000000..2093fa09 --- /dev/null +++ b/examples/pytorch/run_train.sh @@ -0,0 +1,5 @@ +PYTHONPATH=. python -m torch.distributed.launch --nproc_per_node=2 \ + examples/pytorch/finetune_image_classification.py \ + --num_classes 2 \ + --train_data 'tany0699/cats_and_dogs' \ + --validation_data 'tany0699/cats_and_dogs'
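For readers of this patch, here is a minimal sketch of the same image-classification fine-tune without the CLI layer. It reuses only names defined in the new example script (`MsDataset.load`, `build_trainer`, `Trainers.image_classification`, `cfg_modify_fn`, the default model and dataset ids); the hard-coded `cfg_dict` is an assumed stand-in for what `CliArgumentParser.get_cfg_dict()` would yield for `--num_classes 2`, mirroring the `ArgAttr.cfg_node_name` entries above.

```python
# Hedged sketch, not part of the patch: the fine-tune from
# examples/pytorch/finetune_image_classification.py with the argument-parser
# layer removed. Model and dataset ids are the defaults from define_parser().
from modelscope.metainfo import Trainers
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.trainers.builder import build_trainer

train_dataset = MsDataset.load(
    'cats_and_dogs', namespace='tany0699', subset_name='default', split='train')
val_dataset = MsDataset.load(
    'cats_and_dogs', namespace='tany0699', subset_name='default', split='validation')

# Assumed stand-in for CliArgumentParser.get_cfg_dict() with --num_classes 2:
# every node listed in the corresponding ArgAttr.cfg_node_name gets the value.
cfg_dict = {
    'model.mm_model.head.num_classes': 2,
    'model.mm_model.train_cfg.augments.0.num_classes': 2,
    'model.mm_model.train_cfg.augments.1.num_classes': 2,
}


def cfg_modify_fn(cfg):
    # Merge the dotted keys into the Config loaded from the model repo's
    # configuration.json before the trainer is built.
    cfg.merge_from_dict(cfg_dict)
    return cfg


trainer = build_trainer(
    name=Trainers.image_classification,
    default_args=dict(
        model='damo/cv_vit-base_image-classification_ImageNet-labels',
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        cfg_modify_fn=cfg_modify_fn))
trainer.train()
```

The design choice worth noting is that a single CLI flag can fan out to several configuration nodes, which is why `ArgAttr.cfg_node_name` accepts a list of dotted paths.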
diff --git a/modelscope/exporters/__init__.py b/modelscope/exporters/__init__.py index a597114f..48bab33a 100644 --- a/modelscope/exporters/__init__.py +++ b/modelscope/exporters/__init__.py @@ -1,4 +1,5 @@ from .base import Exporter from .builder import build_exporter from .nlp import SbertForSequenceClassificationExporter +from .tf_model_exporter import TfModelExporter from .torch_model_exporter import TorchModelExporter diff --git a/modelscope/exporters/base.py b/modelscope/exporters/base.py index c8b7900e..bf190660 100644 --- a/modelscope/exporters/base.py +++ b/modelscope/exporters/base.py @@ -1,10 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os from abc import ABC, abstractmethod +from typing import Dict, Union from modelscope.models import Model from modelscope.utils.config import Config, ConfigDict from modelscope.utils.constant import ModelFile +from modelscope.utils.hub import snapshot_download from .builder import build_exporter @@ -12,36 +14,43 @@ class Exporter(ABC): """Exporter base class to output model to onnx, torch_script, graphdef, etc. """ - def __init__(self): - self.model = None + def __init__(self, model=None): + self.model = model @classmethod - def from_model(cls, model: Model, **kwargs): + def from_model(cls, model: Union[Model, str], **kwargs): """Build the Exporter instance. Args: - model: A Model instance. it will be used to generate the intermediate format file, - and the configuration.json in its model_dir field will be used to create the exporter instance. + model: A Model instance or a model id or a model dir, the configuration.json file besides to which + will be used to create the exporter instance. kwargs: Extra kwargs used to create the Exporter instance. Returns: The Exporter instance """ + if isinstance(model, str): + model = Model.from_pretrained(model) + + assert hasattr(model, 'model_dir') + model_dir = model.model_dir cfg = Config.from_file( - os.path.join(model.model_dir, ModelFile.CONFIGURATION)) + os.path.join(model_dir, ModelFile.CONFIGURATION)) task_name = cfg.task + if hasattr(model, 'group_key'): + task_name = model.group_key model_cfg = cfg.model if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'): model_cfg.type = model_cfg.model_type export_cfg = ConfigDict({'type': model_cfg.type}) if hasattr(cfg, 'export'): export_cfg.update(cfg.export) + export_cfg['model'] = model exporter = build_exporter(export_cfg, task_name, kwargs) - exporter.model = model return exporter @abstractmethod - def export_onnx(self, outputs: str, opset=11, **kwargs): + def export_onnx(self, output_dir: str, opset=13, **kwargs): """Export the model as onnx format files. In some cases, several files may be generated, @@ -49,7 +58,7 @@ class Exporter(ABC): Args: opset: The version of the ONNX operator set to use. - outputs: The output dir. + output_dir: The output dir. kwargs: In this default implementation, kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape). 
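Since `Exporter.from_model` now also accepts a model id or a local model directory in addition to a `Model` instance, and `export_onnx` takes an `output_dir` with a default opset of 13, a typical call after this change looks roughly like the sketch below. The model id and output path are illustrative placeholders, and the returned name-to-path mapping follows the convention of the exporter implementations in this patch (e.g. `{'model': '<output_dir>/model.onnx'}`).

```python
# Hedged usage sketch of the updated Exporter API; the model id and output
# directory are placeholders, not values taken from the patch.
import os

from modelscope.exporters import Exporter

output_dir = '/tmp/structbert_onnx'
os.makedirs(output_dir, exist_ok=True)

# from_model resolves the registered exporter from the configuration.json
# that sits beside the downloaded model.
exporter = Exporter.from_model(
    'damo/nlp_structbert_sentence-similarity_chinese-base')
result = exporter.export_onnx(output_dir=output_dir, opset=13)
print(result)  # expected to map output names to generated files
```

Passing a `Model` instance still works; the string form simply triggers `Model.from_pretrained` internally, as the diff above shows.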
diff --git a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py index 7a11f73a..802e92a2 100644 --- a/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py +++ b/modelscope/exporters/nlp/sbert_for_sequence_classification_exporter.py @@ -8,11 +8,20 @@ from modelscope.exporters.builder import EXPORTERS from modelscope.exporters.torch_model_exporter import TorchModelExporter from modelscope.metainfo import Models from modelscope.preprocessors import ( - TextClassificationTransformersPreprocessor, build_preprocessor) -from modelscope.utils.config import Config + Preprocessor, TextClassificationTransformersPreprocessor, + build_preprocessor) from modelscope.utils.constant import ModeKeys, Tasks +@EXPORTERS.register_module(Tasks.text_classification, module_name=Models.bert) +@EXPORTERS.register_module( + Tasks.text_classification, module_name=Models.structbert) +@EXPORTERS.register_module(Tasks.sentence_similarity, module_name=Models.bert) +@EXPORTERS.register_module( + Tasks.zero_shot_classification, module_name=Models.bert) +@EXPORTERS.register_module( + Tasks.sentiment_classification, module_name=Models.bert) +@EXPORTERS.register_module(Tasks.nli, module_name=Models.bert) @EXPORTERS.register_module( Tasks.sentence_similarity, module_name=Models.structbert) @EXPORTERS.register_module( @@ -38,14 +47,9 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): Dummy inputs. """ - cfg = Config.from_file( - os.path.join(self.model.model_dir, 'configuration.json')) - field_name = Tasks.find_field_by_task(cfg.task) - if 'type' not in cfg.preprocessor and 'val' in cfg.preprocessor: - cfg = cfg.preprocessor.val - else: - cfg = cfg.preprocessor - + assert hasattr( + self.model, 'model_dir' + ), 'model_dir attribute is required to build the preprocessor' batch_size = 1 sequence_length = {} if shape is not None: @@ -55,13 +59,11 @@ class SbertForSequenceClassificationExporter(TorchModelExporter): batch_size, max_length = shape sequence_length = {'sequence_length': max_length} - cfg.update({ - 'model_dir': self.model.model_dir, - 'mode': ModeKeys.TRAIN, - **sequence_length - }) - preprocessor: TextClassificationTransformersPreprocessor = build_preprocessor( - cfg, field_name) + preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, + preprocessor_mode=ModeKeys.TRAIN, + task=Tasks.text_classification, + **sequence_length) if pair: first_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token second_sequence = preprocessor.nlp_tokenizer.tokenizer.unk_token diff --git a/modelscope/exporters/tf_model_exporter.py b/modelscope/exporters/tf_model_exporter.py new file mode 100644 index 00000000..3035b4ce --- /dev/null +++ b/modelscope/exporters/tf_model_exporter.py @@ -0,0 +1,114 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Callable, Dict, Mapping + +import tensorflow as tf + +from modelscope.outputs import ModelOutputBase +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.regress_test_utils import compare_arguments_nested +from .base import Exporter + +logger = get_logger() + + +class TfModelExporter(Exporter): + + def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]: + """Generate dummy inputs for model exportation to onnx or other formats by tracing. + + Returns: + Dummy inputs that matches the specific model input, the matched preprocessor can be used here. 
+ """ + return None + + def export_onnx(self, output_dir: str, opset=13, **kwargs): + model = self.model if 'model' not in kwargs else kwargs.pop('model') + onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE) + self._tf2_export_onnx(model, onnx_file, opset=opset, **kwargs) + return {'model': onnx_file} + + def _tf2_export_onnx(self, + model, + output: str, + opset: int = 13, + validation: bool = True, + rtol: float = None, + atol: float = None, + call_func: Callable = None, + **kwargs): + logger.info( + 'Important: This exporting function only supports models of tf2.0 or above.' + ) + import onnx + import tf2onnx + dummy_inputs = self.generate_dummy_inputs( + **kwargs) if 'dummy_inputs' not in kwargs else kwargs.pop( + 'dummy_inputs') + if dummy_inputs is None: + raise NotImplementedError( + 'Model property dummy_inputs,inputs,outputs must be set.') + + input_signature = [ + tf.TensorSpec.from_tensor(tensor, name=key) + for key, tensor in dummy_inputs.items() + ] + onnx_model, _ = tf2onnx.convert.from_keras( + model, input_signature, opset=opset) + onnx.save(onnx_model, output) + + if validation: + try: + import onnx + import onnxruntime as ort + except ImportError: + logger.warn( + 'Cannot validate the exported onnx file, because ' + 'the installation of onnx or onnxruntime cannot be found') + return + + def tensor_nested_numpify(tensors): + if isinstance(tensors, (list, tuple)): + return type(tensors)( + tensor_nested_numpify(t) for t in tensors) + if isinstance(tensors, Mapping): + # return dict + return { + k: tensor_nested_numpify(t) + for k, t in tensors.items() + } + if isinstance(tensors, tf.Tensor): + t = tensors.cpu() + return t.numpy() + return tensors + + onnx_model = onnx.load(output) + onnx.checker.check_model(onnx_model) + ort_session = ort.InferenceSession(output) + outputs_origin = call_func( + dummy_inputs) if call_func is not None else model(dummy_inputs) + if isinstance(outputs_origin, (Mapping, ModelOutputBase)): + outputs_origin = list( + tensor_nested_numpify(outputs_origin).values()) + elif isinstance(outputs_origin, (tuple, list)): + outputs_origin = list(tensor_nested_numpify(outputs_origin)) + outputs = ort_session.run( + None, + tensor_nested_numpify(dummy_inputs), + ) + outputs = tensor_nested_numpify(outputs) + if isinstance(outputs, dict): + outputs = list(outputs.values()) + elif isinstance(outputs, tuple): + outputs = list(outputs) + + tols = {} + if rtol is not None: + tols['rtol'] = rtol + if atol is not None: + tols['atol'] = atol + if not compare_arguments_nested('Onnx model output match failed', + outputs, outputs_origin, **tols): + raise RuntimeError( + 'export onnx failed because of validation error.') diff --git a/modelscope/fileio/file.py b/modelscope/fileio/file.py index 93329d2e..d93f24c9 100644 --- a/modelscope/fileio/file.py +++ b/modelscope/fileio/file.py @@ -84,7 +84,8 @@ class LocalStorage(Storage): """ dirname = os.path.dirname(filepath) if dirname and not os.path.exists(dirname): - os.makedirs(dirname) + os.makedirs(dirname, exist_ok=True) + with open(filepath, 'wb') as f: f.write(obj) @@ -106,7 +107,8 @@ class LocalStorage(Storage): """ dirname = os.path.dirname(filepath) if dirname and not os.path.exists(dirname): - os.makedirs(dirname) + os.makedirs(dirname, exist_ok=True) + with open(filepath, 'w', encoding=encoding) as f: f.write(obj) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index c408f072..c56d16e0 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, 
Inc. and its affiliates. - # yapf: disable + import datetime import functools import os @@ -39,13 +39,13 @@ from modelscope.hub.errors import (InvalidParameter, NotExistError, raise_for_http_status, raise_on_error) from modelscope.hub.git import GitCommandWrapper from modelscope.hub.repository import Repository -from modelscope.utils.config_ds import DOWNLOADED_DATASETS_PATH from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DEFAULT_MODEL_REVISION, DEFAULT_REPOSITORY_REVISION, MASTER_MODEL_BRANCH, DatasetFormations, - DatasetMetaFormats, DownloadChannel, - DownloadMode, ModelFile) + DatasetMetaFormats, + DatasetVisibilityMap, DownloadChannel, + ModelFile) from modelscope.utils.logger import get_logger from .utils.utils import (get_endpoint, get_release_datetime, model_id_to_group_owner_name) @@ -54,38 +54,51 @@ logger = get_logger() class HubApi: + """Model hub api interface. + """ + def __init__(self, endpoint: Optional[str] = None): + """The ModelScope HubApi。 - def __init__(self, endpoint=None): + Args: + endpoint (str, optional): The modelscope server http|https address. Defaults to None. + """ self.endpoint = endpoint if endpoint is not None else get_endpoint() self.headers = {'user-agent': ModelScopeConfig.get_user_agent()} self.session = Session() - retry = Retry(total=2, read=2, connect=2, backoff_factor=1, - status_forcelist=(500, 502, 503, 504),) + retry = Retry( + total=2, + read=2, + connect=2, + backoff_factor=1, + status_forcelist=(500, 502, 503, 504), + ) adapter = HTTPAdapter(max_retries=retry) self.session.mount('http://', adapter) self.session.mount('https://', adapter) # set http timeout for method in REQUESTS_API_HTTP_METHOD: - setattr(self.session, - method, - functools.partial(getattr(self.session, method), timeout=API_HTTP_CLIENT_TIMEOUT)) + setattr( + self.session, method, + functools.partial( + getattr(self.session, method), + timeout=API_HTTP_CLIENT_TIMEOUT)) def login( self, access_token: str, ) -> tuple(): - """ - Login with username and password + """Login with your SDK access token, which can be obtained from + https://www.modelscope.cn user center. Args: - access_token(`str`): user access token on modelscope. + access_token (str): user access token on modelscope. + Returns: cookies: to authenticate yourself to ModelScope open-api - gitlab token: to access private repos + git_token: token to access your git repository. - + Note: You only have to login once within 30 days. - """ path = f'{self.endpoint}/api/v1/login' r = self.session.post( @@ -107,27 +120,28 @@ class HubApi: return d[API_RESPONSE_FIELD_DATA][ API_RESPONSE_FIELD_GIT_ACCESS_TOKEN], cookies - def create_model( - self, - model_id: str, - visibility: str, - license: str, - chinese_name: Optional[str] = None, - ) -> str: - """ - Create model repo at ModelScopeHub + def create_model(self, + model_id: str, + visibility: Optional[int] = ModelVisibility.PUBLIC, + license: Optional[str] = Licenses.APACHE_V2, + chinese_name: Optional[str] = None) -> str: + """Create model repo at ModelScopeHub. Args: - model_id:(`str`): The model id - visibility(`int`): visibility of the model(1-private, 5-public), default public. - license(`str`): license of the model, default none. - chinese_name(`str`, *optional*): chinese name of the model - Returns: - name of the model created + model_id (str): The model id + visibility (int, optional): visibility of the model(1-private, 5-public), default 5. + license (str, optional): license of the model, default none. 
+ chinese_name (str, optional): chinese name of the model. - + Returns: + Name of the model created + + Raises: + InvalidParameter: If model_id is invalid. + ValueError: If not login. + + Note: model_id = {owner}/{name} - """ if model_id is None: raise InvalidParameter('model_id is required!') @@ -151,14 +165,17 @@ class HubApi: model_repo_url = f'{get_endpoint()}/{model_id}' return model_repo_url - def delete_model(self, model_id): - """_summary_ + def delete_model(self, model_id: str): + """Delete model_id from ModelScope. Args: model_id (str): The model id. - + + Raises: + ValueError: If not login. + + Note: model_id = {owner}/{name} - """ cookies = ModelScopeConfig.get_cookies() if cookies is None: @@ -169,27 +186,28 @@ class HubApi: raise_for_http_status(r) raise_on_error(r.json()) - def get_model_url(self, model_id): + def get_model_url(self, model_id: str): return f'{self.endpoint}/api/v1/models/{model_id}.git' def get_model( self, model_id: str, - revision: str = DEFAULT_MODEL_REVISION, + revision: Optional[str] = DEFAULT_MODEL_REVISION, ) -> str: - """ - Get model information at modelscope_hub + """Get model information at ModelScope Args: - model_id(`str`): The model id. - revision(`str`): revision of model + model_id (str): The model id. + revision (str optional): revision of model. + Returns: The model detail information. + Raises: NotExistError: If the model is not exist, will throw NotExistError - + + Note: model_id = {owner}/{name} - """ cookies = ModelScopeConfig.get_cookies() owner_or_group, name = model_id_to_group_owner_name(model_id) @@ -211,13 +229,12 @@ class HubApi: def push_model(self, model_id: str, model_dir: str, - visibility: int = ModelVisibility.PUBLIC, - license: str = Licenses.APACHE_V2, + visibility: Optional[int] = ModelVisibility.PUBLIC, + license: Optional[str] = Licenses.APACHE_V2, chinese_name: Optional[str] = None, commit_message: Optional[str] = 'upload model', revision: Optional[str] = DEFAULT_REPOSITORY_REVISION): - """ - Upload model from a given directory to given repository. A valid model directory + """Upload model from a given directory to given repository. A valid model directory must contain a configuration.json file. This function upload the files in given directory to given repository. If the @@ -229,11 +246,11 @@ class HubApi: which can be obtained from ModelScope's website. Args: - model_id (`str`): + model_id (str): The model id to be uploaded, caller must have write permission for it. - model_dir(`str`): + model_dir(str): The Absolute Path of the finetune result. - visibility(`int`, defaults to `0`): + visibility(int, optional): Visibility of the new created model(1-private, 5-public). If the model is not exists in ModelScope, this function will create a new model with this visibility and this parameter is required. You can ignore this parameter @@ -250,6 +267,12 @@ class HubApi: revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION): which branch to push. If the branch is not exists, It will create a new branch and push to it. + + Raises: + InvalidParameter: Parameter invalid. + NotLoginException: Not login + ValueError: No configuration.json + Exception: Create failed. 
""" if model_id is None: raise InvalidParameter('model_id cannot be empty!') @@ -305,7 +328,10 @@ class HubApi: date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') commit_message = '[automsg] push model %s to hub at %s' % ( model_id, date) - repo.push(commit_message=commit_message, local_branch=revision, remote_branch=revision) + repo.push( + commit_message=commit_message, + local_branch=revision, + remote_branch=revision) except Exception: raise finally: @@ -313,14 +339,18 @@ class HubApi: def list_models(self, owner_or_group: str, - page_number=1, - page_size=10) -> dict: + page_number: Optional[int] = 1, + page_size: Optional[int] = 10) -> dict: """List models in owner or group. Args: - owner_or_group(`str`): owner or group. - page_number(`int`): The page number, default: 1 - page_size(`int`): The page size, default: 10 + owner_or_group(str): owner or group. + page_number(int, optional): The page number, default: 1 + page_size(int, optional): The page size, default: 10 + + Raises: + RequestError: The request error. + Returns: dict: {"models": "list of models", "TotalCount": total_number_of_models_in_owner_or_group} """ @@ -358,7 +388,7 @@ class HubApi: def list_model_revisions( self, model_id: str, - cutoff_timestamp: int = None, + cutoff_timestamp: Optional[int] = None, use_cookies: Union[bool, CookieJar] = False) -> List[str]: """Get model branch and tags. @@ -368,6 +398,7 @@ class HubApi: The timestamp is represented by the seconds elasped from the epoch time. use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, will will load cookie from local. Defaults to False. + Returns: Tuple[List[str], List[str]]: Return list of branch name and tags """ @@ -385,7 +416,10 @@ class HubApi: ] if info['RevisionMap']['Tags'] else [] return tags - def get_valid_revision(self, model_id: str, revision=None, cookies: Optional[CookieJar] = None): + def get_valid_revision(self, + model_id: str, + revision=None, + cookies: Optional[CookieJar] = None): release_timestamp = get_release_datetime() current_timestamp = int(round(datetime.datetime.now().timestamp())) # for active development in library codes (non-release-branches), release_timestamp @@ -396,27 +430,36 @@ class HubApi: model_id, use_cookies=False if cookies is None else cookies) if revision is None: revision = MASTER_MODEL_BRANCH - logger.info('Model revision not specified, use default: %s in development mode' % revision) + logger.info( + 'Model revision not specified, use default: %s in development mode' + % revision) if revision not in branches and revision not in tags: - raise NotExistError('The model: %s has no branch or tag : %s .' % revision) + raise NotExistError('The model: %s has no revision : %s .' % (model_id, revision)) logger.info('Development mode use revision: %s' % revision) else: if revision is None: # user not specified revision, use latest revision before release time revisions = self.list_model_revisions( - model_id, cutoff_timestamp=release_timestamp, use_cookies=False if cookies is None else cookies) + model_id, + cutoff_timestamp=release_timestamp, + use_cookies=False if cookies is None else cookies) if len(revisions) == 0: - raise NoValidRevisionError('The model: %s has no valid revision!' % model_id) + raise NoValidRevisionError( + 'The model: %s has no valid revision!' 
% model_id) # tags (revisions) returned from backend are guaranteed to be ordered by create-time # we shall obtain the latest revision created earlier than release version of this branch revision = revisions[0] - logger.info('Model revision not specified, use the latest revision: %s' % revision) + logger.info( + 'Model revision not specified, use the latest revision: %s' + % revision) else: # use user-specified revision revisions = self.list_model_revisions( - model_id, cutoff_timestamp=current_timestamp, use_cookies=False if cookies is None else cookies) + model_id, + cutoff_timestamp=current_timestamp, + use_cookies=False if cookies is None else cookies) if revision not in revisions: - raise NotExistError( - 'The model: %s has no revision: %s !' % (model_id, revision)) + raise NotExistError('The model: %s has no revision: %s !' % + (model_id, revision)) logger.info('Use user-specified model revision: %s' % revision) return revision @@ -431,6 +474,7 @@ class HubApi: model_id (str): The model id use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, will will load cookie from local. Defaults to False. + Returns: Tuple[List[str], List[str]]: Return list of branch name and tags """ @@ -466,9 +510,6 @@ class HubApi: will load cookie from local. Defaults to False. headers: request headers - Raises: - ValueError: If user_cookies is True, but no local cookie. - Returns: List[dict]: Model file list. """ @@ -505,25 +546,8 @@ class HubApi: dataset_list = r.json()[API_RESPONSE_FIELD_DATA] return [x['Name'] for x in dataset_list] - def fetch_dataset_scripts( - self, - dataset_name: str, - namespace: str, - download_mode: Optional[DownloadMode], - revision: Optional[str] = DEFAULT_DATASET_REVISION): - if namespace is None: - raise ValueError( - f'Dataset from Hubs.modelscope should have a valid "namespace", but get {namespace}' - ) - revision = revision or DEFAULT_DATASET_REVISION - cache_dir = os.path.join(DOWNLOADED_DATASETS_PATH, namespace, - dataset_name, revision) - download_mode = DownloadMode(download_mode - or DownloadMode.REUSE_DATASET_IF_EXISTS) - if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists( - cache_dir): - shutil.rmtree(cache_dir) - os.makedirs(cache_dir, exist_ok=True) + def get_dataset_id_and_type(self, dataset_name: str, namespace: str): + """ Get the dataset id and type. """ datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' cookies = ModelScopeConfig.get_cookies() r = self.session.get(datahub_url, cookies=cookies) @@ -531,8 +555,15 @@ class HubApi: datahub_raise_on_error(datahub_url, resp) dataset_id = resp['Data']['Id'] dataset_type = resp['Data']['Type'] + return dataset_id, dataset_type + + def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_id: str, revision: str): + """ Get the meta file-list of the dataset. 
""" datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' + cookies = ModelScopeConfig.get_cookies() r = self.session.get(datahub_url, cookies=cookies, headers=self.headers) + r = self.session.get( + datahub_url, cookies=cookies, headers=self.headers) resp = r.json() datahub_raise_on_error(datahub_url, resp) file_list = resp['Data'] @@ -542,9 +573,23 @@ class HubApi: f'version = {revision}] dose not exist') file_list = file_list['Files'] + return file_list + + def get_dataset_meta_files_local_paths(self, dataset_name: str, + namespace: str, + revision: str, + meta_cache_dir: str, dataset_type: int, file_list: list): local_paths = defaultdict(list) dataset_formation = DatasetFormations(dataset_type) dataset_meta_format = DatasetMetaFormats[dataset_formation] + cookies = ModelScopeConfig.get_cookies() + + # Dump the data_type as a local file + dataset_type_file_path = os.path.join(meta_cache_dir, + f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}') + with open(dataset_type_file_path, 'w') as fp: + fp.write('*** Automatically-generated file, do not modify ***') + for file_info in file_list: file_path = file_info['Path'] extension = os.path.splitext(file_path)[-1] @@ -553,7 +598,7 @@ class HubApi: f'Revision={revision}&FilePath={file_path}' r = self.session.get(datahub_url, cookies=cookies) raise_for_http_status(r) - local_path = os.path.join(cache_dir, file_path) + local_path = os.path.join(meta_cache_dir, file_path) if os.path.exists(local_path): logger.warning( f"Reusing dataset {dataset_name}'s python file ({local_path})" @@ -564,14 +609,14 @@ class HubApi: f.write(r.content) local_paths[extension].append(local_path) - return local_paths, dataset_formation, cache_dir + return local_paths, dataset_formation def fetch_single_csv_script(self, script_url: str): cookies = ModelScopeConfig.get_cookies() resp = self.session.get(script_url, cookies=cookies, headers=self.headers) if not resp or not resp.text: raise 'The meta-csv file cannot be empty when the meta-args `big_data` is true.' 
- text_list = resp.text.split('\n') + text_list = resp.text.strip().split('\n') text_headers = text_list[0] text_content = text_list[1:] @@ -599,19 +644,48 @@ def get_dataset_access_config_session( self, - cookies: CookieJar, dataset_name: str, namespace: str, + check_cookie: bool, revision: Optional[str] = DEFAULT_DATASET_REVISION): datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ f'ststoken?Revision={revision}' - + if check_cookie: + cookies = self._check_cookie(use_cookies=True) + else: + cookies = ModelScopeConfig.get_cookies() - r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers) + + r = self.session.get( + url=datahub_url, cookies=cookies, headers=self.headers) resp = r.json() raise_on_error(resp) return resp['Data'] + def get_dataset_access_config_for_unzipped(self, + dataset_name: str, + namespace: str, + revision: str, + zip_file_name: str): + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' + cookies = ModelScopeConfig.get_cookies() + r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers) + resp = r.json() + # get visibility of the dataset + raise_on_error(resp) + data = resp['Data'] + visibility = DatasetVisibilityMap.get(data['Visibility']) + + datahub_sts_url = f'{datahub_url}/ststoken?Revision={revision}' + r_sts = self.session.get(url=datahub_sts_url, cookies=cookies, headers=self.headers) + resp_sts = r_sts.json() + raise_on_error(resp_sts) + data_sts = resp_sts['Data'] + file_dir = visibility + '-unzipped' + '/' + namespace + '_' + dataset_name + '_' + zip_file_name + data_sts['Dir'] = file_dir + return data_sts + def list_oss_dataset_objects(self, dataset_name, namespace, max_limit, is_recursive, is_filter_dir, revision): url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?'
\ @@ -624,12 +698,6 @@ class HubApi: resp = resp['Data'] return resp - def on_dataset_download(self, dataset_name: str, namespace: str) -> None: - url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' - cookies = ModelScopeConfig.get_cookies() - r = self.session.post(url, cookies=cookies, headers=self.headers) - raise_for_http_status(r) - def delete_oss_dataset_object(self, object_name: str, dataset_name: str, namespace: str, revision: str) -> str: if not object_name or not dataset_name or not namespace or not revision: @@ -637,7 +705,7 @@ class HubApi: url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss?Path={object_name}&Revision={revision}' - cookies = self.check_local_cookies(use_cookies=True) + cookies = ModelScopeConfig.get_cookies() resp = self.session.delete(url=url, cookies=cookies) resp = resp.json() raise_on_error(resp) @@ -652,7 +720,7 @@ class HubApi: url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/prefix?Prefix={object_name}/' \ f'&Revision={revision}' - cookies = self.check_local_cookies(use_cookies=True) + cookies = ModelScopeConfig.get_cookies() resp = self.session.delete(url=url, cookies=cookies) resp = resp.json() raise_on_error(resp) @@ -661,32 +729,40 @@ class HubApi: def datahub_remote_call(self, url): cookies = ModelScopeConfig.get_cookies() - r = self.session.get(url, cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()}) + r = self.session.get( + url, + cookies=cookies, + headers={'user-agent': ModelScopeConfig.get_user_agent()}) resp = r.json() datahub_raise_on_error(url, resp) return resp['Data'] - def check_local_cookies(self, use_cookies) -> CookieJar: - return self._check_cookie(use_cookies=use_cookies) + def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool) -> None: + is_ci_test = os.getenv('CI_TEST') == 'True' + if dataset_name and namespace and not is_ci_test and not use_streaming: + try: + cookies = ModelScopeConfig.get_cookies() - def dataset_download_uv(self, dataset_name: str, namespace: str): - if not dataset_name or not namespace: - raise ValueError('dataset_name or namespace cannot be empty!') + # Download count + download_count_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' + download_count_resp = self.session.post(download_count_url, cookies=cookies, headers=self.headers) + raise_for_http_status(download_count_resp) - # get channel and user_name - channel = DownloadChannel.LOCAL.value - user_name = '' - if MODELSCOPE_CLOUD_ENVIRONMENT in os.environ: - channel = os.environ[MODELSCOPE_CLOUD_ENVIRONMENT] - if MODELSCOPE_CLOUD_USERNAME in os.environ: - user_name = os.environ[MODELSCOPE_CLOUD_USERNAME] + # Download uv + channel = DownloadChannel.LOCAL.value + user_name = '' + if MODELSCOPE_CLOUD_ENVIRONMENT in os.environ: + channel = os.environ[MODELSCOPE_CLOUD_ENVIRONMENT] + if MODELSCOPE_CLOUD_USERNAME in os.environ: + user_name = os.environ[MODELSCOPE_CLOUD_USERNAME] + download_uv_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/' \ + f'{channel}?user={user_name}' + download_uv_resp = self.session.post(download_uv_url, cookies=cookies, headers=self.headers) + download_uv_resp = download_uv_resp.json() + raise_on_error(download_uv_resp) - url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}' - cookies = ModelScopeConfig.get_cookies() - r = self.session.post(url, cookies=cookies, 
headers=self.headers) - resp = r.json() - raise_on_error(resp) - return resp['Message'] + except Exception as e: + logger.error(e) class ModelScopeConfig: @@ -763,7 +839,8 @@ class ModelScopeConfig: with open( os.path.join(ModelScopeConfig.path_credential, ModelScopeConfig.USER_INFO_FILE_NAME), - 'r', encoding='utf-8') as f: + 'r', + encoding='utf-8') as f: info = f.read() return info.split(':')[0], info.split(':')[1] except FileNotFoundError: @@ -784,7 +861,8 @@ class ModelScopeConfig: with open( os.path.join(ModelScopeConfig.path_credential, ModelScopeConfig.GIT_TOKEN_FILE_NAME), - 'r', encoding='utf-8') as f: + 'r', + encoding='utf-8') as f: token = f.read() except FileNotFoundError: pass diff --git a/modelscope/hub/check_model.py b/modelscope/hub/check_model.py new file mode 100644 index 00000000..f2d4a98f --- /dev/null +++ b/modelscope/hub/check_model.py @@ -0,0 +1,95 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +import pickle +from typing import Dict, Optional, Union +from urllib.parse import urlparse + +from modelscope.hub.api import HubApi, ModelScopeConfig +from modelscope.hub.constants import (FILE_HASH, MODEL_META_FILE_NAME, + MODEL_META_MODEL_ID) +from modelscope.hub.git import GitCommandWrapper +from modelscope.hub.utils.caching import FileSystemCache, ModelFileSystemCache +from modelscope.hub.utils.utils import compute_hash +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def check_local_model_is_latest( + model_root_path: str, + user_agent: Optional[Union[Dict, str]] = None, +): + """Check local model repo is latest. + Check local model repo is same as hub latest version. + """ + model_cache = None + # download with git + if os.path.exists(os.path.join(model_root_path, '.git')): + git_cmd_wrapper = GitCommandWrapper() + git_url = git_cmd_wrapper.get_repo_remote_url(model_root_path) + if git_url.endswith('.git'): + git_url = git_url[:-4] + u_parse = urlparse(git_url) + model_id = u_parse.path[1:] + else: # snapshot_download + model_cache = ModelFileSystemCache(model_root_path) + model_id = model_cache.get_model_id() + + try: + # make headers + headers = { + 'user-agent': + ModelScopeConfig.get_user_agent(user_agent=user_agent, ) + } + cookies = ModelScopeConfig.get_cookies() + + snapshot_header = headers if 'CI_TEST' in os.environ else { + **headers, + **{ + 'Snapshot': 'True' + } + } + _api = HubApi() + try: + _, revisions = _api.get_model_branches_and_tags( + model_id=model_id, use_cookies=cookies) + if len(revisions) > 0: + latest_revision = revisions[0] + else: + latest_revision = 'master' + except: # noqa: E722 + latest_revision = 'master' + + model_files = _api.get_model_files( + model_id=model_id, + revision=latest_revision, + recursive=True, + headers=snapshot_header, + use_cookies=cookies, + ) + for model_file in model_files: + if model_file['Type'] == 'tree': + continue + # check model_file updated + if model_cache is not None: + if model_cache.exists(model_file): + continue + else: + logger.info( + 'Model is updated from modelscope hub, you can verify from http://www.modelscope.cn.' + ) + break + else: + if FILE_HASH in model_file: + local_file_hash = compute_hash( + os.path.join(model_root_path, model_file['Path'])) + if local_file_hash == model_file[FILE_HASH]: + continue + else: + logger.info( + 'Model is updated from modelscope hub, you can verify from http://www.modelscope.cn.' 
+ ) + break + except: # noqa: E722 + pass # ignore diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 7f3cae0c..3cde867d 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -25,6 +25,8 @@ MODELSCOPE_CLOUD_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' MODELSCOPE_CLOUD_USERNAME = 'MODELSCOPE_USERNAME' MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 +MODEL_META_FILE_NAME = '.mdl' +MODEL_META_MODEL_ID = 'id' class Licenses(object): diff --git a/modelscope/hub/deploy.py b/modelscope/hub/deploy.py index 3b1c9cfc..565929ff 100644 --- a/modelscope/hub/deploy.py +++ b/modelscope/hub/deploy.py @@ -185,6 +185,8 @@ class DeleteServiceParameters(AttrsToQueryString): class ServiceDeployer(object): + """Faciliate model deployment on to supported service provider(s). + """ def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else get_endpoint() @@ -210,7 +212,6 @@ class ServiceDeployer(object): provider (ServiceProviderParameters): The service provider parameter Raises: - NotLoginException: To use this api, you need login first. NotSupportError: Not supported platform. RequestError: The server return error. @@ -248,10 +249,9 @@ class ServiceDeployer(object): Args: instance_name (str): The deployed instance name. provider (ServiceProviderParameters): The cloud provider information, for eas - need region(eg: ch-hangzhou), access_key_id and access_key_secret. + need region(eg: ch-hangzhou), access_key_id and access_key_secret. Raises: - NotLoginException: To use this api, you need login first. RequestError: The request is failed from server. Returns: @@ -279,10 +279,9 @@ class ServiceDeployer(object): Args: instance_name (str): The instance name you want to delete. provider (ServiceProviderParameters): The cloud provider information, for eas - need region(eg: ch-hangzhou), access_key_id and access_key_secret. + need region(eg: ch-hangzhou), access_key_id and access_key_secret. Raises: - NotLoginException: To call this api, you need login first. RequestError: The request is failed. Returns: @@ -305,17 +304,17 @@ class ServiceDeployer(object): def list(self, provider: ServiceProviderParameters, - skip: int = 0, - limit: int = 100): + skip: Optional[int] = 0, + limit: Optional[int] = 100): """List deployed model instances. Args: provider (ServiceProviderParameters): The cloud service provider parameter, - for eas, need access_key_id and access_key_secret. - skip: start of the list, current not support. - limit: maximum number of instances return, current not support + for eas, need access_key_id and access_key_secret. + skip (int, optional): start of the list, current not support. + limit (int, optional): maximum number of instances return, current not support + Raises: - NotLoginException: To use this api, you need login first. RequestError: The request is failed from server. Returns: diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index 4c4e5dbd..be94d7fd 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -49,10 +49,10 @@ def is_ok(rsp): """ Check the request is ok Args: - rsp (_type_): The request response body - Failed: {'Code': 10010101004, 'Message': 'get model info failed, err: unauthorized permission', - 'RequestId': '', 'Success': False} - Success: {'Code': 200, 'Data': {}, 'Message': 'success', 'RequestId': '', 'Success': True} + rsp (Response): The request response body + + Returns: + bool: `True` if success otherwise `False`. 
""" return rsp['Code'] == HTTPStatus.OK and rsp['Success'] @@ -84,6 +84,12 @@ def raise_on_error(rsp): Args: rsp (_type_): The server response + + Raises: + RequestError: the response error message. + + Returns: + bool: True if request is OK, otherwise raise `RequestError` exception. """ if rsp['Code'] == HTTPStatus.OK: return True @@ -91,26 +97,37 @@ def raise_on_error(rsp): raise RequestError(rsp['Message']) -# TODO use raise_on_error instead if modelhub and datahub response have uniform structures, def datahub_raise_on_error(url, rsp): """If response error, raise exception Args: - rsp (_type_): The server response + url (str): The request url + rsp (HTTPResponse): The server response. + + Raises: + RequestError: the http request error. + + Returns: + bool: `True` if request is OK, otherwise raise `RequestError` exception. """ if rsp.get('Code') == HTTPStatus.OK: return True else: raise RequestError( - f"Url = {url}, Status = {rsp.get('status')}, error = {rsp.get('error')}, message = {rsp.get('message')}" + f"Url = {url}, Message = {rsp.get('Message')}, Please specify correct dataset_name and namespace." ) def raise_for_http_status(rsp): - """ - Attempt to decode utf-8 first since some servers + """Attempt to decode utf-8 first since some servers localize reason strings, for invalid utf-8, fall back to decoding with iso-8859-1. + + Args: + rsp: The http response. + + Raises: + HTTPError: The http error info. """ http_error_msg = '' if isinstance(rsp.reason, bytes): diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index 2b929bef..77f38fe9 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -36,47 +36,40 @@ def model_file_download( local_files_only: Optional[bool] = False, cookies: Optional[CookieJar] = None, ) -> Optional[str]: # pragma: no cover - """ - Download from a given URL and cache it if it's not already present in the - local cache. + """Download from a given URL and cache it if it's not already present in the local cache. Given a URL, this function looks for the corresponding file in the local cache. If it's not there, download it. Then return the path to the cached file. Args: - model_id (`str`): - The model to whom the file to be downloaded belongs. - file_path(`str`): - Path of the file to be downloaded, relative to the root of model repo - revision(`str`, *optional*): - revision of the model file to be downloaded. - Can be any of a branch, tag or commit hash - cache_dir (`str`, `Path`, *optional*): - Path to the folder where cached files are stored. - user_agent (`dict`, `str`, *optional*): - The user-agent info in the form of a dictionary or a string. - local_files_only (`bool`, *optional*, defaults to `False`): - If `True`, avoid downloading the file and return the path to the - local cached file if it exists. - if `False`, download the file anyway even it exists + model_id (str): The model to whom the file to be downloaded belongs. + file_path(str): Path of the file to be downloaded, relative to the root of model repo. + revision(str, optional): revision of the model file to be downloaded. + Can be any of a branch, tag or commit hash. + cache_dir (str, Path, optional): Path to the folder where cached files are stored. + user_agent (dict, str, optional): The user-agent info in the form of a dictionary or a string. + local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the + local cached file if it exists. if `False`, download the file anyway even it exists. 
+ cookies (CookieJar, optional): The cookie of download request. Returns: - Local path (string) of file or if networking is off, last version of + string: string of local file or if networking is off, last version of file cached on disk. - + Raises: + NotExistError: The file is not exist. + ValueError: The request parameter error. - Raises the following errors: + Note: + Raises the following errors: - - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - if `use_auth_token=True` and the token cannot be found. - - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) - if ETag cannot be determined. - - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) - if some parameter value is invalid - - + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) + if ETag cannot be determined. + - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid """ if cache_dir is None: cache_dir = get_cache_dir() @@ -165,10 +158,17 @@ def model_file_download( def get_file_download_url(model_id: str, file_path: str, revision: str): - """ - Format file download url according to `model_id`, `revision` and `file_path`. + """Format file download url according to `model_id`, `revision` and `file_path`. e.g., Given `model_id=john/bert`, `revision=master`, `file_path=README.md`, - the resulted download url is: https://modelscope.cn/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md + the resulted download url is: https://modelscope.co/api/v1/models/john/bert/repo?Revision=master&FilePath=README.md + + Args: + model_id (str): The model_id. + file_path (str): File path + revision (str): File revision. + + Returns: + str: The file url. """ download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}' return download_url_template.format( @@ -186,20 +186,23 @@ def http_get_file( cookies: CookieJar, headers: Optional[Dict[str, str]] = None, ): - """ - Download remote file, will retry 5 times before giving up on errors. + """Download remote file, will retry 5 times before giving up on errors. + Args: - url(`str`): + url(str): actual download url of the file - local_dir(`str`): + local_dir(str): local directory where the downloaded file stores - file_name(`str`): + file_name(str): name of the file stored in `local_dir` - cookies(`CookieJar`): + cookies(CookieJar): cookies used to authentication the user, which is used for downloading private repos - headers(`Optional[Dict[str, str]] = None`): + headers(Dict[str, str], optional): http headers to carry necessary info when requesting the remote file + Raises: + FileDownloadError: Failed download failed. + """ total = -1 temp_file_manager = partial( diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 51474504..80887738 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -2,7 +2,7 @@ import os import subprocess -from typing import List +from typing import List, Optional from modelscope.utils.logger import get_logger from ..utils.constant import MASTER_MODEL_BRANCH @@ -33,6 +33,9 @@ class GitCommandWrapper(metaclass=Singleton): """Run git command, if command return 0, return subprocess.response otherwise raise GitError, message is stdout and stderr. + Args: + args: List of command args. 
+ Raises: GitError: Exception with stdout and stderr. @@ -106,7 +109,7 @@ class GitCommandWrapper(metaclass=Singleton): token: str, url: str, repo_name: str, - branch: str = None): + branch: Optional[str] = None): """ git clone command wrapper. For public project, token can None, private repo, there must token. @@ -116,6 +119,9 @@ class GitCommandWrapper(metaclass=Singleton): url (str): The remote url repo_name (str): The local repository path name. branch (str, optional): _description_. Defaults to None. + + Returns: + The popen response. """ url = self._add_token(token, url) if branch: @@ -162,7 +168,11 @@ class GitCommandWrapper(metaclass=Singleton): """Run git commit command Args: + repo_dir (str): the repository directory. message (str): commit message. + + Returns: + The command popen response. """ commit_args = ['-C', '%s' % repo_dir, 'commit', '-m', "'%s'" % message] rsp = self._run_git_command(*commit_args) diff --git a/modelscope/hub/repository.py b/modelscope/hub/repository.py index aa4057c7..1d107a3c 100644 --- a/modelscope/hub/repository.py +++ b/modelscope/hub/repository.py @@ -24,20 +24,20 @@ class Repository: revision: Optional[str] = DEFAULT_REPOSITORY_REVISION, auth_token: Optional[str] = None, git_path: Optional[str] = None): - """ - Instantiate a Repository object by cloning the remote ModelScopeHub repo + """Instantiate a Repository object by cloning the remote ModelScopeHub repo + Args: - model_dir(`str`): - The model root directory. - clone_from: - model id in ModelScope-hub from which git clone - revision(`Optional[str]`): - revision of the model you want to clone from. Can be any of a branch, tag or commit hash - auth_token(`Optional[str]`): - token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter - as the token is already saved when you login the first time, if None, we will use saved token. - git_path:(`Optional[str]`): - The git command line path, if None, we use 'git' + model_dir (str): The model root directory. + clone_from (str): model id in ModelScope-hub from which git clone + revision (str, optional): revision of the model you want to clone from. + Can be any of a branch, tag or commit hash + auth_token (str, optional): token obtained when calling `HubApi.login()`. + Usually you can safely ignore the parameter as the token is already + saved when you login the first time, if None, we will use saved token. + git_path (str, optional): The git command line path, if None, we use 'git' + + Raises: + InvalidParameter: revision is None. """ self.model_dir = model_dir self.model_base_dir = os.path.dirname(model_dir) @@ -92,16 +92,19 @@ class Repository: commit_message: str, local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION, remote_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION, - force: bool = False): + force: Optional[bool] = False): """Push local files to remote, this method will do. - git pull - git add - git commit - git push + Execute git pull, git add, git commit, git push in order. + Args: commit_message (str): commit message - branch (Optional[str], optional): which branch to push. - force (Optional[bool]): whether to use forced-push. + local_branch(str, optional): The local branch, default master. + remote_branch (str, optional): The remote branch to push, default master. + force (bool, optional): whether to use forced-push. + + Raises: + InvalidParameter: no commit message. + NotLoginException: no auth token. 
""" if commit_message is None or not isinstance(commit_message, str): msg = 'commit_message must be provided!' @@ -128,12 +131,19 @@ class Repository: local_branch=local_branch, remote_branch=remote_branch) - def tag(self, tag_name: str, message: str, ref: str = MASTER_MODEL_BRANCH): + def tag(self, + tag_name: str, + message: str, + ref: Optional[str] = MASTER_MODEL_BRANCH): """Create a new tag. + Args: tag_name (str): The name of the tag message (str): The tag message. - ref (str): The tag reference, can be commit id or branch. + ref (str, optional): The tag reference, can be commit id or branch. + + Raises: + InvalidParameter: no commit message. """ if tag_name is None or tag_name == '': msg = 'We use tag-based revision, therefore tag_name cannot be None or empty.' @@ -150,7 +160,7 @@ class Repository: def tag_and_push(self, tag_name: str, message: str, - ref: str = MASTER_MODEL_BRANCH): + ref: Optional[str] = MASTER_MODEL_BRANCH): """Create tag and push to remote Args: @@ -175,18 +185,19 @@ class DatasetRepository: git_path: Optional[str] = None): """ Instantiate a Dataset Repository object by cloning the remote ModelScope dataset repo + Args: - repo_work_dir(`str`): - The dataset repo root directory. - dataset_id: - dataset id in ModelScope from which git clone - revision(`Optional[str]`): - revision of the dataset you want to clone from. Can be any of a branch, tag or commit hash - auth_token(`Optional[str]`): - token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter - as the token is already saved when you login the first time, if None, we will use saved token. - git_path:(`Optional[str]`): - The git command line path, if None, we use 'git' + repo_work_dir (str): The dataset repo root directory. + dataset_id (str): dataset id in ModelScope from which git clone + revision (str, optional): revision of the dataset you want to clone from. + Can be any of a branch, tag or commit hash + auth_token (str, optional): token obtained when calling `HubApi.login()`. + Usually you can safely ignore the parameter as the token is + already saved when you login the first time, if None, we will use saved token. + git_path (str, optional): The git command line path, if None, we use 'git' + + Raises: + InvalidParameter: parameter invalid. """ self.dataset_id = dataset_id if not repo_work_dir or not isinstance(repo_work_dir, str): @@ -230,16 +241,21 @@ class DatasetRepository: def push(self, commit_message: str, branch: Optional[str] = DEFAULT_DATASET_REVISION, - force: bool = False): + force: Optional[bool] = False): """Push local files to remote, this method will do. git pull git add git commit git push + Args: commit_message (str): commit message - branch (Optional[str], optional): which branch to push. - force (Optional[bool]): whether to use forced-push. + branch (str, optional): which branch to push. + force (bool, optional): whether to use forced-push. + + Raises: + InvalidParameter: no commit message. + NotLoginException: no access token. """ if commit_message is None or not isinstance(commit_message, str): msg = 'commit_message must be provided!' diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index 3a615c1a..67492649 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -1,10 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os +import re import tempfile from http.cookiejar import CookieJar from pathlib import Path -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.utils.constant import DEFAULT_MODEL_REVISION @@ -23,7 +24,8 @@ def snapshot_download(model_id: str, cache_dir: Union[str, Path, None] = None, user_agent: Optional[Union[Dict, str]] = None, local_files_only: Optional[bool] = False, - cookies: Optional[CookieJar] = None) -> str: + cookies: Optional[CookieJar] = None, + ignore_file_pattern: List = None) -> str: """Download all files of a repo. Downloads a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from a repo, because you don't know which @@ -32,31 +34,32 @@ def snapshot_download(model_id: str, An alternative would be to just clone a repo but this would require that the user always has git and git-lfs installed, and properly configured. - Args: - model_id (`str`): - A user or an organization name and a repo name separated by a `/`. - revision (`str`, *optional*): - An optional Git revision id which can be a branch name, a tag, or a - commit hash. NOTE: currently only branch and tag name is supported - cache_dir (`str`, `Path`, *optional*): - Path to the folder where cached files are stored. - user_agent (`str`, `dict`, *optional*): - The user-agent info in the form of a dictionary or a string. - local_files_only (`bool`, *optional*, defaults to `False`): - If `True`, avoid downloading the file and return the path to the - local cached file if it exists. - Returns: - Local folder path (string) of repo snapshot - - Raises the following errors: - - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) - if `use_auth_token=True` and the token cannot be found. - - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if - ETag cannot be determined. - - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) - if some parameter value is invalid - + Args: + model_id (str): A user or an organization name and a repo name separated by a `/`. + revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a + commit hash. NOTE: currently only branch and tag name is supported + cache_dir (str, Path, optional): Path to the folder where cached files are stored. + user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string. + local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the + local cached file if it exists. + cookies (CookieJar, optional): The cookie of the request, default None. + ignore_file_pattern (`str` or `List`, *optional*, default to `None`): + Any file pattern to be ignored in downloading, like exact file names or file extensions. + Raises: + ValueError: the value details. + + Returns: + str: Local folder path (string) of repo snapshot + + Note: + Raises the following errors: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. 
+ - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid """ if cache_dir is None: @@ -105,10 +108,16 @@ def snapshot_download(model_id: str, headers=snapshot_header, ) + if ignore_file_pattern is None: + ignore_file_pattern = [] + if isinstance(ignore_file_pattern, str): + ignore_file_pattern = [ignore_file_pattern] + with tempfile.TemporaryDirectory( dir=temporary_cache_dir) as temp_cache_dir: for model_file in model_files: - if model_file['Type'] == 'tree': + if model_file['Type'] == 'tree' or \ + any([re.search(pattern, model_file['Name']) is not None for pattern in ignore_file_pattern]): continue # check model_file is exist in cache, if existed, skip download, otherwise download if cache.exists(model_file): diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py index 1acd2e84..f92aaaf4 100644 --- a/modelscope/hub/utils/caching.py +++ b/modelscope/hub/utils/caching.py @@ -6,9 +6,12 @@ import pickle import tempfile from shutil import move, rmtree +from modelscope.hub.constants import MODEL_META_FILE_NAME, MODEL_META_MODEL_ID from modelscope.utils.logger import get_logger logger = get_logger() +"""Implements caching functionality, used internally only +""" class FileSystemCache(object): @@ -21,11 +24,11 @@ class FileSystemCache(object): cache_root_location: str, **kwargs, ): - """ - Parameters - ---------- - cache_location: str - The root location to store files. + """Base file system cache interface. + + Args: + cache_root_location (str): The root location to store files. + kwargs(dict): The keyword arguments. """ os.makedirs(cache_root_location, exist_ok=True) self.cache_root_location = cache_root_location @@ -35,19 +38,6 @@ class FileSystemCache(object): return self.cache_root_location def load_cache(self): - """Read set of stored blocks from file - Args: - owner(`str`): individual or group username at modelscope, can be empty for official models - name(`str`): name of the model - Returns: - The model details information. - Raises: - NotExistError: If the model is not exist, will throw NotExistError - TODO: Error based error code. - - model_id = {owner}/{name} - - """ self.cached_files = [] cache_keys_file_path = os.path.join(self.cache_root_location, FileSystemCache.KEY_FILE_NAME) @@ -68,30 +58,24 @@ class FileSystemCache(object): def get_file(self, key): """Check the key is in the cache, if exist, return the file, otherwise return None. + Args: - key(`str`): The cache key. - Returns: - If file exist, return the cached file location, otherwise None. + key(str): The cache key. + Raises: None - - model_id = {owner}/{name} - """ pass def put_file(self, key, location): - """Put file to the cache, + """Put file to the cache. + Args: - key(`str`): The cache key - location(`str`): Location of the file, we will move the file to cache. - Returns: - The cached file path of the file. + key (str): The cache key + location (str): Location of the file, we will move the file to cache. + Raises: None - - model_id = {owner}/{name} - """ pass @@ -113,8 +97,7 @@ class FileSystemCache(object): return False def clear_cache(self): - """Remove all files and metadat from the cache - + """Remove all files and metadata from the cache In the case of multiple cache locations, this clears only the last one, which is assumed to be the read/write one. 
""" @@ -127,19 +110,16 @@ class FileSystemCache(object): class ModelFileSystemCache(FileSystemCache): """Local cache file layout - cache_root/owner/model_name/|individual cached files - |.mk: file, The cache index file + cache_root/owner/model_name/individual cached files and cache index file '.mcs' Save only one version for each file. """ - def __init__(self, cache_root, owner, name): + def __init__(self, cache_root, owner=None, name=None): """Put file to the cache Args: - cache_root(`str`): The modelscope local cache root(default: ~/.modelscope/cache/models/) + cache_root(`str`): The modelscope local cache root(default: ~/.cache/modelscope/) owner(`str`): The model owner. name('str'): The name of the model - branch('str'): The branch of model - tag('str'): The tag of model Returns: Raises: None @@ -147,12 +127,39 @@ class ModelFileSystemCache(FileSystemCache): model_id = {owner}/{name} """ - super().__init__(os.path.join(cache_root, owner, name)) + if owner is None or name is None: + # get model meta from + super().__init__(os.path.join(cache_root)) + self.load_model_meta() + else: + super().__init__(os.path.join(cache_root, owner, name)) + self.model_meta = {MODEL_META_MODEL_ID: '%s/%s' % (owner, name)} + self.save_model_meta() + + def load_model_meta(self): + meta_file_path = os.path.join(self.cache_root_location, + MODEL_META_FILE_NAME) + if os.path.exists(meta_file_path): + with open(meta_file_path, 'rb') as f: + self.model_meta = pickle.load(f) + else: + self.model_meta = {MODEL_META_MODEL_ID: 'unknown'} + + def get_model_id(self): + return self.model_meta[MODEL_META_MODEL_ID] + + def save_model_meta(self): + meta_file_path = os.path.join(self.cache_root_location, + MODEL_META_FILE_NAME) + with open(meta_file_path, 'wb') as f: + pickle.dump(self.model_meta, f) def get_file_by_path(self, file_path): """Retrieve the cache if there is file match the path. + Args: file_path (str): The file path in the model. + Returns: path: the full path of the file. """ @@ -169,9 +176,11 @@ class ModelFileSystemCache(FileSystemCache): def get_file_by_path_and_commit_id(self, file_path, commit_id): """Retrieve the cache if there is file match the path. + Args: file_path (str): The file path in the model. commit_id (str): The commit id of the file + Returns: path: the full path of the file. """ @@ -194,7 +203,7 @@ class ModelFileSystemCache(FileSystemCache): model_file_info (ModelFileInfo): The file information of the file. Returns: - _type_: _description_ + str: The file path. """ cache_key = self.__get_cache_key(model_file_info) for cached_file in self.cached_files: @@ -240,7 +249,7 @@ class ModelFileSystemCache(FileSystemCache): return True else: self.remove_key( - model_file_info) # sameone may manual delete the file + model_file_info) # someone may manual delete the file return False def remove_if_exists(self, model_file_info): @@ -262,23 +271,8 @@ class ModelFileSystemCache(FileSystemCache): """Put model on model_file_location to cache, the model first download to /tmp, and move to cache. Args: - model_file_info (str): The file description returned by get_model_files - sample: - { - "CommitMessage": "add model\n", - "CommittedDate": 1654857567, - "CommitterName": "mulin.lyh", - "IsLFS": false, - "Mode": "100644", - "Name": "resnet18.pth", - "Path": "resnet18.pth", - "Revision": "09b68012b27de0048ba74003690a890af7aff192", - "Size": 46827520, - "Type": "blob" - } + model_file_info (str): The file description returned by get_model_files. model_file_location (str): The location of the temporary file. 
- Raises: - NotImplementedError: _description_ Returns: str: The location of the cached file. diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 3cc2c1e6..31e6e72c 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -29,9 +29,14 @@ def model_id_to_group_owner_name(model_id): def get_cache_dir(model_id: Optional[str] = None): - """ - cache dir precedence: - function parameter > enviroment > ~/.cache/modelscope/hub + """cache dir precedence: + function parameter > environment > ~/.cache/modelscope/hub + + Args: + model_id (str, optional): The model id. + + Returns: + str: the model_id dir if model_id not None, otherwise cache root dir. """ default_cache_dir = get_default_cache_dir() base_path = os.getenv('MODELSCOPE_CACHE', diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 45297fd7..beb76d33 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -19,9 +19,11 @@ class Models(object): realtime_video_object_detection = 'realtime-video-object-detection' scrfd = 'scrfd' classification_model = 'ClassificationModel' + bnext = 'bnext' nafnet = 'nafnet' csrnet = 'csrnet' cascade_mask_rcnn_swin = 'cascade_mask_rcnn_swin' + maskdino_swin = 'maskdino_swin' gpen = 'gpen' product_retrieval_embedding = 'product-retrieval-embedding' body_2d_keypoints = 'body-2d-keypoints' @@ -29,6 +31,7 @@ class Models(object): crowd_counting = 'HRNetCrowdCounting' face_2d_keypoints = 'face-2d-keypoints' panoptic_segmentation = 'swinL-panoptic-segmentation' + r50_panoptic_segmentation = 'r50-panoptic-segmentation' image_reid_person = 'passvitb' image_inpainting = 'FFTInpainting' video_summarization = 'pgl-video-summarization' @@ -37,6 +40,9 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' newcrfs_depth_estimation = 'newcrfs-depth-estimation' + panovit_layout_estimation = 'panovit-layout-estimation' + unifuse_depth_estimation = 'unifuse-depth-estimation' + dro_resnet18_depth_estimation = 'dro-resnet18-depth-estimation' resnet50_bert = 'resnet50-bert' referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' fer = 'fer' @@ -46,6 +52,8 @@ class Models(object): mogface = 'mogface' mtcnn = 'mtcnn' ulfd = 'ulfd' + rts = 'rts' + flir = 'flir' arcface = 'arcface' facemask = 'facemask' flc = 'flc' @@ -59,7 +67,18 @@ class Models(object): image_body_reshaping = 'image-body-reshaping' image_skychange = 'image-skychange' video_human_matting = 'video-human-matting' + video_frame_interpolation = 'video-frame-interpolation' video_object_segmentation = 'video-object-segmentation' + quadtree_attention_image_matching = 'quadtree-attention-image-matching' + vision_middleware = 'vision-middleware' + video_stabilization = 'video-stabilization' + real_basicvsr = 'real-basicvsr' + rcp_sceneflow_estimation = 'rcp-sceneflow-estimation' + image_casmvs_depth_estimation = 'image-casmvs-depth-estimation' + vop_retrieval_model = 'vop-retrieval-model' + ddcolor = 'ddcolor' + defrcn = 'defrcn' + image_face_fusion = 'image-face-fusion' # EasyCV models yolox = 'YOLOX' @@ -87,6 +106,7 @@ class Models(object): lcrf_wseg = 'lstm-crf-for-word-segmentation' gcnncrf = 'gcnn-crf' bart = 'bart' + gpt2 = 'gpt2' gpt3 = 'gpt3' gpt_moe = 'gpt-moe' gpt_neo = 'gpt-neo' @@ -99,14 +119,21 @@ class Models(object): codegeex = 'codegeex' bloom = 'bloom' unite = 'unite' + megatron_bert = 'megatron-bert' + use = 'user-satisfaction-estimation' # audio models 
sambert_hifigan = 'sambert-hifigan' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' + speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield' + speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k' kws_kwsbp = 'kws-kwsbp' generic_asr = 'generic-asr' wenet_asr = 'wenet-asr' + generic_itn = 'generic-itn' + generic_punc = 'generic-punc' + generic_sv = 'generic-sv' # multi-modal models ofa = 'ofa' @@ -117,6 +144,9 @@ class Models(object): multi_stage_diffusion = 'multi-stage-diffusion-text-to-image-synthesis' team = 'team-multi-modal-similarity' video_clip = 'video-clip-multi-modal-embedding' + mgeo = 'mgeo' + vldoc = 'vldoc' + hitea = 'hitea' # science models unifold = 'unifold' @@ -163,6 +193,7 @@ class Pipelines(object): # vision tasks portrait_matting = 'unet-image-matting' image_denoise = 'nafnet-image-denoise' + image_deblur = 'nafnet-image-deblur' person_image_cartoon = 'unet-person-image-cartoon' ocr_detection = 'resnet18-ocr-detection' table_recognition = 'dla34-table-recognition' @@ -185,6 +216,8 @@ class Pipelines(object): camouflaged_detection = 'res2net-camouflaged-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + face_liveness_ir = 'manual-face-liveness-flir' + face_liveness_rgb = 'manual-face-liveness-flir' card_detection = 'resnet-card-detection-scrfd34gkps' ulfd_face_detection = 'manual-face-detection-ulfd' tinymog_face_detection = 'manual-face-detection-tinymog' @@ -198,6 +231,9 @@ class Pipelines(object): general_image_classification = 'vit-base_image-classification_ImageNet-labels' daily_image_classification = 'vit-base_image-classification_Dailylife-labels' nextvit_small_daily_image_classification = 'nextvit-small_image-classification_Dailylife-labels' + convnext_base_image_classification_garbage = 'convnext-base_image-classification_garbage' + bnext_small_image_classification = 'bnext-small_image-classification_ImageNet-labels' + common_image_classification = 'common-image-classification' image_color_enhance = 'csrnet-image-color-enhance' virtual_try_on = 'virtual-try-on' image_colorization = 'unet-image-colorization' @@ -208,9 +244,11 @@ class Pipelines(object): realtime_object_detection = 'cspnet_realtime-object-detection_yolox' realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo' face_recognition = 'ir101-face-recognition-cfglint' + face_recognition_ood = 'ir-face-recognition-ood-rts' arc_face_recognition = 'ir50-face-recognition-arcface' mask_face_recognition = 'resnet-face-recognition-facemask' image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' + maskdino_instance_segmentation = 'maskdino-swin-image-instance-segmentation' image2image_translation = 'image-to-image-translation' live_category = 'live-category' video_category = 'video-category' @@ -225,11 +263,16 @@ class Pipelines(object): crowd_counting = 'hrnet-crowd-counting' action_detection = 'ResNetC3D-action-detection' video_single_object_tracking = 'ostrack-vitb-video-single-object-tracking' + video_multi_object_tracking = 'video-multi-object-tracking' image_panoptic_segmentation = 'image-panoptic-segmentation' + image_panoptic_segmentation_easycv = 'image-panoptic-segmentation-easycv' video_summarization = 'googlenet_pgl_video_summarization' language_guided_video_summarization = 'clip-it-video-summarization' image_semantic_segmentation = 
'image-semantic-segmentation' image_depth_estimation = 'image-depth-estimation' + indoor_layout_estimation = 'indoor-layout-estimation' + video_depth_estimation = 'video-depth-estimation' + panorama_depth_estimation = 'panorama-depth-estimation' image_reid_person = 'passvitb-image-reid-person' image_inpainting = 'fft-inpainting' text_driven_segmentation = 'text-driven-segmentation' @@ -246,7 +289,18 @@ class Pipelines(object): referring_video_object_segmentation = 'referring-video-object-segmentation' image_skychange = 'image-skychange' video_human_matting = 'video-human-matting' + vision_middleware_multi_task = 'vision-middleware-multi-task' + video_frame_interpolation = 'video-frame-interpolation' video_object_segmentation = 'video-object-segmentation' + image_matching = 'image-matching' + video_stabilization = 'video-stabilization' + video_super_resolution = 'realbasicvsr-video-super-resolution' + pointcloud_sceneflow_estimation = 'pointcloud-sceneflow-estimation' + image_multi_view_depth_estimation = 'image-multi-view-depth-estimation' + vop_retrieval = 'vop-video-text-retrieval' + ddcolor_image_colorization = 'ddcolor-image-colorization' + image_fewshot_detection = 'image-fewshot-detection' + image_face_fusion = 'image-face-fusion' # nlp tasks automatic_post_editing = 'automatic-post-editing' @@ -282,6 +336,7 @@ class Pipelines(object): table_question_answering_pipeline = 'table-question-answering-pipeline' sentence_embedding = 'sentence-embedding' text_ranking = 'text-ranking' + mgeo_ranking = 'mgeo-ranking' relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' extractive_summarization = 'extractive-summarization' @@ -294,15 +349,20 @@ class Pipelines(object): translation_en_to_fr = 'translation_en_to_fr' # keep it underscore token_classification = 'token-classification' translation_evaluation = 'translation-evaluation' + user_satisfaction_estimation = 'user-satisfaction-estimation' # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' speech_dfsmn_aec_psm_16k = 'speech-dfsmn-aec-psm-16k' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' + speech_separation = 'speech-separation' kws_kwsbp = 'kws-kwsbp' asr_inference = 'asr-inference' asr_wenet_inference = 'asr-wenet-inference' + itn_inference = 'itn-inference' + punc_inference = 'punc-inference' + sv_inference = 'sv-inference' # multi-modal tasks image_captioning = 'image-captioning' @@ -317,6 +377,13 @@ class Pipelines(object): image_text_retrieval = 'image-text-retrieval' ofa_ocr_recognition = 'ofa-ocr-recognition' ofa_asr = 'ofa-asr' + ofa_sudoku = 'ofa-sudoku' + ofa_text2sql = 'ofa-text2sql' + video_captioning = 'video-captioning' + video_question_answering = 'video-question-answering' + diffusers_stable_diffusion = 'diffusers-stable-diffusion' + document_vl_embedding = 'document-vl-embedding' + chinese_stable_diffusion = 'chinese-stable-diffusion' # science tasks protein_structure = 'unifold-protein-structure' @@ -334,11 +401,13 @@ class Trainers(object): default = 'trainer' easycv = 'easycv' + tinynas_damoyolo = 'tinynas-damoyolo' # multi-modal trainers clip_multi_modal_embedding = 'clip-multi-modal-embedding' ofa = 'ofa' mplug = 'mplug' + mgeo_ranking_trainer = 'mgeo-ranking-trainer' # cv trainers image_instance_segmentation = 'image-instance-segmentation' @@ -351,6 +420,7 @@ class Trainers(object): referring_video_object_segmentation = 'referring-video-object-segmentation' image_classification_team = 
'image-classification-team' image_classification = 'image-classification' + image_fewshot_detection = 'image-fewshot-detection' # nlp trainers bert_sentiment_analysis = 'bert-sentiment-analysis' @@ -362,12 +432,17 @@ class Trainers(object): text_generation_trainer = 'text-generation-trainer' nlp_plug_trainer = 'nlp-plug-trainer' gpt3_trainer = 'nlp-gpt3-trainer' + faq_question_answering_trainer = 'faq-question-answering-trainer' gpt_moe_trainer = 'nlp-gpt-moe-trainer' + table_question_answering_trainer = 'table-question-answering-trainer' # audio trainers speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' + speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield' speech_kantts_trainer = 'speech-kantts-trainer' + speech_asr_trainer = 'speech-asr-trainer' + speech_separation = 'speech-separation' class Preprocessors(object): @@ -383,7 +458,10 @@ class Preprocessors(object): # cv preprocessor load_image = 'load-image' - image_denoie_preprocessor = 'image-denoise-preprocessor' + image_denoise_preprocessor = 'image-denoise-preprocessor' + image_deblur_preprocessor = 'image-deblur-preprocessor' + object_detection_tinynas_preprocessor = 'object-detection-tinynas-preprocessor' + image_classification_mmcv_preprocessor = 'image-classification-mmcv-preprocessor' image_color_enhance_preprocessor = 'image-color-enhance-preprocessor' image_instance_segmentation_preprocessor = 'image-instance-segmentation-preprocessor' image_portrait_enhancement_preprocessor = 'image-portrait-enhancement-preprocessor' @@ -429,6 +507,7 @@ class Preprocessors(object): mglm_summarization = 'mglm-summarization' sentence_piece = 'sentence-piece' translation_evaluation = 'translation-evaluation-preprocessor' + dialog_use_preprocessor = 'dialog-use-preprocessor' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' @@ -441,6 +520,9 @@ class Preprocessors(object): ofa_tasks_preprocessor = 'ofa-tasks-preprocessor' clip_preprocessor = 'clip-preprocessor' mplug_tasks_preprocessor = 'mplug-tasks-preprocessor' + mgeo_ranking = 'mgeo-ranking' + vldoc_preprocessor = 'vldoc-preprocessor' + hitea_tasks_preprocessor = 'hitea-tasks-preprocessor' # science preprocessor unifold_preprocessor = 'unifold-preprocessor' @@ -452,14 +534,20 @@ class Metrics(object): # accuracy accuracy = 'accuracy' + multi_average_precision = 'mAP' audio_noise_metric = 'audio-noise-metric' + PPL = 'ppl' # text gen BLEU = 'bleu' # metrics for image denoise task image_denoise_metric = 'image-denoise-metric' + # metrics for video frame-interpolation task + video_frame_interpolation_metric = 'video-frame-interpolation-metric' + # metrics for real-world video super-resolution task + video_super_resolution_metric = 'video-super-resolution-metric' # metric for image instance segmentation task image_ins_seg_coco_metric = 'image-ins-seg-coco-metric' @@ -484,6 +572,8 @@ class Metrics(object): inbatch_recall = 'inbatch_recall' # metric for referring-video-object-segmentation task referring_video_object_segmentation_metric = 'referring-video-object-segmentation-metric' + # metric for video stabilization task + video_stabilization_metric = 'video-stabilization-metric' class Optimizers(object): @@ -555,4 +645,5 @@ class Datasets(object): SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' + PanopticDataset = 'PanopticDataset' PairedDataset = 'PairedDataset' diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index 
f106f054..f814cf4d 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -21,6 +21,10 @@ if TYPE_CHECKING: from .bleu_metric import BleuMetric from .image_inpainting_metric import ImageInpaintingMetric from .referring_video_object_segmentation_metric import ReferringVideoObjectSegmentationMetric + from .video_frame_interpolation_metric import VideoFrameInterpolationMetric + from .video_stabilization_metric import VideoStabilizationMetric + from .video_super_resolution_metric.video_super_resolution_metric import VideoSuperResolutionMetric + from .ppl_metric import PplMetric else: _import_structure = { @@ -43,6 +47,9 @@ else: 'bleu_metric': ['BleuMetric'], 'referring_video_object_segmentation_metric': ['ReferringVideoObjectSegmentationMetric'], + 'video_frame_interpolation_metric': ['VideoFrameInterpolationMetric'], + 'video_stabilization_metric': ['VideoStabilizationMetric'], + 'ppl_metric': ['PplMetric'], } import sys diff --git a/modelscope/metrics/accuracy_metric.py b/modelscope/metrics/accuracy_metric.py index fe040177..b1976d8e 100644 --- a/modelscope/metrics/accuracy_metric.py +++ b/modelscope/metrics/accuracy_metric.py @@ -40,6 +40,9 @@ class AccuracyMetric(Metric): self.labels.append(truth) for result in eval_results: if isinstance(truth, str): + if isinstance(result, list): + result = result[0] + assert isinstance(result, str), 'both truth and pred are str' self.preds.append(remove_space_between_chinese_chars(result)) else: self.preds.append(result) @@ -51,3 +54,14 @@ class AccuracyMetric(Metric): pred == ref for pred, ref in zip(self.preds, self.labels) ])).mean().item() } + + def merge(self, other: 'AccuracyMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return self.preds, self.labels + + def __setstate__(self, state): + self.__init__() + self.preds, self.labels = state diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py index 8555e95b..4aa5f0d3 100644 --- a/modelscope/metrics/audio_noise_metric.py +++ b/modelscope/metrics/audio_noise_metric.py @@ -40,3 +40,15 @@ class AudioNoiseMetric(Metric): 'avg_sisnr': -avg_sisnr.item(), MetricKeys.AVERAGE_LOSS: avg_loss.item() } + + def merge(self, other: 'AudioNoiseMetric'): + self.loss.extend(other.loss) + self.amp_loss.extend(other.amp_loss) + self.phase_loss.extend(other.phase_loss) + self.sisnr.extend(other.sisnr) + + def __getstate__(self): + return self.loss, self.amp_loss, self.phase_loss, self.sisnr + + def __setstate__(self, state): + self.loss, self.amp_loss, self.phase_loss, self.sisnr = state diff --git a/modelscope/metrics/base.py b/modelscope/metrics/base.py index 955946b5..ac3be810 100644 --- a/modelscope/metrics/base.py +++ b/modelscope/metrics/base.py @@ -38,3 +38,19 @@ class Metric(ABC): """ pass + + @abstractmethod + def merge(self, other: 'Metric'): + """ When using data parallel, the data required for different metric calculations + + are stored in their respective Metric classes, + + and we need to merge these data to uniformly calculate metric. + + Args: + other: Another Metric instance. 
+ + Returns: None + + """ + pass diff --git a/modelscope/metrics/bleu_metric.py b/modelscope/metrics/bleu_metric.py index 7c134b6a..0255826c 100644 --- a/modelscope/metrics/bleu_metric.py +++ b/modelscope/metrics/bleu_metric.py @@ -40,3 +40,13 @@ class BleuMetric(Metric): return { MetricKeys.BLEU_4: bleu.score, } + + def merge(self, other: 'BleuMetric'): + self.refs.extend(other.refs) + self.hyps.extend(other.hyps) + + def __getstate__(self): + return self.eval_tokenized_bleu, self.hyp_name, self.ref_name, self.refs, self.hyps + + def __setstate__(self, state): + self.eval_tokenized_bleu, self.hyp_name, self.ref_name, self.refs, self.hyps = state diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 03d4c324..025187fd 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -12,10 +12,13 @@ METRICS = Registry('metrics') class MetricKeys(object): ACCURACY = 'accuracy' F1 = 'f1' + Macro_F1 = 'macro-f1' PRECISION = 'precision' RECALL = 'recall' PSNR = 'psnr' SSIM = 'ssim' + LPIPS = 'lpips' + NIQE = 'niqe' AVERAGE_LOSS = 'avg_loss' FScore = 'fscore' FID = 'fid' @@ -26,6 +29,10 @@ class MetricKeys(object): NED = 'ned' # ocr metric mAP = 'mAP' BatchAcc = 'inbatch_t2i_recall_at_1' + CROPPING_RATIO = 'cropping_ratio' + DISTORTION_VALUE = 'distortion_value' + STABILITY_SCORE = 'stability_score' + PPL = 'ppl' task_default_metrics = { @@ -37,6 +44,8 @@ task_default_metrics = { Tasks.text_generation: [Metrics.text_gen_metric], Tasks.text_classification: [Metrics.seq_cls_metric], Tasks.image_denoising: [Metrics.image_denoise_metric], + Tasks.image_deblurring: [Metrics.image_denoise_metric], + Tasks.video_super_resolution: [Metrics.video_super_resolution_metric], Tasks.image_color_enhancement: [Metrics.image_color_enhance_metric], Tasks.image_portrait_enhancement: [Metrics.image_portrait_enhancement_metric], @@ -47,6 +56,9 @@ task_default_metrics = { Tasks.image_inpainting: [Metrics.image_inpainting_metric], Tasks.referring_video_object_segmentation: [Metrics.referring_video_object_segmentation_metric], + Tasks.video_frame_interpolation: + [Metrics.video_frame_interpolation_metric], + Tasks.video_stabilization: [Metrics.video_stabilization_metric], } diff --git a/modelscope/metrics/image_color_enhance_metric.py b/modelscope/metrics/image_color_enhance_metric.py index b3744975..280a2cdd 100644 --- a/modelscope/metrics/image_color_enhance_metric.py +++ b/modelscope/metrics/image_color_enhance_metric.py @@ -256,3 +256,13 @@ class ImageColorEnhanceMetric(Metric): MetricKeys.PSNR: sum(psnrs) / len(psnrs), MetricKeys.SSIM: sum(ssims) / len(ssims) } + + def merge(self, other: 'ImageColorEnhanceMetric'): + self.preds.extend(other.preds) + self.targets.extend(other.targets) + + def __getstate__(self): + return self.preds, self.targets + + def __setstate__(self, state): + self.preds, self.targets = state diff --git a/modelscope/metrics/image_denoise_metric.py b/modelscope/metrics/image_denoise_metric.py index cbbd1ea1..5bc5041a 100644 --- a/modelscope/metrics/image_denoise_metric.py +++ b/modelscope/metrics/image_denoise_metric.py @@ -44,6 +44,17 @@ class ImageDenoiseMetric(Metric): MetricKeys.SSIM: np.mean(ssim_list) } + def merge(self, other: 'ImageDenoiseMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return self.preds, self.labels + + def __setstate__(self, state): + self.__init__() + self.preds, self.labels = state + def reorder_image(img, input_order='HWC'): """Reorder images to 'HWC' order. 
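The merge/__getstate__/__setstate__ trio being added to the metric classes in this change supports data-parallel evaluation: each worker accumulates its own metric state, the states are pickled and gathered, and one process merges them before calling evaluate(). A minimal standalone sketch of that flow follows; ToyMetric and the gather step are illustrative stand-ins, not part of the modelscope API.

import pickle


class ToyMetric:
    """Illustrative stand-in for a Metric subclass that accumulates preds/labels per batch."""

    def __init__(self):
        self.preds = []
        self.labels = []

    def add(self, preds, labels):
        self.preds.extend(preds)
        self.labels.extend(labels)

    def merge(self, other: 'ToyMetric'):
        # Same contract as the merge() methods introduced in this change.
        self.preds.extend(other.preds)
        self.labels.extend(other.labels)

    def __getstate__(self):
        return self.preds, self.labels

    def __setstate__(self, state):
        self.__init__()
        self.preds, self.labels = state

    def evaluate(self):
        correct = sum(p == t for p, t in zip(self.preds, self.labels))
        return {'accuracy': correct / max(len(self.labels), 1)}


# Pretend these came from two data-parallel workers (e.g. gathered via torch.distributed).
worker_a, worker_b = ToyMetric(), ToyMetric()
worker_a.add([1, 0], [1, 1])
worker_b.add([0, 1], [0, 1])

gathered = [pickle.dumps(worker_a), pickle.dumps(worker_b)]  # pickling goes through __getstate__
main_metric = pickle.loads(gathered[0])                      # unpickling goes through __setstate__
for payload in gathered[1:]:
    main_metric.merge(pickle.loads(payload))
print(main_metric.evaluate())  # {'accuracy': 0.75}

In the real metric classes the pickled state can also carry configuration (for example label2id in TokenClassificationMetric), which is why several __setstate__ implementations call __init__() first and then restore the saved tuple.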
diff --git a/modelscope/metrics/image_inpainting_metric.py b/modelscope/metrics/image_inpainting_metric.py index 954d4ca2..27f7083e 100644 --- a/modelscope/metrics/image_inpainting_metric.py +++ b/modelscope/metrics/image_inpainting_metric.py @@ -208,3 +208,14 @@ class ImageInpaintingMetric(Metric): ssim_list = torch_nested_numpify(ssim_list) fid = self.FID.get_value() return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid} + + def merge(self, other: 'ImageInpaintingMetric'): + self.preds.extend(other.preds) + self.targets.extend(other.targets) + + def __getstate__(self): + return self.preds, self.targets + + def __setstate__(self, state): + self.__init__() + self.preds, self.targets = state diff --git a/modelscope/metrics/image_instance_segmentation_metric.py b/modelscope/metrics/image_instance_segmentation_metric.py index 86a19d13..887177cf 100644 --- a/modelscope/metrics/image_instance_segmentation_metric.py +++ b/modelscope/metrics/image_instance_segmentation_metric.py @@ -112,6 +112,16 @@ class ImageInstanceSegmentationCOCOMetric(Metric): tmp_dir.cleanup() return eval_results + def merge(self, other: 'ImageInstanceSegmentationCOCOMetric'): + self.results.extend(other.results) + + def __getstate__(self): + return self.results + + def __setstate__(self, state): + self.__init__() + self.results = state + def format_results(self, results, img_ids, jsonfile_prefix=None, **kwargs): """Format the results to json (standard format for COCO evaluation). diff --git a/modelscope/metrics/image_portrait_enhancement_metric.py b/modelscope/metrics/image_portrait_enhancement_metric.py index 7d94aade..dd69688f 100644 --- a/modelscope/metrics/image_portrait_enhancement_metric.py +++ b/modelscope/metrics/image_portrait_enhancement_metric.py @@ -49,3 +49,13 @@ class ImagePortraitEnhancementMetric(Metric): ] return {MetricKeys.PSNR: sum(psnrs) / len(psnrs)} + + def merge(self, other: 'ImagePortraitEnhancementMetric'): + self.preds.extend(other.preds) + self.targets.extend(other.targets) + + def __getstate__(self): + return self.preds, self.targets + + def __setstate__(self, state): + self.preds, self.targets = state diff --git a/modelscope/metrics/inbatch_recall_metric.py b/modelscope/metrics/inbatch_recall_metric.py index d098a883..818b4563 100644 --- a/modelscope/metrics/inbatch_recall_metric.py +++ b/modelscope/metrics/inbatch_recall_metric.py @@ -53,3 +53,14 @@ class InbatchRecallMetric(Metric): MetricKeys.BatchAcc: sum(self.inbatch_t2i_hitcnts) / sum(self.batch_sizes) } + + def merge(self, other: 'InbatchRecallMetric'): + self.inbatch_t2i_hitcnts.extend(other.inbatch_t2i_hitcnts) + self.batch_sizes.extend(other.batch_sizes) + + def __getstate__(self): + return self.inbatch_t2i_hitcnts, self.batch_sizes + + def __setstate__(self, state): + self.__init__() + self.inbatch_t2i_hitcnts, self.batch_sizes = state diff --git a/modelscope/metrics/map_metric.py b/modelscope/metrics/map_metric.py index aac76f22..aa7a835a 100644 --- a/modelscope/metrics/map_metric.py +++ b/modelscope/metrics/map_metric.py @@ -50,6 +50,17 @@ class AveragePrecisionMetric(Metric): scores = self._calculate_ap_score(self.preds, self.labels, self.thresh) return {MetricKeys.mAP: scores.mean().item()} + def merge(self, other: 'AveragePrecisionMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return self.preds, self.labels, self.thresh + + def __setstate__(self, state): + self.__init__() + self.preds, self.labels, self.thresh = state + def _calculate_ap_score(self, preds, 
labels, thresh=0.5): hyps = np.array(preds) refs = np.array(labels) diff --git a/modelscope/metrics/movie_scene_segmentation_metric.py b/modelscope/metrics/movie_scene_segmentation_metric.py index 65725b6f..df2807ce 100644 --- a/modelscope/metrics/movie_scene_segmentation_metric.py +++ b/modelscope/metrics/movie_scene_segmentation_metric.py @@ -52,3 +52,14 @@ class MovieSceneSegmentationMetric(Metric): MetricKeys.RECALL: recall, MetricKeys.PRECISION: precision } + + def merge(self, other: 'MovieSceneSegmentationMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return self.preds, self.labels + + def __setstate__(self, state): + self.__init__() + self.preds, self.labels = state diff --git a/modelscope/metrics/ned_metric.py b/modelscope/metrics/ned_metric.py index e87bb2c4..3a1ce66e 100644 --- a/modelscope/metrics/ned_metric.py +++ b/modelscope/metrics/ned_metric.py @@ -53,6 +53,17 @@ class NedMetric(Metric): ])).mean().item() } + def merge(self, other: 'NedMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return self.preds, self.labels + + def __setstate__(self, state): + self.__init__() + self.preds, self.labels = state + @staticmethod def _distance(pred, ref): if pred is None or ref is None: diff --git a/modelscope/metrics/ppl_metric.py b/modelscope/metrics/ppl_metric.py new file mode 100644 index 00000000..a7f6f14a --- /dev/null +++ b/modelscope/metrics/ppl_metric.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math +from typing import Dict, Union + +import numpy as np +import torch +import torch.nn.functional as F + +from modelscope.metainfo import Metrics +from modelscope.outputs import OutputKeys +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module(group_key=default_group, module_name=Metrics.PPL) +class PplMetric(Metric): + """The metric computation class for any classes. + + This metric class calculates perplexity for the whole input batches. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.avg_loss: float = 0. 
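+        # Running average of the per-batch cross-entropy loss; evaluate() reports perplexity as math.exp(avg_loss).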
+ self.batch_num: int = 0 + + def add(self, outputs: Dict, inputs: Dict): + logits = outputs[OutputKeys.LOGITS] + labels = inputs[OutputKeys.LABELS] + + in_loss = self._get_loss(logits, labels) + in_batch_num = self._get_batch_num(inputs[OutputKeys.LABELS]) + + self.avg_loss = self._average_loss(in_loss, in_batch_num) + self.batch_num += in_batch_num + + @staticmethod + def _get_loss(logits: torch.Tensor, labels: torch.Tensor) -> float: + labels = labels.view(-1) + logits = logits.view(labels.shape[0], -1) + return F.cross_entropy(logits, labels).item() + + @staticmethod + def _get_batch_num(matrix: Union[np.ndarray, torch.Tensor]) -> int: + return matrix.shape[0] + + def _average_loss(self, in_loss: float, in_batch_num): + return (self.avg_loss * self.batch_num + in_loss * in_batch_num) \ + / (self.batch_num + in_batch_num) + + def evaluate(self) -> Dict[str, float]: + return {MetricKeys.PPL: math.exp(self.avg_loss)} + + def merge(self, other: 'PplMetric'): + self.avg_loss = self._average_loss(other.avg_loss, other.batch_num) + self.batch_num += other.batch_num + + def __getstate__(self): + return self.avg_loss, self.batch_num + + def __setstate__(self, state): + self.__init__() + self.avg_loss, self.batch_num = state diff --git a/modelscope/metrics/referring_video_object_segmentation_metric.py b/modelscope/metrics/referring_video_object_segmentation_metric.py index 5a0af30b..d6cc7bcc 100644 --- a/modelscope/metrics/referring_video_object_segmentation_metric.py +++ b/modelscope/metrics/referring_video_object_segmentation_metric.py @@ -63,6 +63,15 @@ class ReferringVideoObjectSegmentationMetric(Metric): return eval_metrics + def merge(self, other: 'ReferringVideoObjectSegmentationMetric'): + self.preds.extend(other.preds) + + def __getstate__(self): + return self.ann_file, self.calculate_precision_and_iou_metrics, self.preds + + def __setstate__(self, state): + self.ann_file, self.calculate_precision_and_iou_metrics, self.preds = state + def compute_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6): outputs = outputs.int() diff --git a/modelscope/metrics/sequence_classification_metric.py b/modelscope/metrics/sequence_classification_metric.py index dc11c3d8..5a817691 100644 --- a/modelscope/metrics/sequence_classification_metric.py +++ b/modelscope/metrics/sequence_classification_metric.py @@ -58,4 +58,17 @@ class SequenceClassificationMetric(Metric): preds, average='micro' if any([label > 1 for label in labels]) else None), + MetricKeys.Macro_F1: + f1_score(labels, preds, average='macro'), } + + def merge(self, other: 'SequenceClassificationMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return self.preds, self.labels, self.label_name, self.logit_name + + def __setstate__(self, state): + self.__init__() + self.preds, self.labels, self.label_name, self.logit_name = state diff --git a/modelscope/metrics/text_generation_metric.py b/modelscope/metrics/text_generation_metric.py index adad871e..95947d3e 100644 --- a/modelscope/metrics/text_generation_metric.py +++ b/modelscope/metrics/text_generation_metric.py @@ -76,3 +76,14 @@ class TextGenerationMetric(Metric): MetricKeys.BLEU_1: bleu_1, MetricKeys.BLEU_4: bleu_4 } + + def merge(self, other: 'TextGenerationMetric'): + self.preds.extend(other.preds) + self.tgts.extend(other.tgts) + + def __getstate__(self): + return self.preds, self.tgts + + def __setstate__(self, state): + self.__init__() + self.preds, self.tgts = state diff --git 
a/modelscope/metrics/token_classification_metric.py b/modelscope/metrics/token_classification_metric.py index 5d1ece4a..33b64a9c 100644 --- a/modelscope/metrics/token_classification_metric.py +++ b/modelscope/metrics/token_classification_metric.py @@ -90,6 +90,19 @@ class TokenClassificationMetric(Metric): MetricKeys.ACCURACY: results[MetricKeys.ACCURACY], } + def merge(self, other: 'TokenClassificationMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return (self.return_entity_level_metrics, self.preds, self.labels, + self.label2id, self.label_name, self.logit_name) + + def __setstate__(self, state): + self.__init__() + (self.return_entity_level_metrics, self.preds, self.labels, + self.label2id, self.label_name, self.logit_name) = state + @staticmethod def _compute( predictions, diff --git a/modelscope/metrics/video_frame_interpolation_metric.py b/modelscope/metrics/video_frame_interpolation_metric.py new file mode 100644 index 00000000..4b81fbba --- /dev/null +++ b/modelscope/metrics/video_frame_interpolation_metric.py @@ -0,0 +1,183 @@ +# ------------------------------------------------------------------------ +# Copyright (c) Alibaba, Inc. and its affiliates. +# ------------------------------------------------------------------------ +import math +from math import exp +from typing import Dict + +import lpips +import numpy as np +import torch +import torch.nn.functional as F + +from modelscope.metainfo import Metrics +from modelscope.metrics.base import Metric +from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.utils.registry import default_group + + +@METRICS.register_module( + group_key=default_group, + module_name=Metrics.video_frame_interpolation_metric) +class VideoFrameInterpolationMetric(Metric): + """The metric computation class for video frame interpolation, + which will return PSNR, SSIM and LPIPS. 
+ """ + pred_name = 'pred' + label_name = 'target' + + def __init__(self): + super(VideoFrameInterpolationMetric, self).__init__() + self.preds = [] + self.labels = [] + self.loss_fn_alex = lpips.LPIPS(net='alex').cuda() + + def add(self, outputs: Dict, inputs: Dict): + ground_truths = outputs[VideoFrameInterpolationMetric.label_name] + eval_results = outputs[VideoFrameInterpolationMetric.pred_name] + self.preds.append(eval_results) + self.labels.append(ground_truths) + + def evaluate(self): + psnr_list, ssim_list, lpips_list = [], [], [] + with torch.no_grad(): + for (pred, label) in zip(self.preds, self.labels): + # norm to 0-1 + height, width = label.size(2), label.size(3) + pred = pred[:, :, 0:height, 0:width] + + psnr_list.append(calculate_psnr(label, pred)) + ssim_list.append(calculate_ssim(label, pred)) + lpips_list.append( + calculate_lpips(label, pred, self.loss_fn_alex)) + + return { + MetricKeys.PSNR: np.mean(psnr_list), + MetricKeys.SSIM: np.mean(ssim_list), + MetricKeys.LPIPS: np.mean(lpips_list) + } + + def merge(self, other: 'VideoFrameInterpolationMetric'): + self.preds.extend(other.preds) + self.labels.extend(other.labels) + + def __getstate__(self): + return self.preds, self.labels + + def __setstate__(self, state): + self.__init__() + self.preds, self.labels = state + + +def gaussian(window_size, sigma): + gauss = torch.Tensor([ + exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + +def create_window_3d(window_size, channel=1, device=None): + _1D_window = gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()) + _3D_window = _2D_window.unsqueeze(2) @ (_1D_window.t()) + window = _3D_window.expand(1, channel, window_size, window_size, + window_size).contiguous().to(device) + return window + + +def calculate_psnr(img1, img2): + psnr = -10 * math.log10( + torch.mean((img1[0] - img2[0]) * (img1[0] - img2[0])).cpu().data) + return psnr + + +def calculate_ssim(img1, + img2, + window_size=11, + window=None, + size_average=True, + full=False, + val_range=None): + # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh). 
+ if val_range is None: + if torch.max(img1) > 128: + max_val = 255 + else: + max_val = 1 + + if torch.min(img1) < -0.5: + min_val = -1 + else: + min_val = 0 + L = max_val - min_val + else: + L = val_range + + padd = 0 + (_, _, height, width) = img1.size() + if window is None: + real_size = min(window_size, height, width) + window = create_window_3d( + real_size, channel=1, device=img1.device).to(img1.device) + # Channel is set to 1 since we consider color images as volumetric images + + img1 = img1.unsqueeze(1) + img2 = img2.unsqueeze(1) + + mu1 = F.conv3d( + F.pad(img1, (5, 5, 5, 5, 5, 5), mode='replicate'), + window, + padding=padd, + groups=1) + mu2 = F.conv3d( + F.pad(img2, (5, 5, 5, 5, 5, 5), mode='replicate'), + window, + padding=padd, + groups=1) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv3d( + F.pad(img1 * img1, (5, 5, 5, 5, 5, 5), 'replicate'), + window, + padding=padd, + groups=1) - mu1_sq + sigma2_sq = F.conv3d( + F.pad(img2 * img2, (5, 5, 5, 5, 5, 5), 'replicate'), + window, + padding=padd, + groups=1) - mu2_sq + sigma12 = F.conv3d( + F.pad(img1 * img2, (5, 5, 5, 5, 5, 5), 'replicate'), + window, + padding=padd, + groups=1) - mu1_mu2 + + C1 = (0.01 * L)**2 + C2 = (0.03 * L)**2 + + v1 = 2.0 * sigma12 + C2 + v2 = sigma1_sq + sigma2_sq + C2 + cs = torch.mean(v1 / v2) # contrast sensitivity + + ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2) + + if size_average: + ret = ssim_map.mean() + else: + ret = ssim_map.mean(1).mean(1).mean(1) + + if full: + return ret, cs + return ret.cpu() + + +def calculate_lpips(img1, img2, loss_fn_alex): + img1 = img1 * 2 - 1 + img2 = img2 * 2 - 1 + + d = loss_fn_alex(img1, img2) + return d.cpu().item() diff --git a/modelscope/metrics/video_stabilization_metric.py b/modelscope/metrics/video_stabilization_metric.py new file mode 100644 index 00000000..96d05b0c --- /dev/null +++ b/modelscope/metrics/video_stabilization_metric.py @@ -0,0 +1,269 @@ +# Part of the implementation is borrowed and modified from DIFRINT, +# publicly available at https://github.com/jinsc37/DIFRINT/blob/master/metrics.py + +import os +import sys +import tempfile +from typing import Dict + +import cv2 +import numpy as np +from tqdm import tqdm + +from modelscope.metainfo import Metrics +from modelscope.models.cv.video_stabilization.utils.WarpUtils import \ + warpListImage +from modelscope.utils.registry import default_group +from .base import Metric +from .builder import METRICS, MetricKeys + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.video_stabilization_metric) +class VideoStabilizationMetric(Metric): + """The metric for video summarization task. 
+ """ + + def __init__(self): + self.inputs = [] + self.outputs = [] + + def add(self, outputs: Dict, inputs: Dict): + out = video_merger(warpprocess(outputs)) + self.outputs.append(out['video']) + self.inputs.append(inputs['input'][0]) + + def evaluate(self): + CR = [] + DV = [] + SS = [] + for output, input in zip(self.outputs, self.inputs): + cropping_ratio, distortion_value, stability_score = \ + metrics(input, output) + if cropping_ratio <= 1 and distortion_value <= 1 and stability_score <= 1: + CR.append(cropping_ratio) + DV.append(distortion_value) + SS.append(stability_score) + else: + print('Removed one error item when computing metrics.') + + return { + MetricKeys.CROPPING_RATIO: sum(CR) / len(CR), + MetricKeys.DISTORTION_VALUE: sum(DV) / len(DV), + MetricKeys.STABILITY_SCORE: sum(SS) / len(SS), + } + + def merge(self, other: 'VideoStabilizationMetric'): + self.inputs.extend(other.inputs) + self.outputs.extend(other.outputs) + + def __getstate__(self): + return self.inputs, self.outputs + + def __setstate__(self, state): + self.inputs, self.outputs = state + + +def warpprocess(inputs): + """ video stabilization postprocess + + Args: + inputs: input data + + Return: + dict of results: a dict containing outputs of model. + """ + x_paths = inputs['origin_motion'][:, :, :, 0] + y_paths = inputs['origin_motion'][:, :, :, 1] + sx_paths = inputs['smooth_path'][:, :, :, 0] + sy_paths = inputs['smooth_path'][:, :, :, 1] + new_x_motion_meshes = sx_paths - x_paths + new_y_motion_meshes = sy_paths - y_paths + out_images = warpListImage(inputs['ori_images'], new_x_motion_meshes, + new_y_motion_meshes, inputs['width'], + inputs['height']) + + return { + 'output': out_images, + 'fps': inputs['fps'], + 'width': inputs['width'], + 'height': inputs['height'], + 'base_crop_width': inputs['base_crop_width'] + } + + +def video_merger(inputs): + out_images = inputs['output'].numpy().astype(np.uint8) + out_images = [ + np.transpose(out_images[idx], (1, 2, 0)) + for idx in range(out_images.shape[0]) + ] + + output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + w = inputs['width'] + h = inputs['height'] + base_crop_width = inputs['base_crop_width'] + video_writer = cv2.VideoWriter(output_video_path, fourcc, inputs['fps'], + (w, h)) + + for idx, frame in enumerate(out_images): + horizontal_border = int(base_crop_width * w / 1280) + vertical_border = int(horizontal_border * h / w) + new_frame = frame[vertical_border:-vertical_border, + horizontal_border:-horizontal_border] + new_frame = cv2.resize(new_frame, (w, h)) + video_writer.write(new_frame) + video_writer.release() + + return {'video': output_video_path} + + +def metrics(original_v, pred_v): + # Create brute-force matcher object + bf = cv2.BFMatcher() + + sift = cv2.SIFT_create() + + # Apply the homography transformation if we have enough good matches + MIN_MATCH_COUNT = 10 + + ratio = 0.7 + thresh = 5.0 + + CR_seq = [] + DV_seq = [] + Pt = np.eye(3) + P_seq = [] + + vc_o = cv2.VideoCapture(original_v) + vc_p = cv2.VideoCapture(pred_v) + + rval_o = vc_o.isOpened() + rval_p = vc_p.isOpened() + + imgs1 = [] + imgs1o = [] + while (rval_o and rval_p): + rval_o, img1 = vc_o.read() + rval_p, img1o = vc_p.read() + if rval_o and rval_p: + imgs1.append(img1) + imgs1o.append(img1o) + is_got_bad_item = False + print('processing ' + original_v.split('/')[-1] + ':') + for i in tqdm(range(len(imgs1))): + # Load the images in gray scale + img1 = imgs1[i] + img1o = imgs1o[i] + + # Detect the SIFT key points 
and compute the descriptors for the two images + keyPoints1, descriptors1 = sift.detectAndCompute(img1, None) + keyPoints1o, descriptors1o = sift.detectAndCompute(img1o, None) + + # Match the descriptors + matches = bf.knnMatch(descriptors1, descriptors1o, k=2) + + # Select the good matches using the ratio test + goodMatches = [] + + for m, n in matches: + if m.distance < ratio * n.distance: + goodMatches.append(m) + + if len(goodMatches) > MIN_MATCH_COUNT: + # Get the good key points positions + sourcePoints = np.float32([ + keyPoints1[m.queryIdx].pt for m in goodMatches + ]).reshape(-1, 1, 2) + destinationPoints = np.float32([ + keyPoints1o[m.trainIdx].pt for m in goodMatches + ]).reshape(-1, 1, 2) + + # Obtain the homography matrix + M, _ = cv2.findHomography( + sourcePoints, + destinationPoints, + method=cv2.RANSAC, + ransacReprojThreshold=thresh) + else: + is_got_bad_item = True + + # end + + if not is_got_bad_item: + # Obtain Scale, Translation, Rotation, Distortion value + # Based on https://math.stackexchange.com/questions/78137/decomposition-of-a-nonsquare-affine-matrix + scaleRecovered = np.sqrt(M[0, 1]**2 + M[0, 0]**2) + + w, _ = np.linalg.eig(M[0:2, 0:2]) + w = np.sort(w)[::-1] + DV = w[1] / w[0] + + CR_seq.append(1 / scaleRecovered) + DV_seq.append(DV) + + # For Stability score calculation + if i + 1 < len(imgs1): + img2o = imgs1o[i + 1] + + keyPoints2o, descriptors2o = sift.detectAndCompute(img2o, None) + matches = bf.knnMatch(descriptors1o, descriptors2o, k=2) + goodMatches = [] + + for m, n in matches: + if m.distance < ratio * n.distance: + goodMatches.append(m) + + if len(goodMatches) > MIN_MATCH_COUNT: + # Get the good key points positions + sourcePoints = np.float32([ + keyPoints1o[m.queryIdx].pt for m in goodMatches + ]).reshape(-1, 1, 2) + destinationPoints = np.float32([ + keyPoints2o[m.trainIdx].pt for m in goodMatches + ]).reshape(-1, 1, 2) + + # Obtain the homography matrix + M, _ = cv2.findHomography( + sourcePoints, + destinationPoints, + method=cv2.RANSAC, + ransacReprojThreshold=thresh) + # end + + P_seq.append(np.matmul(Pt, M)) + Pt = np.matmul(Pt, M) + # end + # end + + if is_got_bad_item: + return -1, -1, -1 + + # Make 1D temporal signals + P_seq_t = [] + P_seq_r = [] + + for Mp in P_seq: + transRecovered = np.sqrt(Mp[0, 2]**2 + Mp[1, 2]**2) + # Based on https://math.stackexchange.com/questions/78137/decomposition-of-a-nonsquare-affine-matrix + thetaRecovered = np.arctan2(Mp[1, 0], Mp[0, 0]) * 180 / np.pi + P_seq_t.append(transRecovered) + P_seq_r.append(thetaRecovered) + + # FFT + fft_t = np.fft.fft(P_seq_t) + fft_r = np.fft.fft(P_seq_r) + fft_t = np.abs(fft_t)**2 + fft_r = np.abs(fft_r)**2 + + fft_t = np.delete(fft_t, 0) + fft_r = np.delete(fft_r, 0) + fft_t = fft_t[:len(fft_t) // 2] + fft_r = fft_r[:len(fft_r) // 2] + + SS_t = np.sum(fft_t[:5]) / np.sum(fft_t) + SS_r = np.sum(fft_r[:5]) / np.sum(fft_r) + + return np.min([np.mean(CR_seq), + 1]), np.absolute(np.min(DV_seq)), (SS_t + SS_r) / 2 diff --git a/modelscope/metrics/video_summarization_metric.py b/modelscope/metrics/video_summarization_metric.py index 40580382..acfc263f 100644 --- a/modelscope/metrics/video_summarization_metric.py +++ b/modelscope/metrics/video_summarization_metric.py @@ -79,3 +79,13 @@ class VideoSummarizationMetric(Metric): ] return {MetricKeys.FScore: sum(f_scores) / len(f_scores)} + + def merge(self, other: 'VideoSummarizationMetric'): + self.inputs.extend(other.inputs) + self.outputs.extend(other.outputs) + + def __getstate__(self): + return self.inputs, self.outputs + + def 
__setstate__(self, state): + self.inputs, self.outputs = state diff --git a/modelscope/models/nlp/mglm/mpu/tests/__init__.py b/modelscope/metrics/video_super_resolution_metric/__init__.py similarity index 100% rename from modelscope/models/nlp/mglm/mpu/tests/__init__.py rename to modelscope/metrics/video_super_resolution_metric/__init__.py diff --git a/modelscope/metrics/video_super_resolution_metric/matlab_functions.py b/modelscope/metrics/video_super_resolution_metric/matlab_functions.py new file mode 100644 index 00000000..96dc49ed --- /dev/null +++ b/modelscope/metrics/video_super_resolution_metric/matlab_functions.py @@ -0,0 +1,182 @@ +# The implementation is adopted from BasicSR, +# made publicly available under the Apache 2.0 License at +# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/utils/matlab_functions.py +import math + +import numpy as np +import torch + + +def cubic(x): + """cubic function used for calculate_weights_indices.""" + absx = torch.abs(x) + absx2 = absx**2 + absx3 = absx**3 + a = 1.5 * absx3 - 2.5 * absx2 + 1 + b = (absx <= 1).type_as(absx) + c = -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2 + return a * b + c * (((absx > 1) * (absx <= 2)).type_as(absx)) + + +def calculate_weights_indices(in_length, out_length, scale, kernel, + kernel_width, antialiasing): + """Calculate weights and indices, used for imresize function. + Args: + in_length (int): Input length. + out_length (int): Output length. + scale (float): Scale factor. + kernel_width (int): Kernel width. + antialisaing (bool): Whether to apply anti-aliasing when downsampling. + """ + + if (scale < 1) and antialiasing: + # Use a modified kernel (larger kernel width) to simultaneously + # interpolate and antialias + kernel_width = kernel_width / scale + + # Output-space coordinates + x = torch.linspace(1, out_length, out_length) + + # Input-space coordinates. Calculate the inverse mapping such that 0.5 + # in output space maps to 0.5 in input space, and 0.5 + scale in output + # space maps to 1.5 in input space. + u = x / scale + 0.5 * (1 - 1 / scale) + + # What is the left-most pixel that can be involved in the computation? + left = torch.floor(u - kernel_width / 2) + + # What is the maximum number of pixels that can be involved in the + # computation? Note: it's OK to use an extra pixel here; if the + # corresponding weights are all zero, it will be eliminated at the end + # of this function. + p = math.ceil(kernel_width) + 2 + + # The indices of the input pixels involved in computing the k-th output + # pixel are in row k of the indices matrix. + indices = left.view(out_length, 1).expand(out_length, p) + torch.linspace( + 0, p - 1, p).view(1, p).expand(out_length, p) + + # The weights used to compute the k-th output pixel are in row k of the + # weights matrix. + distance_to_center = u.view(out_length, 1).expand(out_length, p) - indices + + # apply cubic kernel + if (scale < 1) and antialiasing: + weights = scale * cubic(distance_to_center * scale) + else: + weights = cubic(distance_to_center) + + # Normalize the weights matrix so that each row sums to 1. + weights_sum = torch.sum(weights, 1).view(out_length, 1) + weights = weights / weights_sum.expand(out_length, p) + + # If a column in weights is all zero, get rid of it. only consider the + # first and last column. 
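+    # Explanatory note: p = ceil(kernel_width) + 2 deliberately over-allocates the
+    # sampling window, so the first or last column of `weights` can come out
+    # identically zero; the narrow() calls below trim such a column so only the
+    # taps that actually contribute to the output pixel are kept.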
+ weights_zero_tmp = torch.sum((weights == 0), 0) + if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6): + indices = indices.narrow(1, 1, p - 2) + weights = weights.narrow(1, 1, p - 2) + if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6): + indices = indices.narrow(1, 0, p - 2) + weights = weights.narrow(1, 0, p - 2) + weights = weights.contiguous() + indices = indices.contiguous() + sym_len_s = -indices.min() + 1 + sym_len_e = indices.max() - in_length + indices = indices + sym_len_s - 1 + return weights, indices, int(sym_len_s), int(sym_len_e) + + +@torch.no_grad() +def imresize(img, scale, antialiasing=True): + """imresize function same as MATLAB. + It now only supports bicubic. + The same scale applies for both height and width. + Args: + img (Tensor | Numpy array): + Tensor: Input image with shape (c, h, w), [0, 1] range. + Numpy: Input image with shape (h, w, c), [0, 1] range. + scale (float): Scale factor. The same scale applies for both height + and width. + antialisaing (bool): Whether to apply anti-aliasing when downsampling. + Default: True. + Returns: + Tensor: Output image with shape (c, h, w), [0, 1] range, w/o round. + """ + squeeze_flag = False + if type(img).__module__ == np.__name__: # numpy type + numpy_type = True + if img.ndim == 2: + img = img[:, :, None] + squeeze_flag = True + img = torch.from_numpy(img.transpose(2, 0, 1)).float() + else: + numpy_type = False + if img.ndim == 2: + img = img.unsqueeze(0) + squeeze_flag = True + + in_c, in_h, in_w = img.size() + out_h, out_w = math.ceil(in_h * scale), math.ceil(in_w * scale) + kernel_width = 4 + kernel = 'cubic' + + # get weights and indices + weights_h, indices_h, sym_len_hs, sym_len_he = calculate_weights_indices( + in_h, out_h, scale, kernel, kernel_width, antialiasing) + weights_w, indices_w, sym_len_ws, sym_len_we = calculate_weights_indices( + in_w, out_w, scale, kernel, kernel_width, antialiasing) + # process H dimension + # symmetric copying + img_aug = torch.FloatTensor(in_c, in_h + sym_len_hs + sym_len_he, in_w) + img_aug.narrow(1, sym_len_hs, in_h).copy_(img) + + sym_patch = img[:, :sym_len_hs, :] + inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() + sym_patch_inv = sym_patch.index_select(1, inv_idx) + img_aug.narrow(1, 0, sym_len_hs).copy_(sym_patch_inv) + + sym_patch = img[:, -sym_len_he:, :] + inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() + sym_patch_inv = sym_patch.index_select(1, inv_idx) + img_aug.narrow(1, sym_len_hs + in_h, sym_len_he).copy_(sym_patch_inv) + + out_1 = torch.FloatTensor(in_c, out_h, in_w) + kernel_width = weights_h.size(1) + for i in range(out_h): + idx = int(indices_h[i][0]) + for j in range(in_c): + out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose( + 0, 1).mv(weights_h[i]) + + # process W dimension + # symmetric copying + out_1_aug = torch.FloatTensor(in_c, out_h, in_w + sym_len_ws + sym_len_we) + out_1_aug.narrow(2, sym_len_ws, in_w).copy_(out_1) + + sym_patch = out_1[:, :, :sym_len_ws] + inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long() + sym_patch_inv = sym_patch.index_select(2, inv_idx) + out_1_aug.narrow(2, 0, sym_len_ws).copy_(sym_patch_inv) + + sym_patch = out_1[:, :, -sym_len_we:] + inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long() + sym_patch_inv = sym_patch.index_select(2, inv_idx) + out_1_aug.narrow(2, sym_len_ws + in_w, sym_len_we).copy_(sym_patch_inv) + + out_2 = torch.FloatTensor(in_c, out_h, out_w) + kernel_width = weights_w.size(1) + for i in range(out_w): + idx = int(indices_w[i][0]) + 
for j in range(in_c): + out_2[j, :, i] = out_1_aug[j, :, + idx:idx + kernel_width].mv(weights_w[i]) + + if squeeze_flag: + out_2 = out_2.squeeze(0) + if numpy_type: + out_2 = out_2.numpy() + if not squeeze_flag: + out_2 = out_2.transpose(1, 2, 0) + + return out_2 diff --git a/modelscope/metrics/video_super_resolution_metric/metric_util.py b/modelscope/metrics/video_super_resolution_metric/metric_util.py new file mode 100644 index 00000000..7be249aa --- /dev/null +++ b/modelscope/metrics/video_super_resolution_metric/metric_util.py @@ -0,0 +1,129 @@ +# The implementation is adopted from BasicSR, +# made publicly available under the Apache 2.0 License at +# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/metric_util.py + +import numpy as np + + +def _convert_input_type_range(img): + """Convert the type and range of the input image. + It converts the input image to np.float32 type and range of [0, 1]. + It is mainly used for pre-processing the input image in colorspace + conversion functions such as rgb2ycbcr and ycbcr2rgb. + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + Returns: + (ndarray): The converted image with type of np.float32 and range of + [0, 1]. + """ + img_type = img.dtype + img = img.astype(np.float32) + if img_type == np.float32: + pass + elif img_type == np.uint8: + img /= 255. + else: + raise TypeError( + f'The img type should be np.float32 or np.uint8, but got {img_type}' + ) + return img + + +def _convert_output_type_range(img, dst_type): + """Convert the type and range of the image according to dst_type. + It converts the image to desired type and range. If `dst_type` is np.uint8, + images will be converted to np.uint8 type with range [0, 255]. If + `dst_type` is np.float32, it converts the image to np.float32 type with + range [0, 1]. + It is mainly used for post-processing images in colorspace conversion + functions such as rgb2ycbcr and ycbcr2rgb. + Args: + img (ndarray): The image to be converted with np.float32 type and + range [0, 255]. + dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it + converts the image to np.uint8 type with range [0, 255]. If + dst_type is np.float32, it converts the image to np.float32 type + with range [0, 1]. + Returns: + (ndarray): The converted image with desired type and range. + """ + if dst_type not in (np.uint8, np.float32): + raise TypeError( + f'The dst_type should be np.float32 or np.uint8, but got {dst_type}' + ) + if dst_type == np.uint8: + img = img.round() + else: + img /= 255. + return img.astype(dst_type) + + +def bgr2ycbcr(img, y_only=False): + """Convert a BGR image to YCbCr image. + The bgr version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. 
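+    Example (illustrative usage sketch, not part of the original BasicSR docstring;
+    shape/dtype behaviour follows directly from the conversion helpers above):
+        >>> import numpy as np
+        >>> bgr = (np.random.rand(8, 8, 3) * 255).astype(np.uint8)  # BGR, [0, 255]
+        >>> y = bgr2ycbcr(bgr, y_only=True)
+        >>> y.shape, y.dtype
+        ((8, 8), dtype('uint8'))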
+ """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def reorder_image(img, input_order='HWC'): + """Reorder images to 'HWC' order. + If the input_order is (h, w), return (h, w, 1); + If the input_order is (c, h, w), return (h, w, c); + If the input_order is (h, w, c), return as it is. + Args: + img (ndarray): Input image. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + If the input image shape is (h, w), input_order will not have + effects. Default: 'HWC'. + Returns: + ndarray: reordered image. + """ + + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'" + ) + if len(img.shape) == 2: + img = img[..., None] + if input_order == 'CHW': + img = img.transpose(1, 2, 0) + return img + + +def to_y_channel(img): + """Change to Y channel of YCbCr. + Args: + img (ndarray): Images with range [0, 255]. + Returns: + (ndarray): Images with range [0, 255] (float type) without round. + """ + img = img.astype(np.float32) / 255. + if img.ndim == 3 and img.shape[2] == 3: + img = bgr2ycbcr(img, y_only=True) + img = img[..., None] + return img * 255. diff --git a/modelscope/metrics/video_super_resolution_metric/niqe.py b/modelscope/metrics/video_super_resolution_metric/niqe.py new file mode 100644 index 00000000..ae38ef37 --- /dev/null +++ b/modelscope/metrics/video_super_resolution_metric/niqe.py @@ -0,0 +1,210 @@ +# The implementation is adopted from BasicSR, +# made publicly available under the Apache 2.0 License at +# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/niqe.py + +import math + +import cv2 +import numpy as np +from scipy.ndimage import convolve +from scipy.special import gamma + +from modelscope.hub.file_download import model_file_download +from modelscope.metrics.video_super_resolution_metric.matlab_functions import \ + imresize +from modelscope.metrics.video_super_resolution_metric.metric_util import ( + reorder_image, to_y_channel) + +downloaded_file_path = model_file_download( + model_id='damo/cv_realbasicvsr_video-super-resolution_videolq', + file_path='niqe_pris_params.npz') + + +def estimate_aggd_param(block): + """Estimate AGGD (Asymmetric Generalized Gaussian Distribution) parameters. + Args: + block (ndarray): 2D Image block. + Returns: + tuple: alpha (float), beta_l (float) and beta_r (float) for the AGGD + distribution (Estimating the parames in Equation 7 in the paper). 
+ """ + block = block.flatten() + gam = np.arange(0.2, 10.001, 0.001) # len = 9801 + gam_reciprocal = np.reciprocal(gam) + r_gam = np.square(gamma(gam_reciprocal * 2)) / ( + gamma(gam_reciprocal) * gamma(gam_reciprocal * 3)) + + left_std = np.sqrt(np.mean(block[block < 0]**2)) + right_std = np.sqrt(np.mean(block[block > 0]**2)) + gammahat = left_std / right_std + rhat = (np.mean(np.abs(block)))**2 / np.mean(block**2) + rhat1 = rhat * (gammahat**3 + 1) * (gammahat + 1) + rhatnorm = rhat1 / ((gammahat**2 + 1)**2) + array_position = np.argmin((r_gam - rhatnorm)**2) + + alpha = gam[array_position] + beta_l = left_std * np.sqrt(gamma(1 / alpha) / gamma(3 / alpha)) + beta_r = right_std * np.sqrt(gamma(1 / alpha) / gamma(3 / alpha)) + return (alpha, beta_l, beta_r) + + +def compute_feature(block): + """Compute features. + Args: + block (ndarray): 2D Image block. + Returns: + list: Features with length of 18. + """ + feat = [] + alpha, beta_l, beta_r = estimate_aggd_param(block) + feat.extend([alpha, (beta_l + beta_r) / 2]) + + # distortions disturb the fairly regular structure of natural images. + # This deviation can be captured by analyzing the sample distribution of + # the products of pairs of adjacent coefficients computed along + # horizontal, vertical and diagonal orientations. + shifts = [[0, 1], [1, 0], [1, 1], [1, -1]] + for i in range(len(shifts)): + shifted_block = np.roll(block, shifts[i], axis=(0, 1)) + alpha, beta_l, beta_r = estimate_aggd_param(block * shifted_block) + # Eq. 8 + mean = (beta_r - beta_l) * (gamma(2 / alpha) / gamma(1 / alpha)) + feat.extend([alpha, mean, beta_l, beta_r]) + return feat + + +def niqe(img, + mu_pris_param, + cov_pris_param, + gaussian_window, + block_size_h=96, + block_size_w=96): + """Calculate NIQE (Natural Image Quality Evaluator) metric. + ``Paper: Making a "Completely Blind" Image Quality Analyzer`` + This implementation could produce almost the same results as the official + MATLAB codes: http://live.ece.utexas.edu/research/quality/niqe_release.zip + Note that we do not include block overlap height and width, since they are + always 0 in the official implementation. + For good performance, it is advisable by the official implementation to + divide the distorted image in to the same size patched as used for the + construction of multivariate Gaussian model. + Args: + img (ndarray): Input image whose quality needs to be computed. The + image must be a gray or Y (of YCbCr) image with shape (h, w). + Range [0, 255] with float type. + mu_pris_param (ndarray): Mean of a pre-defined multivariate Gaussian + model calculated on the pristine dataset. + cov_pris_param (ndarray): Covariance of a pre-defined multivariate + Gaussian model calculated on the pristine dataset. + gaussian_window (ndarray): A 7x7 Gaussian window used for smoothing the + image. + block_size_h (int): Height of the blocks in to which image is divided. + Default: 96 (the official recommended value). + block_size_w (int): Width of the blocks in to which image is divided. + Default: 96 (the official recommended value). 
+ """ + assert img.ndim == 2, ( + 'Input image must be a gray or Y (of YCbCr) image with shape (h, w).') + # crop image + h, w = img.shape + num_block_h = math.floor(h / block_size_h) + num_block_w = math.floor(w / block_size_w) + img = img[0:num_block_h * block_size_h, 0:num_block_w * block_size_w] + + distparam = [] # dist param is actually the multiscale features + for scale in (1, 2): # perform on two scales (1, 2) + mu = convolve(img, gaussian_window, mode='nearest') + sigma = np.sqrt( + np.abs( + convolve(np.square(img), gaussian_window, mode='nearest') + - np.square(mu))) + # normalize, as in Eq. 1 in the paper + img_nomalized = (img - mu) / (sigma + 1) + + feat = [] + for idx_w in range(num_block_w): + for idx_h in range(num_block_h): + # process ecah block + block = img_nomalized[idx_h * block_size_h // scale:(idx_h + 1) + * block_size_h // scale, + idx_w * block_size_w // scale:(idx_w + 1) + * block_size_w // scale] + feat.append(compute_feature(block)) + + distparam.append(np.array(feat)) + + if scale == 1: + img = imresize(img / 255., scale=0.5, antialiasing=True) + img = img * 255. + + distparam = np.concatenate(distparam, axis=1) + + # fit a MVG (multivariate Gaussian) model to distorted patch features + mu_distparam = np.nanmean(distparam, axis=0) + # use nancov. ref: https://ww2.mathworks.cn/help/stats/nancov.html + distparam_no_nan = distparam[~np.isnan(distparam).any(axis=1)] + cov_distparam = np.cov(distparam_no_nan, rowvar=False) + + # compute niqe quality, Eq. 10 in the paper + invcov_param = np.linalg.pinv((cov_pris_param + cov_distparam) / 2) + quality = np.matmul( + np.matmul((mu_pris_param - mu_distparam), invcov_param), + np.transpose((mu_pris_param - mu_distparam))) + + quality = np.sqrt(quality) + quality = float(np.squeeze(quality)) + return quality + + +def calculate_niqe(img, + crop_border, + input_order='HWC', + convert_to='y', + **kwargs): + """Calculate NIQE (Natural Image Quality Evaluator) metric. + ``Paper: Making a "Completely Blind" Image Quality Analyzer`` + This implementation could produce almost the same results as the official + MATLAB codes: http://live.ece.utexas.edu/research/quality/niqe_release.zip + > MATLAB R2021a result for tests/data/baboon.png: 5.72957338 (5.7296) + > Our re-implementation result for tests/data/baboon.png: 5.7295763 (5.7296) + We use the official params estimated from the pristine dataset. + We use the recommended block size (96, 96) without overlaps. + Args: + img (ndarray): Input image whose quality needs to be computed. + The input image must be in range [0, 255] with float/int type. + The input_order of image can be 'HW' or 'HWC' or 'CHW'. (BGR order) + If the input order is 'HWC' or 'CHW', it will be converted to gray + or Y (of YCbCr) image according to the ``convert_to`` argument. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the metric calculation. + input_order (str): Whether the input order is 'HW', 'HWC' or 'CHW'. + Default: 'HWC'. + convert_to (str): Whether converted to 'y' (of MATLAB YCbCr) or 'gray'. + Default: 'y'. + Returns: + float: NIQE result. + """ + # we use the official params estimated from the pristine dataset. 
+ niqe_pris_params = np.load(downloaded_file_path) + mu_pris_param = niqe_pris_params['mu_pris_param'] + cov_pris_param = niqe_pris_params['cov_pris_param'] + gaussian_window = niqe_pris_params['gaussian_window'] + + img = img.astype(np.float32) + if input_order != 'HW': + img = reorder_image(img, input_order=input_order) + if convert_to == 'y': + img = to_y_channel(img) + elif convert_to == 'gray': + img = cv2.cvtColor(img / 255., cv2.COLOR_BGR2GRAY) * 255. + img = np.squeeze(img) + + if crop_border != 0: + img = img[crop_border:-crop_border, crop_border:-crop_border] + + # round is necessary for being consistent with MATLAB's result + img = img.round() + + niqe_result = niqe(img, mu_pris_param, cov_pris_param, gaussian_window) + + return niqe_result diff --git a/modelscope/metrics/video_super_resolution_metric/video_super_resolution_metric.py b/modelscope/metrics/video_super_resolution_metric/video_super_resolution_metric.py new file mode 100644 index 00000000..298d6836 --- /dev/null +++ b/modelscope/metrics/video_super_resolution_metric/video_super_resolution_metric.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Dict + +import numpy as np + +from modelscope.metainfo import Metrics +from modelscope.metrics.base import Metric +from modelscope.metrics.builder import METRICS, MetricKeys +from modelscope.metrics.video_super_resolution_metric.niqe import \ + calculate_niqe +from modelscope.utils.registry import default_group + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.video_super_resolution_metric) +class VideoSuperResolutionMetric(Metric): + """The metric computation class for real-world video super-resolution classes. + """ + pred_name = 'pred' + + def __init__(self): + super(VideoSuperResolutionMetric, self).__init__() + self.preds = [] + + def add(self, outputs: Dict, inputs: Dict): + eval_results = outputs[VideoSuperResolutionMetric.pred_name] + self.preds.append(eval_results) + + def evaluate(self): + niqe_list = [] + for pred in self.preds: + if isinstance(pred, list): + for item in pred: + niqe_list.append( + calculate_niqe( + item[0].permute(1, 2, 0).numpy() * 255, + crop_border=0)) + else: + niqe_list.append( + calculate_niqe( + pred[0].permute(1, 2, 0).numpy() * 255, crop_border=0)) + return {MetricKeys.NIQE: np.mean(niqe_list)} + + def merge(self, other: 'VideoSuperResolutionMetric'): + self.preds.extend(other.preds) + + def __getstate__(self): + return self.preds + + def __setstate__(self, state): + self.__init__() + self.preds = state diff --git a/modelscope/models/audio/__init__.py b/modelscope/models/audio/__init__.py index 07798cf4..3c3ba54a 100644 --- a/modelscope/models/audio/__init__.py +++ b/modelscope/models/audio/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from . import ans, asr, kws, tts +from . import ans, asr, itn, kws, tts diff --git a/modelscope/models/audio/itn/__init__.py b/modelscope/models/audio/itn/__init__.py new file mode 100644 index 00000000..be299f4a --- /dev/null +++ b/modelscope/models/audio/itn/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
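+# Lazily exposes GenericInverseTextProcessing through LazyImportModule, so the
+# underlying module is only imported on first access.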
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .generic_inverse_text_processing import GenericInverseTextProcessing + +else: + _import_structure = { + 'generic_inverse_text_processing': ['GenericInverseTextProcessing'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/audio/itn/generic_inverse_text_processing.py b/modelscope/models/audio/itn/generic_inverse_text_processing.py new file mode 100644 index 00000000..b71c7769 --- /dev/null +++ b/modelscope/models/audio/itn/generic_inverse_text_processing.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Frameworks, Tasks + + +@MODELS.register_module( + Tasks.inverse_text_processing, module_name=Models.generic_itn) +class GenericInverseTextProcessing(Model): + + def __init__(self, model_dir: str, itn_model_name: str, + model_config: Dict[str, Any], *args, **kwargs): + """initialize the info of model. + + Args: + model_dir (str): the model path. + itn_model_name (str): the itn model name from configuration.json + model_config (Dict[str, Any]): the detail config about model from configuration.json + """ + super().__init__(model_dir, itn_model_name, model_config, *args, + **kwargs) + self.model_cfg = { + # the recognition model dir path + 'model_workspace': model_dir, + # the itn model name + 'itn_model': itn_model_name, + # the am model file path + 'itn_model_path': os.path.join(model_dir, itn_model_name), + # the recognition model config dict + 'model_config': model_config + } + + def forward(self) -> Dict[str, Any]: + """ + just return the model config + + """ + + return self.model_cfg diff --git a/modelscope/models/audio/kws/__init__.py b/modelscope/models/audio/kws/__init__.py index dd183fe5..ee39be36 100644 --- a/modelscope/models/audio/kws/__init__.py +++ b/modelscope/models/audio/kws/__init__.py @@ -6,11 +6,13 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .generic_key_word_spotting import GenericKeyWordSpotting from .farfield.model import FSMNSeleNetV2Decorator + from .nearfield.model import FSMNDecorator else: _import_structure = { 'generic_key_word_spotting': ['GenericKeyWordSpotting'], 'farfield.model': ['FSMNSeleNetV2Decorator'], + 'nearfield.model': ['FSMNDecorator'], } import sys diff --git a/modelscope/outputs/nlp/__init__.py b/modelscope/models/audio/kws/nearfield/__init__.py similarity index 100% rename from modelscope/outputs/nlp/__init__.py rename to modelscope/models/audio/kws/nearfield/__init__.py diff --git a/modelscope/models/audio/kws/nearfield/cmvn.py b/modelscope/models/audio/kws/nearfield/cmvn.py new file mode 100644 index 00000000..bad065f7 --- /dev/null +++ b/modelscope/models/audio/kws/nearfield/cmvn.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# Copyright (c) 2020 Binbin Zhang +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import numpy as np +import torch + + +class GlobalCMVN(torch.nn.Module): + + def __init__(self, + mean: torch.Tensor, + istd: torch.Tensor, + norm_var: bool = True): + """ + Args: + mean (torch.Tensor): mean stats + istd (torch.Tensor): inverse std, std which is 1.0 / std + """ + super().__init__() + assert mean.shape == istd.shape + self.norm_var = norm_var + # The buffer can be accessed from this module using self.mean + self.register_buffer('mean', mean) + self.register_buffer('istd', istd) + + def forward(self, x: torch.Tensor): + """ + Args: + x (torch.Tensor): (batch, max_len, feat_dim) + + Returns: + (torch.Tensor): normalized feature + """ + x = x - self.mean + if self.norm_var: + x = x * self.istd + return x + + +def load_kaldi_cmvn(cmvn_file): + """ Load the kaldi format cmvn stats file and no need to calculate + + Args: + cmvn_file: cmvn stats file in kaldi format + + Returns: + a numpy array of [means, vars] + """ + + means = None + variance = None + with open(cmvn_file) as f: + all_lines = f.readlines() + for idx, line in enumerate(all_lines): + if line.find('AddShift') != -1: + segs = line.strip().split(' ') + assert len(segs) == 3 + next_line = all_lines[idx + 1] + means_str = re.findall(r'[\[](.*?)[\]]', next_line)[0] + means_list = means_str.strip().split(' ') + means = [0 - float(s) for s in means_list] + assert len(means) == int(segs[1]) + elif line.find('Rescale') != -1: + segs = line.strip().split(' ') + assert len(segs) == 3 + next_line = all_lines[idx + 1] + vars_str = re.findall(r'[\[](.*?)[\]]', next_line)[0] + vars_list = vars_str.strip().split(' ') + variance = [float(s) for s in vars_list] + assert len(variance) == int(segs[1]) + elif line.find('Splice') != -1: + segs = line.strip().split(' ') + assert len(segs) == 3 + next_line = all_lines[idx + 1] + splice_str = re.findall(r'[\[](.*?)[\]]', next_line)[0] + splice_list = splice_str.strip().split(' ') + assert len(splice_list) * int(segs[2]) == int(segs[1]) + copy_times = len(splice_list) + else: + continue + + cmvn = np.array([means, variance]) + cmvn = np.tile(cmvn, (1, copy_times)) + + return cmvn diff --git a/modelscope/models/audio/kws/nearfield/fsmn.py b/modelscope/models/audio/kws/nearfield/fsmn.py new file mode 100644 index 00000000..85c82a5a --- /dev/null +++ b/modelscope/models/audio/kws/nearfield/fsmn.py @@ -0,0 +1,521 @@ +''' +FSMN implementation. 
+ +Copyright: 2022-03-09 yueyue.nyy +''' + +from typing import Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def toKaldiMatrix(np_mat): + np.set_printoptions(threshold=np.inf, linewidth=np.nan) + out_str = str(np_mat) + out_str = out_str.replace('[', '') + out_str = out_str.replace(']', '') + return '[ %s ]\n' % out_str + + +def printTensor(torch_tensor): + re_str = '' + x = torch_tensor.detach().squeeze().numpy() + re_str += toKaldiMatrix(x) + # re_str += '\n' + print(re_str) + + +class LinearTransform(nn.Module): + + def __init__(self, input_dim, output_dim): + super(LinearTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.linear = nn.Linear(input_dim, output_dim, bias=False) + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, input): + output = self.quant(input) + output = self.linear(output) + output = self.dequant(output) + + return output + + def to_kaldi_net(self): + re_str = '' + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + re_str += ' 1\n' + + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += toKaldiMatrix(x) + # re_str += '\n' + + return re_str + + def to_pytorch_net(self, fread): + linear_line = fread.readline() + linear_split = linear_line.strip().split() + assert len(linear_split) == 3 + assert linear_split[0] == '' + self.output_dim = int(linear_split[1]) + self.input_dim = int(linear_split[2]) + + learn_rate_line = fread.readline() + assert learn_rate_line.find('LearnRateCoef') != -1 + + self.linear.reset_parameters() + + # linear_weights = self.state_dict()['linear.weight'] + # print(linear_weights.shape) + new_weights = torch.zeros((self.output_dim, self.input_dim), + dtype=torch.float32) + for i in range(self.output_dim): + line = fread.readline() + splits = line.strip().strip('[]').strip().split() + assert len(splits) == self.input_dim + cols = torch.tensor([float(item) for item in splits], + dtype=torch.float32) + new_weights[i, :] = cols + + self.linear.weight.data = new_weights + + +class AffineTransform(nn.Module): + + def __init__(self, input_dim, output_dim): + super(AffineTransform, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + + self.linear = nn.Linear(input_dim, output_dim) + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, input): + output = self.quant(input) + output = self.linear(output) + output = self.dequant(output) + + return output + + def to_kaldi_net(self): + re_str = '' + re_str += ' %d %d\n' % (self.output_dim, + self.input_dim) + re_str += ' 1 1 0\n' + + linear_weights = self.state_dict()['linear.weight'] + x = linear_weights.squeeze().numpy() + re_str += toKaldiMatrix(x) + + linear_bias = self.state_dict()['linear.bias'] + x = linear_bias.squeeze().numpy() + re_str += toKaldiMatrix(x) + # re_str += '\n' + + return re_str + + def to_pytorch_net(self, fread): + affine_line = fread.readline() + affine_split = affine_line.strip().split() + assert len(affine_split) == 3 + assert affine_split[0] == '' + self.output_dim = int(affine_split[1]) + self.input_dim = int(affine_split[2]) + print('AffineTransform output/input dim: %d %d' % + (self.output_dim, self.input_dim)) + + learn_rate_line = fread.readline() + assert learn_rate_line.find('LearnRateCoef') != -1 + + # linear_weights = self.state_dict()['linear.weight'] + # 
print(linear_weights.shape) + self.linear.reset_parameters() + + new_weights = torch.zeros((self.output_dim, self.input_dim), + dtype=torch.float32) + for i in range(self.output_dim): + line = fread.readline() + splits = line.strip().strip('[]').strip().split() + assert len(splits) == self.input_dim + cols = torch.tensor([float(item) for item in splits], + dtype=torch.float32) + new_weights[i, :] = cols + + self.linear.weight.data = new_weights + + # linear_bias = self.state_dict()['linear.bias'] + # print(linear_bias.shape) + bias_line = fread.readline() + splits = bias_line.strip().strip('[]').strip().split() + assert len(splits) == self.output_dim + new_bias = torch.tensor([float(item) for item in splits], + dtype=torch.float32) + + self.linear.bias.data = new_bias + + +class FSMNBlock(nn.Module): + + def __init__( + self, + input_dim: int, + output_dim: int, + lorder=None, + rorder=None, + lstride=1, + rstride=1, + ): + super(FSMNBlock, self).__init__() + + self.dim = input_dim + + if lorder is None: + return + + self.lorder = lorder + self.rorder = rorder + self.lstride = lstride + self.rstride = rstride + + self.conv_left = nn.Conv2d( + self.dim, + self.dim, [lorder, 1], + dilation=[lstride, 1], + groups=self.dim, + bias=False) + + if rorder > 0: + self.conv_right = nn.Conv2d( + self.dim, + self.dim, [rorder, 1], + dilation=[rstride, 1], + groups=self.dim, + bias=False) + else: + self.conv_right = None + + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, input): + x = torch.unsqueeze(input, 1) + x_per = x.permute(0, 3, 2, 1) + + y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0]) + y_left = self.quant(y_left) + y_left = self.conv_left(y_left) + y_left = self.dequant(y_left) + out = x_per + y_left + + if self.conv_right is not None: + y_right = F.pad(x_per, [0, 0, 0, (self.rorder) * self.rstride]) + y_right = y_right[:, :, self.rstride:, :] + y_right = self.quant(y_right) + y_right = self.conv_right(y_right) + y_right = self.dequant(y_right) + out += y_right + + out_per = out.permute(0, 3, 2, 1) + output = out_per.squeeze(1) + + return output + + def to_kaldi_net(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + re_str += ' %d %d %d %d %d 0\n' % ( + 1, self.lorder, self.rorder, self.lstride, self.rstride) + + # print(self.conv_left.weight,self.conv_right.weight) + lfiters = self.state_dict()['conv_left.weight'] + x = np.flipud(lfiters.squeeze().numpy().T) + re_str += toKaldiMatrix(x) + + if self.conv_right is not None: + rfiters = self.state_dict()['conv_right.weight'] + x = (rfiters.squeeze().numpy().T) + re_str += toKaldiMatrix(x) + # re_str += '\n' + + return re_str + + def to_pytorch_net(self, fread): + fsmn_line = fread.readline() + fsmn_split = fsmn_line.strip().split() + assert len(fsmn_split) == 3 + assert fsmn_split[0] == '' + self.dim = int(fsmn_split[1]) + + params_line = fread.readline() + params_split = params_line.strip().strip('[]').strip().split() + assert len(params_split) == 12 + assert params_split[0] == '' + assert params_split[2] == '' + self.lorder = int(params_split[3]) + assert params_split[4] == '' + self.rorder = int(params_split[5]) + assert params_split[6] == '' + self.lstride = int(params_split[7]) + assert params_split[8] == '' + self.rstride = int(params_split[9]) + assert params_split[10] == '' + + # lfilters = self.state_dict()['conv_left.weight'] + # print(lfilters.shape) + print('read conv_left weight') + new_lfilters = torch.zeros((self.lorder, 1, 
self.dim, 1), + dtype=torch.float32) + for i in range(self.lorder): + print('read conv_left weight -- %d' % i) + line = fread.readline() + splits = line.strip().strip('[]').strip().split() + assert len(splits) == self.dim + cols = torch.tensor([float(item) for item in splits], + dtype=torch.float32) + new_lfilters[self.lorder - 1 - i, 0, :, 0] = cols + + new_lfilters = torch.transpose(new_lfilters, 0, 2) + # print(new_lfilters.shape) + + self.conv_left.reset_parameters() + self.conv_left.weight.data = new_lfilters + # print(self.conv_left.weight.shape) + + if self.rorder > 0: + # rfilters = self.state_dict()['conv_right.weight'] + # print(rfilters.shape) + print('read conv_right weight') + new_rfilters = torch.zeros((self.rorder, 1, self.dim, 1), + dtype=torch.float32) + line = fread.readline() + for i in range(self.rorder): + print('read conv_right weight -- %d' % i) + line = fread.readline() + splits = line.strip().strip('[]').strip().split() + assert len(splits) == self.dim + cols = torch.tensor([float(item) for item in splits], + dtype=torch.float32) + new_rfilters[i, 0, :, 0] = cols + + new_rfilters = torch.transpose(new_rfilters, 0, 2) + # print(new_rfilters.shape) + self.conv_right.reset_parameters() + self.conv_right.weight.data = new_rfilters + # print(self.conv_right.weight.shape) + + +class RectifiedLinear(nn.Module): + + def __init__(self, input_dim, output_dim): + super(RectifiedLinear, self).__init__() + self.dim = input_dim + self.relu = nn.ReLU() + self.dropout = nn.Dropout(0.1) + + def forward(self, input): + out = self.relu(input) + # out = self.dropout(out) + return out + + def to_kaldi_net(self): + re_str = '' + re_str += ' %d %d\n' % (self.dim, self.dim) + # re_str += '\n' + return re_str + + # re_str = '' + # re_str += ' %d %d\n' % (self.dim, self.dim) + # re_str += ' 0 0\n' + # re_str += toKaldiMatrix(np.ones((self.dim), dtype = 'int32')) + # re_str += toKaldiMatrix(np.zeros((self.dim), dtype = 'int32')) + # re_str += '\n' + # return re_str + + def to_pytorch_net(self, fread): + line = fread.readline() + splits = line.strip().split() + assert len(splits) == 3 + assert splits[0] == '' + assert int(splits[1]) == int(splits[2]) + assert int(splits[1]) == self.dim + self.dim = int(splits[1]) + + +def _build_repeats( + fsmn_layers: int, + linear_dim: int, + proj_dim: int, + lorder: int, + rorder: int, + lstride=1, + rstride=1, +): + repeats = [ + nn.Sequential( + LinearTransform(linear_dim, proj_dim), + FSMNBlock(proj_dim, proj_dim, lorder, rorder, 1, 1), + AffineTransform(proj_dim, linear_dim), + RectifiedLinear(linear_dim, linear_dim)) + for i in range(fsmn_layers) + ] + + return nn.Sequential(*repeats) + + +class FSMN(nn.Module): + + def __init__( + self, + input_dim: int, + input_affine_dim: int, + fsmn_layers: int, + linear_dim: int, + proj_dim: int, + lorder: int, + rorder: int, + lstride: int, + rstride: int, + output_affine_dim: int, + output_dim: int, + ): + """ + Args: + input_dim: input dimension + input_affine_dim: input affine layer dimension + fsmn_layers: no. 
of fsmn units + linear_dim: fsmn input dimension + proj_dim: fsmn projection dimension + lorder: fsmn left order + rorder: fsmn right order + lstride: fsmn left stride + rstride: fsmn right stride + output_affine_dim: output affine layer dimension + output_dim: output dimension + """ + super(FSMN, self).__init__() + + self.input_dim = input_dim + self.input_affine_dim = input_affine_dim + self.fsmn_layers = fsmn_layers + self.linear_dim = linear_dim + self.proj_dim = proj_dim + self.lorder = lorder + self.rorder = rorder + self.lstride = lstride + self.rstride = rstride + self.output_affine_dim = output_affine_dim + self.output_dim = output_dim + + self.in_linear1 = AffineTransform(input_dim, input_affine_dim) + self.in_linear2 = AffineTransform(input_affine_dim, linear_dim) + self.relu = RectifiedLinear(linear_dim, linear_dim) + + self.fsmn = _build_repeats(fsmn_layers, linear_dim, proj_dim, lorder, + rorder, lstride, rstride) + + self.out_linear1 = AffineTransform(linear_dim, output_affine_dim) + self.out_linear2 = AffineTransform(output_affine_dim, output_dim) + # self.softmax = nn.Softmax(dim = -1) + + def fuse_modules(self): + pass + + def forward( + self, + input: torch.Tensor, + in_cache: torch.Tensor = torch.zeros(0, 0, 0, dtype=torch.float) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + input (torch.Tensor): Input tensor (B, T, D) + in_cache(torhc.Tensor): (B, D, C), C is the accumulated cache size + """ + + # print("FSMN forward!!!!") + # print(input.shape) + # print(input) + # print(self.in_linear1.input_dim) + # print(self.in_linear1.output_dim) + + x1 = self.in_linear1(input) + x2 = self.in_linear2(x1) + x3 = self.relu(x2) + x4 = self.fsmn(x3) + x5 = self.out_linear1(x4) + x6 = self.out_linear2(x5) + # x7 = self.softmax(x6) + + # return x7, None + return x6, in_cache + + def to_kaldi_net(self): + re_str = '' + re_str += '\n' + re_str += self.in_linear1.to_kaldi_net() + re_str += self.in_linear2.to_kaldi_net() + re_str += self.relu.to_kaldi_net() + + for fsmn in self.fsmn: + re_str += fsmn[0].to_kaldi_net() + re_str += fsmn[1].to_kaldi_net() + re_str += fsmn[2].to_kaldi_net() + re_str += fsmn[3].to_kaldi_net() + + re_str += self.out_linear1.to_kaldi_net() + re_str += self.out_linear2.to_kaldi_net() + re_str += ' %d %d\n' % (self.output_dim, self.output_dim) + # re_str += '\n' + re_str += '\n' + + return re_str + + def to_pytorch_net(self, kaldi_file): + with open(kaldi_file, 'r', encoding='utf8') as fread: + fread = open(kaldi_file, 'r') + nnet_start_line = fread.readline() + assert nnet_start_line.strip() == '' + + self.in_linear1.to_pytorch_net(fread) + self.in_linear2.to_pytorch_net(fread) + self.relu.to_pytorch_net(fread) + + for fsmn in self.fsmn: + fsmn[0].to_pytorch_net(fread) + fsmn[1].to_pytorch_net(fread) + fsmn[2].to_pytorch_net(fread) + fsmn[3].to_pytorch_net(fread) + + self.out_linear1.to_pytorch_net(fread) + self.out_linear2.to_pytorch_net(fread) + + softmax_line = fread.readline() + softmax_split = softmax_line.strip().split() + assert softmax_split[0].strip() == '' + assert int(softmax_split[1]) == self.output_dim + assert int(softmax_split[2]) == self.output_dim + # '\n' + + nnet_end_line = fread.readline() + assert nnet_end_line.strip() == '' + fread.close() + + +if __name__ == '__main__': + fsmn = FSMN(400, 140, 4, 250, 128, 10, 2, 1, 1, 140, 2599) + print(fsmn) + + num_params = sum(p.numel() for p in fsmn.parameters()) + print('the number of model params: {}'.format(num_params)) + x = torch.zeros(128, 200, 400) # batch-size * time * dim + y, _ = 
fsmn(x) # batch-size * time * dim + print('input shape: {}'.format(x.shape)) + print('output shape: {}'.format(y.shape)) + + print(fsmn.to_kaldi_net()) diff --git a/modelscope/models/audio/kws/nearfield/model.py b/modelscope/models/audio/kws/nearfield/model.py new file mode 100644 index 00000000..7bf55c8b --- /dev/null +++ b/modelscope/models/audio/kws/nearfield/model.py @@ -0,0 +1,178 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +import sys +import tempfile +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.constant import Tasks +from .cmvn import GlobalCMVN, load_kaldi_cmvn +from .fsmn import FSMN + + +@MODELS.register_module( + Tasks.keyword_spotting, + module_name=Models.speech_kws_fsmn_char_ctc_nearfield) +class FSMNDecorator(TorchModel): + r""" A decorator of FSMN for integrating into modelscope framework """ + + def __init__(self, + model_dir: str, + cmvn_file: str = None, + backbone: dict = None, + input_dim: int = 400, + output_dim: int = 2599, + training: Optional[bool] = False, + *args, + **kwargs): + """initialize the fsmn model from the `model_dir` path. + + Args: + model_dir (str): the model path. + cmvn_file (str): cmvn file + backbone (dict): params related to backbone + input_dim (int): input dimention of network + output_dim (int): output dimention of network + training (bool): training or inference mode + """ + super().__init__(model_dir, *args, **kwargs) + + self.model = None + self.model_cfg = None + + if training: + self.model = self.init_model(cmvn_file, backbone, input_dim, + output_dim) + else: + self.model_cfg = { + 'model_workspace': model_dir, + 'config_path': os.path.join(model_dir, 'config.yaml') + } + + def __del__(self): + if hasattr(self, 'tmp_dir'): + self.tmp_dir.cleanup() + + def forward(self, input) -> Dict[str, Tensor]: + """ + Args: + input (torch.Tensor): Input tensor (B, T, D) + """ + if self.model is not None and input is not None: + return self.model.forward(input) + else: + return self.model_cfg + + def init_model(self, cmvn_file, backbone, input_dim, output_dim): + if cmvn_file is not None: + mean, istd = load_kaldi_cmvn(cmvn_file) + global_cmvn = GlobalCMVN( + torch.from_numpy(mean).float(), + torch.from_numpy(istd).float(), + ) + else: + global_cmvn = None + + hidden_dim = 128 + preprocessing = None + + input_affine_dim = backbone['input_affine_dim'] + num_layers = backbone['num_layers'] + linear_dim = backbone['linear_dim'] + proj_dim = backbone['proj_dim'] + left_order = backbone['left_order'] + right_order = backbone['right_order'] + left_stride = backbone['left_stride'] + right_stride = backbone['right_stride'] + output_affine_dim = backbone['output_affine_dim'] + backbone = FSMN(input_dim, input_affine_dim, num_layers, linear_dim, + proj_dim, left_order, right_order, left_stride, + right_stride, output_affine_dim, output_dim) + + classifier = None + activation = None + + kws_model = KWSModel(input_dim, output_dim, hidden_dim, global_cmvn, + preprocessing, backbone, classifier, activation) + return kws_model + + +class KWSModel(nn.Module): + """Our model consists of four parts: + 1. global_cmvn: Optional, (idim, idim) + 2. preprocessing: feature dimention projection, (idim, hdim) + 3. 
backbone: backbone or feature extractor of the whole network, (hdim, hdim) + 4. classifier: output layer or classifier of KWS model, (hdim, odim) + 5. activation: + nn.Sigmoid for wakeup word + nn.Identity for speech command dataset + """ + + def __init__( + self, + idim: int, + odim: int, + hdim: int, + global_cmvn: Optional[nn.Module], + preprocessing: Optional[nn.Module], + backbone: nn.Module, + classifier: nn.Module, + activation: nn.Module, + ): + """ + Args: + idim (int): input dimension of network + odim (int): output dimension of network + hdim (int): hidden dimension of network + global_cmvn (nn.Module): cmvn for input feature, (idim, idim) + preprocessing (nn.Module): feature dimention projection, (idim, hdim) + backbone (nn.Module): backbone or feature extractor of the whole network, (hdim, hdim) + classifier (nn.Module): output layer or classifier of KWS model, (hdim, odim) + activation (nn.Module): nn.Identity for training, nn.Sigmoid for inference + """ + super().__init__() + self.idim = idim + self.odim = odim + self.hdim = hdim + self.global_cmvn = global_cmvn + self.preprocessing = preprocessing + self.backbone = backbone + self.classifier = classifier + self.activation = activation + + def to_kaldi_net(self): + return self.backbone.to_kaldi_net() + + def to_pytorch_net(self, kaldi_file): + return self.backbone.to_pytorch_net(kaldi_file) + + def forward( + self, + x: torch.Tensor, + in_cache: torch.Tensor = torch.zeros(0, 0, 0, dtype=torch.float) + ) -> Tuple[torch.Tensor, torch.Tensor]: + if self.global_cmvn is not None: + x = self.global_cmvn(x) + if self.preprocessing is not None: + x = self.preprocessing(x) + + x, out_cache = self.backbone(x, in_cache) + + if self.classifier is not None: + x = self.classifier(x) + if self.activation is not None: + x = self.activation(x) + return x, out_cache + + def fuse_modules(self): + if self.preprocessing is not None: + self.preprocessing.fuse_modules() + self.backbone.fuse_modules() diff --git a/modelscope/models/audio/punc/__init__.py b/modelscope/models/audio/punc/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/punc/generic_punctuation.py b/modelscope/models/audio/punc/generic_punctuation.py new file mode 100644 index 00000000..dabb6090 --- /dev/null +++ b/modelscope/models/audio/punc/generic_punctuation.py @@ -0,0 +1,43 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Frameworks, Tasks + + +@MODELS.register_module(Tasks.punctuation, module_name=Models.generic_punc) +class PunctuationProcessing(Model): + + def __init__(self, model_dir: str, punc_model_name: str, + punc_model_config: Dict[str, Any], *args, **kwargs): + """initialize the info of model. + + Args: + model_dir (str): the model path. 
+ punc_model_name (str): the itn model name from configuration.json + punc_model_config (Dict[str, Any]): the detail config about model from configuration.json + """ + super().__init__(model_dir, punc_model_name, punc_model_config, *args, + **kwargs) + self.model_cfg = { + # the recognition model dir path + 'model_workspace': model_dir, + # the itn model name + 'punc_model': punc_model_name, + # the am model file path + 'punc_model_path': os.path.join(model_dir, punc_model_name), + # the recognition model config dict + 'model_config': punc_model_config + } + + def forward(self) -> Dict[str, Any]: + """ + just return the model config + + """ + + return self.model_cfg diff --git a/modelscope/models/audio/separation/__init__.py b/modelscope/models/audio/separation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/separation/layer_norm.py b/modelscope/models/audio/separation/layer_norm.py new file mode 100644 index 00000000..a4145cbc --- /dev/null +++ b/modelscope/models/audio/separation/layer_norm.py @@ -0,0 +1,68 @@ +# Copyright 2018 Northwestern Polytechnical University (author: Ke Wang) +# made publicly available under the MIT License +# at https://github.com/wangkenpu/Conv-TasNet-PyTorch/blob/64188ffa48971218fdd68b66906970f215d7eca2/model/layer_norm.py + +from __future__ import absolute_import, division, print_function + +import torch +import torch.nn as nn + + +class CLayerNorm(nn.LayerNorm): + """Channel-wise layer normalization.""" + + def __init__(self, *args, **kwargs): + super(CLayerNorm, self).__init__(*args, **kwargs) + + def forward(self, sample): + """Forward function. + + Args: + sample: [batch_size, channels, length] + """ + if sample.dim() != 3: + raise RuntimeError('{} only accept 3-D tensor as input'.format( + self.__name__)) + # [N, C, T] -> [N, T, C] + sample = torch.transpose(sample, 1, 2) + # LayerNorm + sample = super().forward(sample) + # [N, T, C] -> [N, C, T] + sample = torch.transpose(sample, 1, 2) + return sample + + +class GLayerNorm(nn.Module): + """Global Layer Normalization for TasNet.""" + + def __init__(self, channels, eps=1e-5): + super(GLayerNorm, self).__init__() + self.eps = eps + self.norm_dim = channels + self.gamma = nn.Parameter(torch.Tensor(channels)) + self.beta = nn.Parameter(torch.Tensor(channels)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.gamma) + nn.init.zeros_(self.beta) + + def forward(self, sample): + """Forward function. + + Args: + sample: [batch_size, channels, length] + """ + if sample.dim() != 3: + raise RuntimeError('{} only accept 3-D tensor as input'.format( + self.__name__)) + # [N, C, T] -> [N, T, C] + sample = torch.transpose(sample, 1, 2) + # Mean and variance [N, 1, 1] + mean = torch.mean(sample, (1, 2), keepdim=True) + var = torch.mean((sample - mean)**2, (1, 2), keepdim=True) + sample = (sample + - mean) / torch.sqrt(var + self.eps) * self.gamma + self.beta + # [N, T, C] -> [N, C, T] + sample = torch.transpose(sample, 1, 2) + return sample diff --git a/modelscope/models/audio/separation/mossformer.py b/modelscope/models/audio/separation/mossformer.py new file mode 100644 index 00000000..2316bb26 --- /dev/null +++ b/modelscope/models/audio/separation/mossformer.py @@ -0,0 +1,476 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
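+# MossFormer speech separation: a Conv1d Encoder builds a latent mixture
+# representation, MossFormerMaskNet predicts one mask per speaker, and a
+# ConvTranspose1d Decoder reconstructs each separated waveform.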
+ +import copy +import os +from typing import Any, Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.models.audio.separation.mossformer_block import ( + MossFormerModule, ScaledSinuEmbedding) +from modelscope.models.audio.separation.mossformer_conv_module import ( + CumulativeLayerNorm, GlobalLayerNorm) +from modelscope.models.base import Tensor +from modelscope.utils.constant import Tasks + +EPS = 1e-8 + + +@MODELS.register_module( + Tasks.speech_separation, + module_name=Models.speech_mossformer_separation_temporal_8k) +class MossFormer(TorchModel): + """Library to support MossFormer speech separation. + + Args: + model_dir (str): the model path. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.encoder = Encoder( + kernel_size=kwargs['kernel_size'], + out_channels=kwargs['out_channels']) + self.decoder = Decoder( + in_channels=kwargs['in_channels'], + out_channels=1, + kernel_size=kwargs['kernel_size'], + stride=kwargs['stride'], + bias=kwargs['bias']) + self.mask_net = MossFormerMaskNet( + kwargs['in_channels'], + kwargs['out_channels'], + MossFormerM(kwargs['num_blocks'], kwargs['d_model'], + kwargs['attn_dropout'], kwargs['group_size'], + kwargs['query_key_dim'], kwargs['expansion_factor'], + kwargs['causal']), + norm=kwargs['norm'], + num_spks=kwargs['num_spks']) + self.num_spks = kwargs['num_spks'] + + def forward(self, inputs: Tensor) -> Dict[str, Any]: + # Separation + mix_w = self.encoder(inputs) + est_mask = self.mask_net(mix_w) + mix_w = torch.stack([mix_w] * self.num_spks) + sep_h = mix_w * est_mask + # Decoding + est_source = torch.cat( + [ + self.decoder(sep_h[i]).unsqueeze(-1) + for i in range(self.num_spks) + ], + dim=-1, + ) + # T changed after conv1d in encoder, fix it here + t_origin = inputs.size(1) + t_est = est_source.size(1) + if t_origin > t_est: + est_source = F.pad(est_source, (0, 0, 0, t_origin - t_est)) + else: + est_source = est_source[:, :t_origin, :] + return est_source + + def load_check_point(self, load_path=None, device=None): + if not load_path: + load_path = self.model_dir + if not device: + device = torch.device('cpu') + self.encoder.load_state_dict( + torch.load( + os.path.join(load_path, 'encoder.bin'), map_location=device), + strict=True) + self.decoder.load_state_dict( + torch.load( + os.path.join(load_path, 'decoder.bin'), map_location=device), + strict=True) + self.mask_net.load_state_dict( + torch.load( + os.path.join(load_path, 'masknet.bin'), map_location=device), + strict=True) + + def as_dict(self): + return dict( + encoder=self.encoder, decoder=self.decoder, masknet=self.mask_net) + + +def select_norm(norm, dim, shape): + """Just a wrapper to select the normalization type. + """ + + if norm == 'gln': + return GlobalLayerNorm(dim, shape, elementwise_affine=True) + if norm == 'cln': + return CumulativeLayerNorm(dim, elementwise_affine=True) + if norm == 'ln': + return nn.GroupNorm(1, dim, eps=1e-8) + else: + return nn.BatchNorm1d(dim) + + +class Encoder(nn.Module): + """Convolutional Encoder Layer. + + Args: + kernel_size: Length of filters. + in_channels: Number of input channels. + out_channels: Number of output channels. 
+ + Example: + ------- + >>> x = torch.randn(2, 1000) + >>> encoder = Encoder(kernel_size=4, out_channels=64) + >>> h = encoder(x) + >>> h.shape + torch.Size([2, 64, 499]) + """ + + def __init__(self, + kernel_size: int = 2, + out_channels: int = 64, + in_channels: int = 1): + super(Encoder, self).__init__() + self.conv1d = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=kernel_size // 2, + groups=1, + bias=False, + ) + self.in_channels = in_channels + + def forward(self, x: torch.Tensor): + """Return the encoded output. + + Args: + x: Input tensor with dimensionality [B, L]. + + Returns: + Encoded tensor with dimensionality [B, N, T_out]. + where B = Batchsize + L = Number of timepoints + N = Number of filters + T_out = Number of timepoints at the output of the encoder + """ + # B x L -> B x 1 x L + if self.in_channels == 1: + x = torch.unsqueeze(x, dim=1) + # B x 1 x L -> B x N x T_out + x = self.conv1d(x) + x = F.relu(x) + + return x + + +class Decoder(nn.ConvTranspose1d): + """A decoder layer that consists of ConvTranspose1d. + + Args: + kernel_size: Length of filters. + in_channels: Number of input channels. + out_channels: Number of output channels. + + Example + --------- + >>> x = torch.randn(2, 100, 1000) + >>> decoder = Decoder(kernel_size=4, in_channels=100, out_channels=1) + >>> h = decoder(x) + >>> h.shape + torch.Size([2, 1003]) + """ + + def __init__(self, *args, **kwargs): + super(Decoder, self).__init__(*args, **kwargs) + + def forward(self, x): + """Return the decoded output. + + Args: + x: Input tensor with dimensionality [B, N, L]. + where, B = Batchsize, + N = number of filters + L = time points + """ + + if x.dim() not in [2, 3]: + raise RuntimeError('{} accept 3/4D tensor as input'.format( + self.__name__)) + x = super().forward(x if x.dim() == 3 else torch.unsqueeze(x, 1)) + + if torch.squeeze(x).dim() == 1: + x = torch.squeeze(x, dim=1) + else: + x = torch.squeeze(x) + return x + + +class IdentityBlock: + """This block is used when we want to have identity transformation within the Dual_path block. + + Example + ------- + >>> x = torch.randn(10, 100) + >>> IB = IdentityBlock() + >>> xhat = IB(x) + """ + + def _init__(self, **kwargs): + pass + + def __call__(self, x): + return x + + +class MossFormerM(nn.Module): + """This class implements the transformer encoder. + + Args: + num_blocks : int + Number of mossformer blocks to include. + d_model : int + The dimension of the input embedding. + attn_dropout : float + Dropout for the self-attention (Optional). 
+ group_size: int + the chunk size + query_key_dim: int + the attention vector dimension + expansion_factor: int + the expansion factor for the linear projection in conv module + causal: bool + true for causal / false for non causal + + Example + ------- + >>> import torch + >>> x = torch.rand((8, 60, 512)) #B, S, N + >>> net = MossFormerM(num_blocks=8, d_model=512) + >>> output, _ = net(x) + >>> output.shape + torch.Size([8, 60, 512]) + """ + + def __init__(self, + num_blocks, + d_model=None, + attn_dropout=0.1, + group_size=256, + query_key_dim=128, + expansion_factor=4., + causal=False): + super().__init__() + + self.mossformerM = MossFormerModule( + dim=d_model, + depth=num_blocks, + group_size=group_size, + query_key_dim=query_key_dim, + expansion_factor=expansion_factor, + causal=causal, + attn_dropout=attn_dropout) + import speechbrain as sb + self.norm = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6) + + def forward(self, src: torch.Tensor): + """ + Args: + src: Tensor shape [B, S, N], + where, B = Batchsize, + S = time points + N = number of filters + The sequence to the encoder layer (required). + """ + output = self.mossformerM(src) + output = self.norm(output) + + return output + + +class ComputeAttention(nn.Module): + """Computation block for dual-path processing. + + Args: + att_mdl : torch.nn.module + Model to process within the chunks. + out_channels : int + Dimensionality of attention model. + norm : str + Normalization type. + skip_connection : bool + Skip connection around the attention module. + + Example + --------- + >>> att_block = MossFormerM(num_blocks=8, d_model=512) + >>> comp_att = ComputeAttention(att_block, 512) + >>> x = torch.randn(10, 64, 512) + >>> x = comp_att(x) + >>> x.shape + torch.Size([10, 64, 512]) + """ + + def __init__( + self, + att_mdl, + out_channels, + norm='ln', + skip_connection=True, + ): + super(ComputeAttention, self).__init__() + + self.att_mdl = att_mdl + self.skip_connection = skip_connection + + # Norm + self.norm = norm + if norm is not None: + self.att_norm = select_norm(norm, out_channels, 3) + + def forward(self, x: torch.Tensor): + """Returns the output tensor. + + Args: + x: Input tensor of dimension [B, S, N]. + + Returns: + out: Output tensor of dimension [B, S, N]. + where, B = Batchsize, + N = number of filters + S = time points + """ + # [B, S, N] + att_out = x.permute(0, 2, 1).contiguous() + + att_out = self.att_mdl(att_out) + + # [B, N, S] + att_out = att_out.permute(0, 2, 1).contiguous() + if self.norm is not None: + att_out = self.att_norm(att_out) + + # [B, N, S] + if self.skip_connection: + att_out = att_out + x + + out = att_out + return out + + +class MossFormerMaskNet(nn.Module): + """The dual path model which is the basis for dualpathrnn, sepformer, dptnet. + + Args: + in_channels : int + Number of channels at the output of the encoder. + out_channels : int + Number of channels that would be inputted to the intra and inter blocks. + att_model : torch.nn.module + Attention model to process the input sequence. + norm : str + Normalization type. + num_spks : int + Number of sources (speakers). + skip_connection : bool + Skip connection around attention module. + use_global_pos_enc : bool + Global positional encodings. 
+ + Example + --------- + >>> mossformer_block = MossFormerM(num_blocks=8, d_model=512) + >>> mossformer_masknet = MossFormerMaskNet(64, 64, att_model, num_spks=2) + >>> x = torch.randn(10, 64, 2000) + >>> x = mossformer_masknet(x) + >>> x.shape + torch.Size([2, 10, 64, 2000]) + """ + + def __init__( + self, + in_channels, + out_channels, + att_model, + norm='ln', + num_spks=2, + skip_connection=True, + use_global_pos_enc=True, + ): + super(MossFormerMaskNet, self).__init__() + self.num_spks = num_spks + self.norm = select_norm(norm, in_channels, 3) + self.conv1d_encoder = nn.Conv1d( + in_channels, out_channels, 1, bias=False) + self.use_global_pos_enc = use_global_pos_enc + + if self.use_global_pos_enc: + self.pos_enc = ScaledSinuEmbedding(out_channels) + + self.mdl = copy.deepcopy( + ComputeAttention( + att_model, + out_channels, + norm, + skip_connection=skip_connection, + )) + + self.conv1d_out = nn.Conv1d( + out_channels, out_channels * num_spks, kernel_size=1) + self.conv1_decoder = nn.Conv1d( + out_channels, in_channels, 1, bias=False) + self.prelu = nn.PReLU() + self.activation = nn.ReLU() + # gated output layer + self.output = nn.Sequential( + nn.Conv1d(out_channels, out_channels, 1), nn.Tanh()) + self.output_gate = nn.Sequential( + nn.Conv1d(out_channels, out_channels, 1), nn.Sigmoid()) + + def forward(self, x: torch.Tensor): + """Returns the output tensor. + + Args: + x: Input tensor of dimension [B, N, S]. + + Returns: + out: Output tensor of dimension [spks, B, N, S] + where, spks = Number of speakers + B = Batchsize, + N = number of filters + S = the number of time frames + """ + + # before each line we indicate the shape after executing the line + # [B, N, L] + x = self.norm(x) + # [B, N, L] + x = self.conv1d_encoder(x) + if self.use_global_pos_enc: + base = x + x = x.transpose(1, -1) + emb = self.pos_enc(x) + emb = emb.transpose(0, -1) + x = base + emb + # [B, N, S] + x = self.mdl(x) + x = self.prelu(x) + # [B, N*spks, S] + x = self.conv1d_out(x) + b, _, s = x.shape + # [B*spks, N, S] + x = x.view(b * self.num_spks, -1, s) + # [B*spks, N, S] + x = self.output(x) * self.output_gate(x) + # [B*spks, N, S] + x = self.conv1_decoder(x) + # [B, spks, N, S] + _, n, L = x.shape + x = x.view(b, self.num_spks, n, L) + x = self.activation(x) + # [spks, B, N, S] + x = x.transpose(0, 1) + return x diff --git a/modelscope/models/audio/separation/mossformer_block.py b/modelscope/models/audio/separation/mossformer_block.py new file mode 100644 index 00000000..1db8d010 --- /dev/null +++ b/modelscope/models/audio/separation/mossformer_block.py @@ -0,0 +1,265 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn.functional as F +from torch import einsum, nn + +from modelscope.models.audio.separation.mossformer_conv_module import \ + MossFormerConvModule + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def padding_to_multiple_of(n, mult): + remainder = n % mult + if remainder == 0: + return 0 + return mult - remainder + + +class ScaleNorm(nn.Module): + + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(1)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class ScaledSinuEmbedding(nn.Module): + + def __init__(self, dim): + super().__init__() + self.scale = nn.Parameter(torch.ones(1, )) + inv_freq = 1. 
/ (10000**(torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, x): + n, device = x.shape[1], x.device + t = torch.arange(n, device=device).type_as(self.inv_freq) + sinu = einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((sinu.sin(), sinu.cos()), dim=-1) + return emb * self.scale + + +class OffsetScale(nn.Module): + + def __init__(self, dim, heads=1): + super().__init__() + self.gamma = nn.Parameter(torch.ones(heads, dim)) + self.beta = nn.Parameter(torch.zeros(heads, dim)) + nn.init.normal_(self.gamma, std=0.02) + + def forward(self, x): + out = einsum('... d, h d -> ... h d', x, self.gamma) + self.beta + return out.unbind(dim=-2) + + +class FFConvM(nn.Module): + + def __init__(self, dim_in, dim_out, norm_klass=nn.LayerNorm, dropout=0.1): + super().__init__() + self.mdl = nn.Sequential( + norm_klass(dim_in), nn.Linear(dim_in, dim_out), nn.SiLU(), + MossFormerConvModule(dim_out), nn.Dropout(dropout)) + + def forward(self, x): + output = self.mdl(x) + return output + + +class MossFormerBlock(nn.Module): + + def __init__(self, + dim, + group_size=256, + query_key_dim=128, + expansion_factor=1., + causal=False, + dropout=0.1, + rotary_pos_emb=None, + norm_klass=nn.LayerNorm, + shift_tokens=True): + super().__init__() + hidden_dim = int(dim * expansion_factor) + self.group_size = group_size + self.causal = causal + self.shift_tokens = shift_tokens + # positional embeddings + self.rotary_pos_emb = rotary_pos_emb + # norm + self.dropout = nn.Dropout(dropout) + # projections + self.to_hidden = FFConvM( + dim_in=dim, + dim_out=hidden_dim, + norm_klass=norm_klass, + dropout=dropout, + ) + self.to_qk = FFConvM( + dim_in=dim, + dim_out=query_key_dim, + norm_klass=norm_klass, + dropout=dropout, + ) + self.qk_offset_scale = OffsetScale(query_key_dim, heads=4) + self.to_out = FFConvM( + dim_in=dim * 2, + dim_out=dim, + norm_klass=norm_klass, + dropout=dropout, + ) + self.gateActivate = nn.Sigmoid() + + def forward(self, x): + # prenorm + normed_x = x + # do token shift - a great, costless trick from an independent AI researcher in Shenzhen + if self.shift_tokens: + x_shift, x_pass = normed_x.chunk(2, dim=-1) + x_shift = F.pad(x_shift, (0, 0, 1, -1), value=0.) + normed_x = torch.cat((x_shift, x_pass), dim=-1) + + # initial projections + v, u = self.to_hidden(normed_x).chunk(2, dim=-1) + qk = self.to_qk(normed_x) + # offset and scale + quad_q, lin_q, quad_k, lin_k = self.qk_offset_scale(qk) + att_v, att_u = self.cal_attention(x, quad_q, lin_q, quad_k, lin_k, v, + u) + + # projection out and residual + out = (att_u * v) * self.gateActivate(att_v * u) + x = x + self.to_out(out) + return x + + def cal_attention(self, x, quad_q, lin_q, quad_k, lin_k, v, u, mask=None): + b, n, device, g = x.shape[0], x.shape[-2], x.device, self.group_size + + from einops import rearrange + if exists(mask): + lin_mask = rearrange(mask, '... -> ... 1') + lin_k = lin_k.masked_fill(~lin_mask, 0.) 
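        # What follows pads the sequence length to a multiple of group_size, rearranges
        # [b, n, d] into per-chunk tensors [b, g, n, d], and then combines a quadratic
        # ReLU-squared attention computed inside each chunk with a linear-attention term
        # built from per-chunk key/value summaries, which carries information across
        # chunks (cumulative summaries in the causal case, one global summary otherwise).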
+ + # rotate queries and keys + if exists(self.rotary_pos_emb): + quad_q, lin_q, quad_k, lin_k = map( + self.rotary_pos_emb.rotate_queries_or_keys, + (quad_q, lin_q, quad_k, lin_k)) + + # padding for groups + padding = padding_to_multiple_of(n, g) + if padding > 0: + quad_q, quad_k, lin_q, lin_k, v, u = map( + lambda t: F.pad(t, (0, 0, 0, padding), value=0.), + (quad_q, quad_k, lin_q, lin_k, v, u)) + mask = default(mask, + torch.ones((b, n), device=device, dtype=torch.bool)) + mask = F.pad(mask, (0, padding), value=False) + + # group along sequence + quad_q, quad_k, lin_q, lin_k, v, u = map( + lambda t: rearrange(t, 'b (g n) d -> b g n d', n=self.group_size), + (quad_q, quad_k, lin_q, lin_k, v, u)) + + if exists(mask): + mask = rearrange(mask, 'b (g j) -> b g 1 j', j=g) + + # calculate quadratic attention output + sim = einsum('... i d, ... j d -> ... i j', quad_q, quad_k) / g + attn = F.relu(sim)**2 + attn = self.dropout(attn) + if exists(mask): + attn = attn.masked_fill(~mask, 0.) + if self.causal: + causal_mask = torch.ones((g, g), dtype=torch.bool, + device=device).triu(1) + attn = attn.masked_fill(causal_mask, 0.) + + quad_out_v = einsum('... i j, ... j d -> ... i d', attn, v) + quad_out_u = einsum('... i j, ... j d -> ... i d', attn, u) + + # calculate linear attention output + if self.causal: + lin_kv = einsum('b g n d, b g n e -> b g d e', lin_k, v) / g + # exclusive cumulative sum along group dimension + lin_kv = lin_kv.cumsum(dim=1) + lin_kv = F.pad(lin_kv, (0, 0, 0, 0, 1, -1), value=0.) + lin_out_v = einsum('b g d e, b g n d -> b g n e', lin_kv, lin_q) + + lin_ku = einsum('b g n d, b g n e -> b g d e', lin_k, u) / g + # exclusive cumulative sum along group dimension + lin_ku = lin_ku.cumsum(dim=1) + lin_ku = F.pad(lin_ku, (0, 0, 0, 0, 1, -1), value=0.) 
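        # In the causal branch, lin_kv / lin_ku hold per-chunk key-value summaries; the
        # cumulative sum aggregates them over chunks, and F.pad(..., (0, 0, 0, 0, 1, -1))
        # shifts the result by one chunk (an exclusive prefix sum), so each chunk only
        # queries summaries of strictly earlier chunks and the linear term stays causal.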
+ lin_out_u = einsum('b g d e, b g n d -> b g n e', lin_ku, lin_q) + else: + lin_kv = einsum('b g n d, b g n e -> b d e', lin_k, v) / n + lin_out_v = einsum('b g n d, b d e -> b g n e', lin_q, lin_kv) + + lin_ku = einsum('b g n d, b g n e -> b d e', lin_k, u) / n + lin_out_u = einsum('b g n d, b d e -> b g n e', lin_q, lin_ku) + + # fold back groups into full sequence, and excise out padding + quad_attn_out_v, lin_attn_out_v = map( + lambda t: rearrange(t, 'b g n d -> b (g n) d')[:, :n], + (quad_out_v, lin_out_v)) + quad_attn_out_u, lin_attn_out_u = map( + lambda t: rearrange(t, 'b g n d -> b (g n) d')[:, :n], + (quad_out_u, lin_out_u)) + + # gate + return quad_attn_out_v + lin_attn_out_v, quad_attn_out_u + lin_attn_out_u + + +class MossFormerModule(nn.Module): + + def __init__(self, + dim, + depth, + group_size=256, + query_key_dim=128, + expansion_factor=4., + causal=False, + attn_dropout=0.1, + norm_type='scalenorm', + shift_tokens=True): + super().__init__() + assert norm_type in ( + 'scalenorm', + 'layernorm'), 'norm_type must be one of scalenorm or layernorm' + + if norm_type == 'scalenorm': + norm_klass = ScaleNorm + elif norm_type == 'layernorm': + norm_klass = nn.LayerNorm + + from rotary_embedding_torch import RotaryEmbedding + rotary_pos_emb = RotaryEmbedding(dim=min(32, query_key_dim)) + # max rotary embedding dimensions of 32, partial Rotary embeddings, from Wang et al - GPT-J + self.layers = nn.ModuleList([ + MossFormerBlock( + dim=dim, + group_size=group_size, + query_key_dim=query_key_dim, + expansion_factor=expansion_factor, + causal=causal, + dropout=attn_dropout, + rotary_pos_emb=rotary_pos_emb, + norm_klass=norm_klass, + shift_tokens=shift_tokens) for _ in range(depth) + ]) + + def forward(self, x): + for mossformer_layer in self.layers: + x = mossformer_layer(x) + return x diff --git a/modelscope/models/audio/separation/mossformer_conv_module.py b/modelscope/models/audio/separation/mossformer_conv_module.py new file mode 100644 index 00000000..283269b3 --- /dev/null +++ b/modelscope/models/audio/separation/mossformer_conv_module.py @@ -0,0 +1,272 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn +import torch.nn.init as init +from torch import Tensor + +EPS = 1e-8 + + +class GlobalLayerNorm(nn.Module): + """Calculate Global Layer Normalization. + + Args: + dim : (int or list or torch.Size) + Input shape from an expected input of size. + eps : float + A value added to the denominator for numerical stability. + elementwise_affine : bool + A boolean value that when set to True, + this module has learnable per-element affine parameters + initialized to ones (for weights) and zeros (for biases). + + Example + ------- + >>> x = torch.randn(5, 10, 20) + >>> GLN = GlobalLayerNorm(10, 3) + >>> x_norm = GLN(x) + """ + + def __init__(self, dim, shape, eps=1e-8, elementwise_affine=True): + super(GlobalLayerNorm, self).__init__() + self.dim = dim + self.eps = eps + self.elementwise_affine = elementwise_affine + + if self.elementwise_affine: + if shape == 3: + self.weight = nn.Parameter(torch.ones(self.dim, 1)) + self.bias = nn.Parameter(torch.zeros(self.dim, 1)) + if shape == 4: + self.weight = nn.Parameter(torch.ones(self.dim, 1, 1)) + self.bias = nn.Parameter(torch.zeros(self.dim, 1, 1)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + + def forward(self, x): + """Returns the normalized tensor. + + Args: + x: Tensor of size [N, C, K, S] or [N, C, L]. 
+ """ + # N x 1 x 1 + # cln: mean,var N x 1 x K x S + # gln: mean,var N x 1 x 1 + if x.dim() == 3: + mean = torch.mean(x, (1, 2), keepdim=True) + var = torch.mean((x - mean)**2, (1, 2), keepdim=True) + if self.elementwise_affine: + x = (self.weight * (x - mean) / torch.sqrt(var + self.eps) + + self.bias) # yapf: disable + else: + x = (x - mean) / torch.sqrt(var + self.eps) + + if x.dim() == 4: + mean = torch.mean(x, (1, 2, 3), keepdim=True) + var = torch.mean((x - mean)**2, (1, 2, 3), keepdim=True) + if self.elementwise_affine: + x = (self.weight * (x - mean) / torch.sqrt(var + self.eps) + + self.bias) # yapf: disable + else: + x = (x - mean) / torch.sqrt(var + self.eps) + return x + + +class CumulativeLayerNorm(nn.LayerNorm): + """Calculate Cumulative Layer Normalization. + + Args: + dim: Dimension that you want to normalize. + elementwise_affine: Learnable per-element affine parameters. + + Example + ------- + >>> x = torch.randn(5, 10, 20) + >>> CLN = CumulativeLayerNorm(10) + >>> x_norm = CLN(x) + """ + + def __init__(self, dim, elementwise_affine=True): + super(CumulativeLayerNorm, self).__init__( + dim, elementwise_affine=elementwise_affine, eps=1e-8) + + def forward(self, x): + """Returns the normalized tensor. + + Args: + x: Tensor size [N, C, K, S] or [N, C, L] + """ + # N x K x S x C + if x.dim() == 4: + x = x.permute(0, 2, 3, 1).contiguous() + # N x K x S x C == only channel norm + x = super().forward(x) + # N x C x K x S + x = x.permute(0, 3, 1, 2).contiguous() + if x.dim() == 3: + x = torch.transpose(x, 1, 2) + # N x L x C == only channel norm + x = super().forward(x) + # N x C x L + x = torch.transpose(x, 1, 2) + return x + + +def select_norm(norm, dim, shape): + """Just a wrapper to select the normalization type. + """ + + if norm == 'gln': + return GlobalLayerNorm(dim, shape, elementwise_affine=True) + if norm == 'cln': + return CumulativeLayerNorm(dim, elementwise_affine=True) + if norm == 'ln': + return nn.GroupNorm(1, dim, eps=1e-8) + else: + return nn.BatchNorm1d(dim) + + +class Swish(nn.Module): + """ + Swish is a smooth, non-monotonic function that consistently matches or outperforms ReLU on deep networks applied + to a variety of challenging domains such as Image classification and Machine translation. + """ + + def __init__(self): + super(Swish, self).__init__() + + def forward(self, inputs: Tensor) -> Tensor: + return inputs * inputs.sigmoid() + + +class GLU(nn.Module): + """ + The gating mechanism is called Gated Linear Units (GLU), which was first introduced for natural language processing + in the paper “Language Modeling with Gated Convolutional Networks” + """ + + def __init__(self, dim: int) -> None: + super(GLU, self).__init__() + self.dim = dim + + def forward(self, inputs: Tensor) -> Tensor: + outputs, gate = inputs.chunk(2, dim=self.dim) + return outputs * gate.sigmoid() + + +class Transpose(nn.Module): + """ Wrapper class of torch.transpose() for Sequential module. """ + + def __init__(self, shape: tuple): + super(Transpose, self).__init__() + self.shape = shape + + def forward(self, x: Tensor) -> Tensor: + return x.transpose(*self.shape) + + +class Linear(nn.Module): + """ + Wrapper class of torch.nn.Linear + Weight initialize by xavier initialization and bias initialize to zeros. 
+ """ + + def __init__(self, + in_features: int, + out_features: int, + bias: bool = True) -> None: + super(Linear, self).__init__() + self.linear = nn.Linear(in_features, out_features, bias=bias) + init.xavier_uniform_(self.linear.weight) + if bias: + init.zeros_(self.linear.bias) + + def forward(self, x: Tensor) -> Tensor: + return self.linear(x) + + +class DepthwiseConv1d(nn.Module): + """ + When groups == in_channels and out_channels == K * in_channels, where K is a positive integer, + this operation is termed in literature as depthwise convolution. + Args: + in_channels (int): Number of channels in the input + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 + bias (bool, optional): If True, adds a learnable bias to the output. Default: True + Inputs: inputs + - **inputs** (batch, in_channels, time): Tensor containing input vector + Returns: outputs + - **outputs** (batch, out_channels, time): Tensor produces by depthwise 1-D convolution. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + bias: bool = False, + ) -> None: + super(DepthwiseConv1d, self).__init__() + assert out_channels % in_channels == 0, 'out_channels should be constant multiple of in_channels' + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=in_channels, + stride=stride, + padding=padding, + bias=bias, + ) + + def forward(self, inputs: Tensor) -> Tensor: + return self.conv(inputs) + + +class MossFormerConvModule(nn.Module): + """Conformer convolution module starts with a pointwise convolution and a gated linear unit (GLU). + + This is followed by a single 1-D depthwise convolution layer. Batchnorm is deployed just after the convolution + to aid training deep models. + + Args: + in_channels (int): Number of channels in the input + kernel_size (int or tuple, optional): Size of the convolving kernel Default: 17 + dropout_p (float, optional): probability of dropout + """ + + def __init__(self, + in_channels: int, + kernel_size: int = 17, + expansion_factor: int = 2) -> None: + super(MossFormerConvModule, self).__init__() + assert ( + kernel_size - 1 + ) % 2 == 0, "kernel_size should be a odd number for 'SAME' padding" + assert expansion_factor == 2, 'Currently, Only Supports expansion_factor 2' + + self.sequential = nn.Sequential( + Transpose(shape=(1, 2)), + DepthwiseConv1d( + in_channels, + in_channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2), + ) + + def forward(self, inputs: Tensor) -> Tensor: + """ + Args: + inputs (batch, time, dim): Tensor contains input sequences + + Returns: + outputs (batch, time, dim): Tensor produces by conformer convolution module. + """ + return inputs + self.sequential(inputs).transpose(1, 2) diff --git a/modelscope/models/audio/sv/__init__.py b/modelscope/models/audio/sv/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/audio/sv/generic_speaker_verification.py b/modelscope/models/audio/sv/generic_speaker_verification.py new file mode 100644 index 00000000..686ec93b --- /dev/null +++ b/modelscope/models/audio/sv/generic_speaker_verification.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os +from typing import Any, Dict + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Frameworks, Tasks + + +@MODELS.register_module( + Tasks.speaker_verification, module_name=Models.generic_sv) +class SpeakerVerification(Model): + + def __init__(self, model_dir: str, sv_model_name: str, + model_config: Dict[str, Any], *args, **kwargs): + """initialize the info of model. + + Args: + model_dir (str): the model path. + punc_model_name (str): the itn model name from configuration.json + punc_model_config (Dict[str, Any]): the detail config about model from configuration.json + """ + super().__init__(model_dir, sv_model_name, model_config, *args, + **kwargs) + self.model_cfg = { + # the recognition model dir path + 'model_workspace': model_dir, + # the itn model name + 'sv_model': sv_model_name, + # the am model file path + 'sv_model_path': os.path.join(model_dir, sv_model_name), + # the recognition model config dict + 'model_config': model_config + } + + def forward(self) -> Dict[str, Any]: + """ + just return the model config + + """ + + return self.model_cfg diff --git a/modelscope/models/audio/tts/kantts/datasets/dataset.py b/modelscope/models/audio/tts/kantts/datasets/dataset.py index 83afc050..d5dd4da7 100644 --- a/modelscope/models/audio/tts/kantts/datasets/dataset.py +++ b/modelscope/models/audio/tts/kantts/datasets/dataset.py @@ -18,7 +18,7 @@ from modelscope.models.audio.tts.kantts.utils.ling_unit.ling_unit import ( from modelscope.utils.logger import get_logger DATASET_RANDOM_SEED = 1234 - +torch.multiprocessing.set_sharing_strategy('file_system') logging = get_logger() @@ -249,9 +249,27 @@ class VocDataset(KanttsDataset): mel_data = np.concatenate((mel_data, frame_f0_data, frame_uv_data), axis=1) - # make sure the audio length and feature length are matched - wav_data = np.pad(wav_data, (0, self.n_fft), mode='reflect') - wav_data = wav_data[:len(mel_data) * self.hop_length] + # make sure mel_data length greater than batch_max_frames at least 1 frame + if mel_data.shape[0] <= self.batch_max_frames: + mel_data = np.concatenate( + ( + mel_data, + np.zeros(( + self.batch_max_frames - mel_data.shape[0] + 1, + mel_data.shape[1], + )), + ), + axis=0, + ) + wav_cache = np.zeros( + mel_data.shape[0] * self.hop_length, dtype=np.float32) + wav_cache[:len(wav_data)] = wav_data + wav_data = wav_cache + else: + # make sure the audio length and feature length are matched + wav_data = np.pad(wav_data, (0, self.n_fft), mode='reflect') + wav_data = wav_data[:len(mel_data) * self.hop_length] + assert len(mel_data) * self.hop_length == len(wav_data) if self.allow_cache: @@ -561,11 +579,12 @@ class AmDataset(KanttsDataset): os.path.join(frame_f0_dir, index + '.npy')) or not os.path.exists( os.path.join(frame_uv_dir, index + '.npy')) - or not os.path.exists( - os.path.join(duration_dir, index + '.npy')) or not os.path.exists( os.path.join(mel_dir, index + '.npy'))): continue + if os.path.exists(duration_dir) and not os.path.exists( + os.path.join(duration_dir, index + '.npy')): + continue f.write(line) with open(valid_meta_file, 'w') as f: @@ -577,62 +596,86 @@ class AmDataset(KanttsDataset): os.path.join(frame_f0_dir, index + '.npy')) or not os.path.exists( os.path.join(frame_uv_dir, index + '.npy')) - or not os.path.exists( - os.path.join(duration_dir, index + '.npy')) or not os.path.exists( os.path.join(mel_dir, index + '.npy'))): continue + if os.path.exists(duration_dir) and 
not os.path.exists( + os.path.join(duration_dir, index + '.npy')): + continue f.write(line) def collate_fn(self, batch): data_dict = {} max_input_length = max((len(x[0][0]) for x in batch)) - max_dur_length = max((x[2].shape[0] for x in batch)) + 1 + if self.with_duration: + max_dur_length = max((x[2].shape[0] for x in batch)) + 1 - # pure linguistic info: sy|tone|syllable_flag|word_segment - lfeat_type = self.ling_unit._lfeat_type_list[0] - inputs_sy = self.padder._prepare_scalar_inputs( - [x[0][0] for x in batch], - max_input_length, - self.ling_unit._sub_unit_pad[lfeat_type], - ).long() - # tone - lfeat_type = self.ling_unit._lfeat_type_list[1] - inputs_tone = self.padder._prepare_scalar_inputs( - [x[0][1] for x in batch], - max_input_length, - self.ling_unit._sub_unit_pad[lfeat_type], - ).long() + lfeat_type_index = 0 + lfeat_type = self.ling_unit._lfeat_type_list[lfeat_type_index] + if self.ling_unit.using_byte(): + # for byte-based model only + inputs_byte_index = self.padder._prepare_scalar_inputs( + [x[0][lfeat_type_index] for x in batch], + max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type], + ).long() - # syllable_flag - lfeat_type = self.ling_unit._lfeat_type_list[2] - inputs_syllable_flag = self.padder._prepare_scalar_inputs( - [x[0][2] for x in batch], - max_input_length, - self.ling_unit._sub_unit_pad[lfeat_type], - ).long() + data_dict['input_lings'] = torch.stack([inputs_byte_index], dim=2) + else: + # pure linguistic info: sy|tone|syllable_flag|word_segment + # sy + inputs_sy = self.padder._prepare_scalar_inputs( + [x[0][lfeat_type_index] for x in batch], + max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type], + ).long() - # word_segment - lfeat_type = self.ling_unit._lfeat_type_list[3] - inputs_ws = self.padder._prepare_scalar_inputs( - [x[0][3] for x in batch], - max_input_length, - self.ling_unit._sub_unit_pad[lfeat_type], - ).long() + # tone + lfeat_type_index = lfeat_type_index + 1 + lfeat_type = self.ling_unit._lfeat_type_list[lfeat_type_index] + inputs_tone = self.padder._prepare_scalar_inputs( + [x[0][lfeat_type_index] for x in batch], + max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type], + ).long() + + # syllable_flag + lfeat_type_index = lfeat_type_index + 1 + lfeat_type = self.ling_unit._lfeat_type_list[lfeat_type_index] + inputs_syllable_flag = self.padder._prepare_scalar_inputs( + [x[0][lfeat_type_index] for x in batch], + max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type], + ).long() + + # word_segment + lfeat_type_index = lfeat_type_index + 1 + lfeat_type = self.ling_unit._lfeat_type_list[lfeat_type_index] + inputs_ws = self.padder._prepare_scalar_inputs( + [x[0][lfeat_type_index] for x in batch], + max_input_length, + self.ling_unit._sub_unit_pad[lfeat_type], + ).long() + + data_dict['input_lings'] = torch.stack( + [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], + dim=2) # emotion category - lfeat_type = self.ling_unit._lfeat_type_list[4] + lfeat_type_index = lfeat_type_index + 1 + lfeat_type = self.ling_unit._lfeat_type_list[lfeat_type_index] data_dict['input_emotions'] = self.padder._prepare_scalar_inputs( - [x[0][4] for x in batch], + [x[0][lfeat_type_index] for x in batch], max_input_length, self.ling_unit._sub_unit_pad[lfeat_type], ).long() # speaker category - lfeat_type = self.ling_unit._lfeat_type_list[5] + lfeat_type_index = lfeat_type_index + 1 + lfeat_type = self.ling_unit._lfeat_type_list[lfeat_type_index] data_dict['input_speakers'] = self.padder._prepare_scalar_inputs( - [x[0][5] for x in 
batch], + [x[0][lfeat_type_index] for x in batch], max_input_length, self.ling_unit._sub_unit_pad[lfeat_type], ).long() @@ -645,8 +688,6 @@ class AmDataset(KanttsDataset): 0, ).long() - data_dict['input_lings'] = torch.stack( - [inputs_sy, inputs_tone, inputs_syllable_flag, inputs_ws], dim=2) data_dict['valid_input_lengths'] = torch.as_tensor( [len(x[0][0]) - 1 for x in batch], dtype=torch.long ) # 输入的symbol sequence会在后面拼一个“~”,影响duration计算,所以把length-1 diff --git a/modelscope/models/audio/tts/kantts/models/hifigan/hifigan.py b/modelscope/models/audio/tts/kantts/models/hifigan/hifigan.py index 9b3ad788..c21e6714 100644 --- a/modelscope/models/audio/tts/kantts/models/hifigan/hifigan.py +++ b/modelscope/models/audio/tts/kantts/models/hifigan/hifigan.py @@ -163,7 +163,7 @@ class Generator(torch.nn.Module): else: # transconv up = self.transpose_upsamples[i](x) - x = rep + up + x = rep + up[:, :, :rep.shape[-1]] xs = None for j in range(self.num_kernels): diff --git a/modelscope/models/audio/tts/kantts/models/sambert/kantts_sambert.py b/modelscope/models/audio/tts/kantts/models/sambert/kantts_sambert.py index f654aa58..bf17d12f 100644 --- a/modelscope/models/audio/tts/kantts/models/sambert/kantts_sambert.py +++ b/modelscope/models/audio/tts/kantts/models/sambert/kantts_sambert.py @@ -253,15 +253,25 @@ class TextFftEncoder(nn.Module): def __init__(self, config): super(TextFftEncoder, self).__init__() - # linguistic unit lookup table - nb_ling_sy = config['sy'] - nb_ling_tone = config['tone'] - nb_ling_syllable_flag = config['syllable_flag'] - nb_ling_ws = config['word_segment'] + d_emb = config['embedding_dim'] + self.using_byte = False + if config.get('using_byte', False): + self.using_byte = True + nb_ling_byte_index = config['byte_index'] + self.byte_index_emb = nn.Embedding(nb_ling_byte_index, d_emb) + else: + # linguistic unit lookup table + nb_ling_sy = config['sy'] + nb_ling_tone = config['tone'] + nb_ling_syllable_flag = config['syllable_flag'] + nb_ling_ws = config['word_segment'] + self.sy_emb = nn.Embedding(nb_ling_sy, d_emb) + self.tone_emb = nn.Embedding(nb_ling_tone, d_emb) + self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb) + self.ws_emb = nn.Embedding(nb_ling_ws, d_emb) max_len = config['max_len'] - d_emb = config['embedding_dim'] nb_layers = config['encoder_num_layers'] nb_heads = config['encoder_num_heads'] d_model = config['encoder_num_units'] @@ -274,11 +284,6 @@ class TextFftEncoder(nn.Module): self.d_model = d_model - self.sy_emb = nn.Embedding(nb_ling_sy, d_emb) - self.tone_emb = nn.Embedding(nb_ling_tone, d_emb) - self.syllable_flag_emb = nn.Embedding(nb_ling_syllable_flag, d_emb) - self.ws_emb = nn.Embedding(nb_ling_ws, d_emb) - position_enc = SinusoidalPositionEncoder(max_len, d_emb) self.ling_enc = SelfAttentionEncoder( @@ -298,20 +303,26 @@ class TextFftEncoder(nn.Module): def forward(self, inputs_ling, masks=None, return_attns=False): # Parse inputs_ling_seq - inputs_sy = inputs_ling[:, :, 0] - inputs_tone = inputs_ling[:, :, 1] - inputs_syllable_flag = inputs_ling[:, :, 2] - inputs_ws = inputs_ling[:, :, 3] + if self.using_byte: + inputs_byte_index = inputs_ling[:, :, 0] + byte_index_embedding = self.byte_index_emb(inputs_byte_index) + ling_embedding = byte_index_embedding + else: + inputs_sy = inputs_ling[:, :, 0] + inputs_tone = inputs_ling[:, :, 1] + inputs_syllable_flag = inputs_ling[:, :, 2] + inputs_ws = inputs_ling[:, :, 3] - # Lookup table - sy_embedding = self.sy_emb(inputs_sy) - tone_embedding = self.tone_emb(inputs_tone) - 
syllable_flag_embedding = self.syllable_flag_emb(inputs_syllable_flag) - ws_embedding = self.ws_emb(inputs_ws) + # Lookup table + sy_embedding = self.sy_emb(inputs_sy) + tone_embedding = self.tone_emb(inputs_tone) + syllable_flag_embedding = self.syllable_flag_emb( + inputs_syllable_flag) + ws_embedding = self.ws_emb(inputs_ws) - ling_embedding = ( - sy_embedding + tone_embedding + syllable_flag_embedding - + ws_embedding) + ling_embedding = ( + sy_embedding + tone_embedding + syllable_flag_embedding + + ws_embedding) enc_output, enc_slf_attn_list = self.ling_enc(ling_embedding, masks, return_attns) diff --git a/modelscope/models/audio/tts/kantts/preprocess/audio_processor/core/utils.py b/modelscope/models/audio/tts/kantts/preprocess/audio_processor/core/utils.py index ecc7ee21..f122eaed 100644 --- a/modelscope/models/audio/tts/kantts/preprocess/audio_processor/core/utils.py +++ b/modelscope/models/audio/tts/kantts/preprocess/audio_processor/core/utils.py @@ -420,6 +420,57 @@ def compute_std(data_list, mean_vector, dims=80): return std_vector +F0_MIN = 0.0 +F0_MAX = 800.0 + +ENERGY_MIN = 0.0 +ENERGY_MAX = 200.0 + +CLIP_FLOOR = 1e-3 + + +def f0_norm_min_max(f0): + zero_idxs = np.where(f0 <= CLIP_FLOOR)[0] + res = (2 * f0 - F0_MIN - F0_MAX) / (F0_MAX - F0_MIN) + res[zero_idxs] = 0.0 + return res + + +def f0_denorm_min_max(f0): + zero_idxs = np.where(f0 == 0.0)[0] + res = (f0 * (F0_MAX - F0_MIN) + F0_MIN + F0_MAX) / 2 + res[zero_idxs] = 0.0 + return res + + +def energy_norm_min_max(energy): + zero_idxs = np.where(energy == 0.0)[0] + res = (2 * energy - ENERGY_MIN - ENERGY_MAX) / (ENERGY_MAX - ENERGY_MIN) + res[zero_idxs] = 0.0 + return res + + +def energy_denorm_min_max(energy): + zero_idxs = np.where(energy == 0.0)[0] + res = (energy * (ENERGY_MAX - ENERGY_MIN) + ENERGY_MIN + ENERGY_MAX) / 2 + res[zero_idxs] = 0.0 + return res + + +def norm_log(x): + zero_idxs = np.where(x <= CLIP_FLOOR)[0] + x[zero_idxs] = 1.0 + res = np.log(x) + return res + + +def denorm_log(x): + zero_idxs = np.where(x == 0.0)[0] + res = np.exp(x) + res[zero_idxs] = 0.0 + return res + + def f0_norm_mean_std(x, mean, std): zero_idxs = np.where(x == 0.0)[0] x = (x - mean) / std diff --git a/modelscope/models/audio/tts/kantts/preprocess/data_process.py b/modelscope/models/audio/tts/kantts/preprocess/data_process.py index 319e653d..68025375 100644 --- a/modelscope/models/audio/tts/kantts/preprocess/data_process.py +++ b/modelscope/models/audio/tts/kantts/preprocess/data_process.py @@ -110,6 +110,8 @@ def process_data( languages[targetLang]['s2p_map_path']) logging.info(f'phoneset_path={phoneset_path}') + # dir of plain text/sentences for training byte based model + plain_text_dir = os.path.join(voice_input_dir, 'text') if speaker_name is None: speaker_name = os.path.basename(voice_input_dir) @@ -130,28 +132,35 @@ def process_data( raw_metafile = None # Script processor if not skip_script: - tsc = TextScriptConvertor( - phoneset_path, - posset_path, - targetLang, - foreignLang, - f2t_map_path, - s2p_map_path, - emo_tag_path, - speaker_name, - ) - tsc.process( - os.path.join(voice_input_dir, 'prosody', 'prosody.txt'), - os.path.join(voice_output_dir, 'Script.xml'), - os.path.join(voice_output_dir, 'raw_metafile.txt'), - ) + if os.path.exists(plain_text_dir): + TextScriptConvertor.turn_text_into_bytes( + os.path.join(plain_text_dir, 'text.txt'), + os.path.join(voice_output_dir, 'raw_metafile.txt'), + speaker_name, + ) + fp_enable = False + else: + tsc = TextScriptConvertor( + phoneset_path, + posset_path, + targetLang, + 
foreignLang, + f2t_map_path, + s2p_map_path, + emo_tag_path, + speaker_name, + ) + tsc.process( + os.path.join(voice_input_dir, 'prosody', 'prosody.txt'), + os.path.join(voice_output_dir, 'Script.xml'), + os.path.join(voice_output_dir, 'raw_metafile.txt'), + ) + prosody = os.path.join(voice_input_dir, 'prosody', 'prosody.txt') + # FP processor + with codecs.open(prosody, 'r', 'utf-8') as f: + lines = f.readlines() + fp_enable = is_fp_line(lines[1]) raw_metafile = os.path.join(voice_output_dir, 'raw_metafile.txt') - prosody = os.path.join(voice_input_dir, 'prosody', 'prosody.txt') - - # FP processor - with codecs.open(prosody, 'r', 'utf-8') as f: - lines = f.readlines() - fp_enable = is_fp_line(lines[1]) if fp_enable: FP = FpProcessor() diff --git a/modelscope/models/audio/tts/kantts/preprocess/fp_processor.py b/modelscope/models/audio/tts/kantts/preprocess/fp_processor.py index d285b8a1..910a374c 100644 --- a/modelscope/models/audio/tts/kantts/preprocess/fp_processor.py +++ b/modelscope/models/audio/tts/kantts/preprocess/fp_processor.py @@ -22,8 +22,20 @@ def is_fp_line(line): class FpProcessor: def __init__(self): + # TODO: Add more audio processing methods. self.res = [] + def is_fp_line(line): + fp_category_list = ['FP', 'I', 'N', 'Q'] + elements = line.strip().split(' ') + res = True + for ele in elements: + if ele not in fp_category_list: + res = False + break + return res + + # TODO: adjust idx judgment rule def addfp(self, voice_output_dir, prosody, raw_metafile_lines): fp_category_list = ['FP', 'I', 'N'] @@ -35,15 +47,28 @@ class FpProcessor: idx = '' fp = '' fp_label_dict = {} - for i in range(len(prosody_lines)): - if i % 5 == 0: + i = 0 + while i < len(prosody_lines): + if len(prosody_lines[i].strip().split('\t')) == 2: idx = prosody_lines[i].strip().split('\t')[0] - elif i % 5 == 1: # according to prosody.txt - fp = prosody_lines[i].strip().split('\t')[0].split(' ') - for label in fp: - if label not in fp_category_list: - logging.warning('fp label not in fp_category_list') - break + i += 1 + else: + fp_enable = is_fp_line(prosody_lines[i]) + if fp_enable: + fp = prosody_lines[i].strip().split('\t')[0].split(' ') + for label in fp: + if label not in fp_category_list: + logging.warning('fp label not in fp_category_list') + break + i += 4 + else: + fp = [ + 'N' for _ in range( + len(prosody_lines[i].strip().split('\t') + [0].replace('/ ', '').replace('. 
', '').split( + ' '))) + ] + i += 1 fp_label_dict[idx] = fp fpadd_metafile = os.path.join(voice_output_dir, 'fpadd_metafile.txt') @@ -76,9 +101,12 @@ class FpProcessor: error_flag = True out_str = out_str + this_symbol_sequence + ' ' - if idx != len(fp_label_dict[uttname]): - logging.warning('{} length mismatch, length: {} '.format( - idx, len(fp_label_dict[uttname]))) + # if idx != len(fp_label_dict[uttname]): + # logging.warning( + # "{} length mismatch, length: {} ".format( + # idx, len(fp_label_dict[uttname]) + # ) + # ) if not error_flag: f_out.write(out_str.strip() + '\n') diff --git a/modelscope/models/audio/tts/kantts/preprocess/script_convertor/core/utils.py b/modelscope/models/audio/tts/kantts/preprocess/script_convertor/core/utils.py index 17b76cf3..d493e3dc 100644 --- a/modelscope/models/audio/tts/kantts/preprocess/script_convertor/core/utils.py +++ b/modelscope/models/audio/tts/kantts/preprocess/script_convertor/core/utils.py @@ -99,14 +99,18 @@ def format_prosody(src_prosody): formatted_lines = [] with codecs.open(src_prosody, 'r', 'utf-8') as f: lines = f.readlines() - fp_enable = is_fp_line(lines[1]) - for i in range(0, len(lines)): - line = do_character_normalization(lines[i]) - if fp_enable: - if i % 5 == 1 or i % 5 == 2 or i % 5 == 3: - continue + idx = 0 + while idx < len(lines): + line = do_character_normalization(lines[idx]) + if len(line.strip().split('\t')) == 2: line = do_prosody_text_normalization(line) + else: + fp_enable = is_fp_line(line) + if fp_enable: + idx += 3 + continue formatted_lines.append(line) + idx += 1 return formatted_lines diff --git a/modelscope/models/audio/tts/kantts/preprocess/script_convertor/text_script_convertor.py b/modelscope/models/audio/tts/kantts/preprocess/script_convertor/text_script_convertor.py index 90ea3a4e..8bb0f45a 100644 --- a/modelscope/models/audio/tts/kantts/preprocess/script_convertor/text_script_convertor.py +++ b/modelscope/models/audio/tts/kantts/preprocess/script_convertor/text_script_convertor.py @@ -4,6 +4,7 @@ import argparse import os import re +from bitstring import BitArray from tqdm import tqdm from modelscope.utils.logger import get_logger @@ -461,3 +462,39 @@ class TextScriptConvertor: logging.info('TextScriptConvertor.process:\nSave metafile to: %s', outputMetafile) + + @staticmethod + def turn_text_into_bytes(plain_text_path, output_meta_file_path, speaker): + meta_lines = [] + with open(plain_text_path, 'r') as in_file: + for text_line in in_file: + [sentence_id, sentence] = text_line.strip().split('\t') + sequence = [] + for character in sentence: + hex_string = character.encode('utf-8').hex() + i = 0 + while i < len(hex_string): + byte_hex = hex_string[i:i + 2] + bit_array = BitArray(hex=byte_hex) + integer = bit_array.uint + if integer > 255: + logging.error( + 'TextScriptConverter.turn_text_into_bytes: invalid byte conversion in sentence {} \ + character {}: (uint) {} - (hex) {}'. 
+ format( + sentence_id, + character, + integer, + character.encode('utf-8').hex(), + )) + continue + sequence.append('{{{}$emotion_neutral${}}}'.format( + integer, speaker)) + i += 2 + if sequence[-1][1:].split('$')[0] not in ['33', '46', '63']: + sequence.append( + '{{46$emotion_neutral${}}}'.format(speaker)) + meta_lines.append('{}\t{}\n'.format(sentence_id, + ' '.join(sequence))) + with open(output_meta_file_path, 'w') as out_file: + out_file.writelines(meta_lines) diff --git a/modelscope/models/audio/tts/kantts/utils/ling_unit/ling_unit.py b/modelscope/models/audio/tts/kantts/utils/ling_unit/ling_unit.py index 2cd49a04..a1a9ffdb 100644 --- a/modelscope/models/audio/tts/kantts/utils/ling_unit/ling_unit.py +++ b/modelscope/models/audio/tts/kantts/utils/ling_unit/ling_unit.py @@ -26,9 +26,10 @@ def _clean_text(text, cleaner_names): def get_fpdict(config): # eomtion_neutral(F7) can be other emotion(speaker) types in the corresponding list in config file. - en_sy = '{ge$tone5$s_begin$word_begin$emotion_neutral$F7} {en_c$tone5$s_end$word_end$emotion_neutral$F7} {#3$tone_none$s_none$word_none$emotion_neutral$F7}' # NOQA: E501 - a_sy = '{ga$tone5$s_begin$word_begin$emotion_neutral$F7} {a_c$tone5$s_end$word_end$emotion_neutral$F7} {#3$tone_none$s_none$word_none$emotion_neutral$F7}' # NOQA: E501 - e_sy = '{ge$tone5$s_begin$word_begin$emotion_neutral$F7} {e_c$tone5$s_end$word_end$emotion_neutral$F7} {#3$tone_none$s_none$word_none$emotion_neutral$F7}' # NOQA: E501 + default_sp = config['linguistic_unit']['speaker_list'].split(',')[0] + en_sy = f'{{ge$tone5$s_begin$word_begin$emotion_neutral${default_sp}}} {{en_c$tone5$s_end$word_end$emotion_neutral${default_sp}}} {{#3$tone_none$s_none$word_none$emotion_neutral${default_sp}}}' # NOQA: E501 + a_sy = f'{{ga$tone5$s_begin$word_begin$emotion_neutral${default_sp}}} {{a_c$tone5$s_end$word_end$emotion_neutral${default_sp}}} {{#3$tone_none$s_none$word_none$emotion_neutral${default_sp}}}' # NOQA: E501 + e_sy = f'{{ge$tone5$s_begin$word_begin$emotion_neutral${default_sp}}} {{e_c$tone5$s_end$word_end$emotion_neutral${default_sp}}} {{#3$tone_none$s_none$word_none$emotion_neutral${default_sp}}}' # NOQA: E501 ling_unit = KanTtsLinguisticUnit(config) en_lings = ling_unit.encode_symbol_sequence(en_sy) @@ -39,7 +40,7 @@ def get_fpdict(config): a_ling = np.stack(a_lings, axis=1)[:3, :4] e_ling = np.stack(e_lings, axis=1)[:3, :4] - fp_dict = {1: a_ling, 2: en_ling, 3: e_ling} + fp_dict = {1: en_ling, 2: a_ling, 3: e_ling} return fp_dict @@ -92,12 +93,18 @@ class KanTtsLinguisticUnit(LinguisticBaseUnit): self.build() + def using_byte(self): + return 'byte_index' in self._lfeat_type_list + def get_unit_size(self): ling_unit_size = {} - ling_unit_size['sy'] = len(self.sy) - ling_unit_size['tone'] = len(self.tone) - ling_unit_size['syllable_flag'] = len(self.syllable_flag) - ling_unit_size['word_segment'] = len(self.word_segment) + if self.using_byte(): + ling_unit_size['byte_index'] = len(self.byte_index) + else: + ling_unit_size['sy'] = len(self.sy) + ling_unit_size['tone'] = len(self.tone) + ling_unit_size['syllable_flag'] = len(self.syllable_flag) + ling_unit_size['word_segment'] = len(self.word_segment) if 'emo_category' in self._lfeat_type_list: ling_unit_size['emotion'] = len(self.emo_category) @@ -107,77 +114,96 @@ class KanTtsLinguisticUnit(LinguisticBaseUnit): return ling_unit_size def build(self): - self._sub_unit_dim = {} self._sub_unit_pad = {} - # sy sub-unit - _characters = '' + if self.using_byte(): + # Export all byte indices: + self.byte_index = ['@' + 
str(idx) for idx in range(256)] + [ + self._pad, + self._eos, + ] + if self.has_mask: + self.byte_index.append(self._mask) + self._byte_index_to_id = { + s: i + for i, s in enumerate(self.byte_index) + } + self._id_to_byte_index = { + i: s + for i, s in enumerate(self.byte_index) + } + self._sub_unit_dim['byte_index'] = len(self.byte_index) + self._sub_unit_pad['byte_index'] = self._byte_index_to_id['_'] + else: + # sy sub-unit + _characters = '' - # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): - # _arpabet = ['@' + s for s in cmudict.valid_symbols] - _arpabet = ['@' + s for s in self.lang_phones] + # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): + # _arpabet = ['@' + s for s in cmudict.valid_symbols] + _arpabet = ['@' + s for s in self.lang_phones] - # Export all symbols: - self.sy = list(_characters) + _arpabet + [self._pad, self._eos] - if self.has_mask: - self.sy.append(self._mask) - self._sy_to_id = {s: i for i, s in enumerate(self.sy)} - self._id_to_sy = {i: s for i, s in enumerate(self.sy)} - self._sub_unit_dim['sy'] = len(self.sy) - self._sub_unit_pad['sy'] = self._sy_to_id['_'] + # Export all symbols: + self.sy = list(_characters) + _arpabet + [self._pad, self._eos] + if self.has_mask: + self.sy.append(self._mask) + self._sy_to_id = {s: i for i, s in enumerate(self.sy)} + self._id_to_sy = {i: s for i, s in enumerate(self.sy)} + self._sub_unit_dim['sy'] = len(self.sy) + self._sub_unit_pad['sy'] = self._sy_to_id['_'] - # tone sub-unit - _characters = '' + # tone sub-unit + _characters = '' - # Export all tones: - self.tone = ( - list(_characters) + self.lang_tones + [self._pad, self._eos]) - if self.has_mask: - self.tone.append(self._mask) - self._tone_to_id = {s: i for i, s in enumerate(self.tone)} - self._id_to_tone = {i: s for i, s in enumerate(self.tone)} - self._sub_unit_dim['tone'] = len(self.tone) - self._sub_unit_pad['tone'] = self._tone_to_id['_'] + # Export all tones: + self.tone = ( + list(_characters) + self.lang_tones + [self._pad, self._eos]) + if self.has_mask: + self.tone.append(self._mask) + self._tone_to_id = {s: i for i, s in enumerate(self.tone)} + self._id_to_tone = {i: s for i, s in enumerate(self.tone)} + self._sub_unit_dim['tone'] = len(self.tone) + self._sub_unit_pad['tone'] = self._tone_to_id['_'] - # syllable flag sub-unit - _characters = '' + # syllable flag sub-unit + _characters = '' - # Export all syllable_flags: - self.syllable_flag = ( - list(_characters) + self.lang_syllable_flags - + [self._pad, self._eos]) - if self.has_mask: - self.syllable_flag.append(self._mask) - self._syllable_flag_to_id = { - s: i - for i, s in enumerate(self.syllable_flag) - } - self._id_to_syllable_flag = { - i: s - for i, s in enumerate(self.syllable_flag) - } - self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag) - self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id['_'] + # Export all syllable_flags: + self.syllable_flag = ( + list(_characters) + self.lang_syllable_flags + + [self._pad, self._eos]) + if self.has_mask: + self.syllable_flag.append(self._mask) + self._syllable_flag_to_id = { + s: i + for i, s in enumerate(self.syllable_flag) + } + self._id_to_syllable_flag = { + i: s + for i, s in enumerate(self.syllable_flag) + } + self._sub_unit_dim['syllable_flag'] = len(self.syllable_flag) + self._sub_unit_pad['syllable_flag'] = self._syllable_flag_to_id[ + '_'] - # word segment sub-unit - _characters = '' + # word segment sub-unit + _characters = '' - 
# Export all syllable_flags: - self.word_segment = ( - list(_characters) + self.lang_word_segments - + [self._pad, self._eos]) - if self.has_mask: - self.word_segment.append(self._mask) - self._word_segment_to_id = { - s: i - for i, s in enumerate(self.word_segment) - } - self._id_to_word_segment = { - i: s - for i, s in enumerate(self.word_segment) - } - self._sub_unit_dim['word_segment'] = len(self.word_segment) - self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_'] + # Export all syllable_flags: + self.word_segment = ( + list(_characters) + self.lang_word_segments + + [self._pad, self._eos]) + if self.has_mask: + self.word_segment.append(self._mask) + self._word_segment_to_id = { + s: i + for i, s in enumerate(self.word_segment) + } + self._id_to_word_segment = { + i: s + for i, s in enumerate(self.word_segment) + } + self._sub_unit_dim['word_segment'] = len(self.word_segment) + self._sub_unit_pad['word_segment'] = self._word_segment_to_id['_'] if 'emo_category' in self._lfeat_type_list: # emotion category sub-unit @@ -247,6 +273,8 @@ class KanTtsLinguisticUnit(LinguisticBaseUnit): sequence_item = sequence[i].tolist() if lfeat_type == 'sy': s = self.decode_sy(sequence_item) + elif lfeat_type == 'byte_index': + s = self.decode_byte_index(sequence_item) elif lfeat_type == 'tone': s = self.decode_tone(sequence_item) elif lfeat_type == 'syllable_flag': @@ -261,7 +289,7 @@ class KanTtsLinguisticUnit(LinguisticBaseUnit): raise Exception('Unknown lfeat type: %s' % lfeat_type) result.append('%s:%s' % (lfeat_type, s)) - return result + return def encode_sub_unit(self, this_lfeat_symbol, lfeat_type): sequence = [] @@ -276,6 +304,8 @@ class KanTtsLinguisticUnit(LinguisticBaseUnit): index = index + 1 sequence = self.encode_text(this_lfeat_symbol_format, self._cleaner_names) + elif lfeat_type == 'byte_index': + sequence = self.encode_byte_index(this_lfeat_symbol) elif lfeat_type == 'tone': sequence = self.encode_tone(this_lfeat_symbol) elif lfeat_type == 'syllable_flag': @@ -288,7 +318,6 @@ class KanTtsLinguisticUnit(LinguisticBaseUnit): sequence = self.encode_speaker_category(this_lfeat_symbol) else: raise Exception('Unknown lfeat type: %s' % lfeat_type) - return sequence def encode_text(self, text, cleaner_names): @@ -323,6 +352,20 @@ class KanTtsLinguisticUnit(LinguisticBaseUnit): def encode_arpanet(self, text): return self.encode_sy(['@' + s for s in text.split()]) + def encode_byte_index(self, byte_index): + byte_indices = ['@' + s for s in byte_index.strip().split(' ')] + sequence = [] + for this_byte_index in byte_indices: + sequence.append(self._byte_index_to_id[this_byte_index]) + sequence.append(self._byte_index_to_id['~']) + return sequence + + def decode_byte_index(self, id): + s = self._id_to_byte_index[id] + if len(s) > 1 and s[0] == '@': + s = s[1:] + return s + def encode_tone(self, tone): tones = tone.strip().split(' ') sequence = [] diff --git a/modelscope/models/audio/tts/voice.py b/modelscope/models/audio/tts/voice.py index 60e32b06..765f6d83 100644 --- a/modelscope/models/audio/tts/voice.py +++ b/modelscope/models/audio/tts/voice.py @@ -123,21 +123,47 @@ class Voice: with torch.no_grad(): inputs_feat_lst = self.__ling_unit.encode_symbol_sequence( symbol_seq) - inputs_sy = torch.from_numpy(inputs_feat_lst[0]).long().to( - self.__device) - inputs_tone = torch.from_numpy(inputs_feat_lst[1]).long().to( - self.__device) - inputs_syllable = torch.from_numpy( - inputs_feat_lst[2]).long().to(self.__device) - inputs_ws = torch.from_numpy(inputs_feat_lst[3]).long().to( - 
self.__device) - inputs_ling = torch.stack( - [inputs_sy, inputs_tone, inputs_syllable, inputs_ws], - dim=-1).unsqueeze(0) - inputs_emo = torch.from_numpy(inputs_feat_lst[4]).long().to( - self.__device).unsqueeze(0) - inputs_spk = torch.from_numpy(inputs_feat_lst[5]).long().to( - self.__device).unsqueeze(0) + inputs_feat_index = 0 + if self.__ling_unit.using_byte(): + inputs_byte_index = ( + torch.from_numpy( + inputs_feat_lst[inputs_feat_index]).long().to( + self.__device)) + inputs_ling = torch.stack([inputs_byte_index], + dim=-1).unsqueeze(0) + else: + inputs_sy = ( + torch.from_numpy( + inputs_feat_lst[inputs_feat_index]).long().to( + self.__device)) + inputs_feat_index = inputs_feat_index + 1 + inputs_tone = ( + torch.from_numpy( + inputs_feat_lst[inputs_feat_index]).long().to( + self.__device)) + inputs_feat_index = inputs_feat_index + 1 + inputs_syllable = ( + torch.from_numpy( + inputs_feat_lst[inputs_feat_index]).long().to( + self.__device)) + inputs_feat_index = inputs_feat_index + 1 + inputs_ws = ( + torch.from_numpy( + inputs_feat_lst[inputs_feat_index]).long().to( + self.__device)) + inputs_ling = torch.stack( + [inputs_sy, inputs_tone, inputs_syllable, inputs_ws], + dim=-1).unsqueeze(0) + inputs_feat_index = inputs_feat_index + 1 + inputs_emo = ( + torch.from_numpy( + inputs_feat_lst[inputs_feat_index]).long().to( + self.__device).unsqueeze(0)) + inputs_feat_index = inputs_feat_index + 1 + inputs_spk = ( + torch.from_numpy( + inputs_feat_lst[inputs_feat_index]).long().to( + self.__device).unsqueeze(0)) inputs_len = (torch.zeros(1).to(self.__device).long() + inputs_emo.size(1) - 1) # minus 1 for "~" res = self.__am(inputs_ling[:, :-1, :], inputs_emo[:, :-1], @@ -148,9 +174,19 @@ class Voice: postnet_outputs = postnet_outputs[0, :valid_length, :].cpu() return postnet_outputs + def __binarize(mel, threshold=0.6): + # vuv binarize + res_mel = mel.clone() + index = torch.where(mel[:, -1] < threshold)[0] + res_mel[:, -1] = 1.0 + res_mel[:, -1][index] = 0.0 + return res_mel + def __vocoder_forward(self, melspec): with torch.no_grad(): x = melspec.to(self.__device) + if self.__voc_model.nsf_enable: + x = self.__binarize(x) x = x.transpose(1, 0).unsqueeze(0) y = self.__voc_model(x) if hasattr(self.__voc_model, 'pqmf'): diff --git a/modelscope/models/base/base_head.py b/modelscope/models/base/base_head.py index 11bda32f..6dc04971 100644 --- a/modelscope/models/base/base_head.py +++ b/modelscope/models/base/base_head.py @@ -13,8 +13,7 @@ Input = Union[Dict[str, Tensor], Model] class Head(ABC): - """ - The head base class is for the tasks head method definition + """The head base class is for the tasks head method definition """ diff --git a/modelscope/models/base/base_model.py b/modelscope/models/base/base_model.py index 94757641..d933d8ae 100644 --- a/modelscope/models/base/base_model.py +++ b/modelscope/models/base/base_model.py @@ -4,6 +4,7 @@ import os.path as osp from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Union +from modelscope.hub.check_model import check_local_model_is_latest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.builder import build_model from modelscope.utils.checkpoint import (save_checkpoint, save_configuration, @@ -19,6 +20,8 @@ Tensor = Union['torch.Tensor', 'tf.Tensor'] class Model(ABC): + """Base model interface. 
+ """ def __init__(self, model_dir, *args, **kwargs): self.model_dir = model_dir @@ -94,10 +97,11 @@ class Model(ABC): prefetched = kwargs.get('model_prefetched') if prefetched is not None: kwargs.pop('model_prefetched') - invoked_by = kwargs.get(Invoke.KEY) if invoked_by is not None: kwargs.pop(Invoke.KEY) + else: + invoked_by = Invoke.PRETRAINED if osp.exists(model_name_or_path): local_model_dir = model_name_or_path @@ -107,10 +111,7 @@ class Model(ABC): 'Expecting model is pre-fetched locally, but is not found.' ) - if invoked_by is not None: - invoked_by = '%s/%s' % (Invoke.KEY, invoked_by) - else: - invoked_by = '%s/%s' % (Invoke.KEY, Invoke.PRETRAINED) + invoked_by = '%s/%s' % (Invoke.KEY, invoked_by) local_model_dir = snapshot_download( model_name_or_path, revision, user_agent=invoked_by) logger.info(f'initialize model from {local_model_dir}') diff --git a/modelscope/models/base/base_torch_model.py b/modelscope/models/base/base_torch_model.py index b5515b25..98221682 100644 --- a/modelscope/models/base/base_torch_model.py +++ b/modelscope/models/base/base_torch_model.py @@ -1,9 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from copy import deepcopy from typing import Any, Dict import torch from torch import nn +from torch.nn.parallel import DataParallel, DistributedDataParallel from modelscope.utils.file_utils import func_receive_dict_inputs from modelscope.utils.hub import parse_label_mapping @@ -29,6 +31,34 @@ class TorchModel(Model, torch.nn.Module): else: return self.postprocess(self.forward(*args, **kwargs)) + def _load_pretrained(self, + net, + load_path, + strict=True, + param_key='params'): + if isinstance(net, (DataParallel, DistributedDataParallel)): + net = net.module + load_net = torch.load( + load_path, map_location=lambda storage, loc: storage) + if param_key is not None: + if param_key not in load_net and 'params' in load_net: + param_key = 'params' + logger.info( + f'Loading: {param_key} does not exist, use params.') + if param_key in load_net: + load_net = load_net[param_key] + logger.info( + f'Loading {net.__class__.__name__} model from {load_path}, with param key: [{param_key}].' + ) + # remove unnecessary 'module.' + for k, v in deepcopy(load_net).items(): + if k.startswith('module.'): + load_net[k[7:]] = v + load_net.pop(k) + net.load_state_dict(load_net, strict=strict) + logger.info('load model done.') + return net + def forward(self, *args, **kwargs) -> Dict[str, Any]: raise NotImplementedError diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index b781a89d..b906aa12 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -5,16 +5,20 @@ from . 
import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, face_generation, human_wholebody_keypoint, image_classification, - image_color_enhance, image_colorization, image_denoise, - image_inpainting, image_instance_segmentation, + image_color_enhance, image_colorization, image_defrcn_fewshot, + image_denoise, image_inpainting, image_instance_segmentation, + image_matching, image_mvs_depth_estimation, image_panoptic_segmentation, image_portrait_enhancement, image_reid_person, image_semantic_segmentation, image_to_image_generation, image_to_image_translation, language_guided_video_summarization, movie_scene_segmentation, - object_detection, product_retrieval_embedding, + object_detection, panorama_depth_estimation, + pointcloud_sceneflow_estimation, product_retrieval_embedding, realtime_object_detection, referring_video_object_segmentation, salient_detection, shop_segmentation, super_resolution, - video_object_segmentation, video_single_object_tracking, - video_summarization, virual_tryon) + video_frame_interpolation, video_object_segmentation, + video_single_object_tracking, video_stabilization, + video_summarization, video_super_resolution, virual_tryon, + vision_middleware, vop_retrieval) # yapf: enable diff --git a/modelscope/models/cv/face_detection/peppa_pig_face/LK/__init__.py b/modelscope/models/cv/face_detection/peppa_pig_face/LK/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/peppa_pig_face/LK/lk.py b/modelscope/models/cv/face_detection/peppa_pig_face/LK/lk.py new file mode 100644 index 00000000..3b4a670c --- /dev/null +++ b/modelscope/models/cv/face_detection/peppa_pig_face/LK/lk.py @@ -0,0 +1,97 @@ +# The implementation here is modified based on InsightFace_Pytorch, originally Apache License and publicly available +# at https://github.com/610265158/Peppa_Pig_Face_Engine +import numpy as np + + +class GroupTrack(): + + def __init__(self): + self.old_frame = None + self.previous_landmarks_set = None + self.with_landmark = True + self.thres = 1 + self.alpha = 0.95 + self.iou_thres = 0.5 + + def calculate(self, img, current_landmarks_set): + if self.previous_landmarks_set is None: + self.previous_landmarks_set = current_landmarks_set + result = current_landmarks_set + else: + previous_lm_num = self.previous_landmarks_set.shape[0] + if previous_lm_num == 0: + self.previous_landmarks_set = current_landmarks_set + result = current_landmarks_set + return result + else: + result = [] + for i in range(current_landmarks_set.shape[0]): + not_in_flag = True + for j in range(previous_lm_num): + if self.iou(current_landmarks_set[i], + self.previous_landmarks_set[j] + ) > self.iou_thres: + result.append( + self.smooth(current_landmarks_set[i], + self.previous_landmarks_set[j])) + not_in_flag = False + break + if not_in_flag: + result.append(current_landmarks_set[i]) + + result = np.array(result) + self.previous_landmarks_set = result + + return result + + def iou(self, p_set0, p_set1): + rec1 = [ + np.min(p_set0[:, 0]), + np.min(p_set0[:, 1]), + np.max(p_set0[:, 0]), + np.max(p_set0[:, 1]) + ] + rec2 = [ + np.min(p_set1[:, 0]), + np.min(p_set1[:, 1]), + np.max(p_set1[:, 0]), + np.max(p_set1[:, 1]) + ] + + # computing area of each rectangles + S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1]) + S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1]) + + # computing the sum_area + sum_area = S_rec1 + S_rec2 + + # find the each edge 
of intersect rectangle + x1 = max(rec1[0], rec2[0]) + y1 = max(rec1[1], rec2[1]) + x2 = min(rec1[2], rec2[2]) + y2 = min(rec1[3], rec2[3]) + + # judge if there is an intersect + intersect = max(0, x2 - x1) * max(0, y2 - y1) + + iou = intersect / (sum_area - intersect) + return iou + + def smooth(self, now_landmarks, previous_landmarks): + result = [] + for i in range(now_landmarks.shape[0]): + x = now_landmarks[i][0] - previous_landmarks[i][0] + y = now_landmarks[i][1] - previous_landmarks[i][1] + dis = np.sqrt(np.square(x) + np.square(y)) + if dis < self.thres: + result.append(previous_landmarks[i]) + else: + result.append( + self.do_moving_average(now_landmarks[i], + previous_landmarks[i])) + + return np.array(result) + + def do_moving_average(self, p_now, p_previous): + p = self.alpha * p_now + (1 - self.alpha) * p_previous + return p diff --git a/modelscope/models/cv/face_detection/peppa_pig_face/__init__.py b/modelscope/models/cv/face_detection/peppa_pig_face/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/face_detection/peppa_pig_face/face_detector.py b/modelscope/models/cv/face_detection/peppa_pig_face/face_detector.py new file mode 100644 index 00000000..4deff9bf --- /dev/null +++ b/modelscope/models/cv/face_detection/peppa_pig_face/face_detector.py @@ -0,0 +1,115 @@ +# The implementation here is modified based on InsightFace_Pytorch, originally Apache License and publicly available +# at https://github.com/610265158/Peppa_Pig_Face_Engine +import cv2 +import numpy as np +import tensorflow as tf + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + + +class FaceDetector: + + def __init__(self, dir): + + self.model_path = dir + '/detector.pb' + self.thres = 0.8 + self.input_shape = (512, 512, 3) + self.pixel_means = np.array([123., 116., 103.]) + + self._graph = tf.Graph() + + with self._graph.as_default(): + self._graph, self._sess = self.init_model(self.model_path) + + self.input_image = tf.get_default_graph().get_tensor_by_name( + 'tower_0/images:0') + self.training = tf.get_default_graph().get_tensor_by_name( + 'training_flag:0') + self.output_ops = [ + tf.get_default_graph().get_tensor_by_name('tower_0/boxes:0'), + tf.get_default_graph().get_tensor_by_name('tower_0/scores:0'), + tf.get_default_graph().get_tensor_by_name( + 'tower_0/num_detections:0'), + ] + + def __call__(self, image): + + image, scale_x, scale_y = self.preprocess( + image, + target_width=self.input_shape[1], + target_height=self.input_shape[0]) + + image = np.expand_dims(image, 0) + + boxes, scores, num_boxes = self._sess.run( + self.output_ops, + feed_dict={ + self.input_image: image, + self.training: False + }) + + num_boxes = num_boxes[0] + boxes = boxes[0][:num_boxes] + + scores = scores[0][:num_boxes] + + to_keep = scores > self.thres + boxes = boxes[to_keep] + scores = scores[to_keep] + + y1 = self.input_shape[0] / scale_y + x1 = self.input_shape[1] / scale_x + y2 = self.input_shape[0] / scale_y + x2 = self.input_shape[1] / scale_x + scaler = np.array([y1, x1, y2, x2], dtype='float32') + boxes = boxes * scaler + + scores = np.expand_dims(scores, 0).reshape([-1, 1]) + + for i in range(boxes.shape[0]): + boxes[i] = np.array( + [boxes[i][1], boxes[i][0], boxes[i][3], boxes[i][2]]) + return np.concatenate([boxes, scores], axis=1) + + def preprocess(self, image, target_height, target_width, label=None): + + h, w, c = image.shape + + bimage = np.zeros( + shape=[target_height, target_width, c], + dtype=image.dtype) + np.array( + self.pixel_means, dtype=image.dtype) + 
long_side = max(h, w) + + scale_x = scale_y = target_height / long_side + + image = cv2.resize(image, None, fx=scale_x, fy=scale_y) + + h_, w_, _ = image.shape + bimage[:h_, :w_, :] = image + + return bimage, scale_x, scale_y + + def init_model(self, *args): + pb_path = args[0] + + def init_pb(model_path): + config = tf.ConfigProto() + config.gpu_options.per_process_gpu_memory_fraction = 0.2 + compute_graph = tf.Graph() + compute_graph.as_default() + sess = tf.Session(config=config) + with tf.gfile.GFile(model_path, 'rb') as fid: + graph_def = tf.GraphDef() + graph_def.ParseFromString(fid.read()) + tf.import_graph_def(graph_def, name='') + + return (compute_graph, sess) + + model = init_pb(pb_path) + + graph = model[0] + sess = model[1] + + return graph, sess diff --git a/modelscope/models/cv/face_detection/peppa_pig_face/face_landmark.py b/modelscope/models/cv/face_detection/peppa_pig_face/face_landmark.py new file mode 100644 index 00000000..03a3b5b7 --- /dev/null +++ b/modelscope/models/cv/face_detection/peppa_pig_face/face_landmark.py @@ -0,0 +1,154 @@ +# The implementation here is modified based on InsightFace_Pytorch, originally Apache License and publicly available +# at https://github.com/610265158/Peppa_Pig_Face_Engine +import cv2 +import numpy as np +import tensorflow as tf + +if tf.__version__ >= '2.0': + tf = tf.compat.v1 + + +class FaceLandmark: + + def __init__(self, dir): + self.model_path = dir + '/keypoints.pb' + self.min_face = 60 + self.keypoint_num = 136 + self.pixel_means = np.array([123., 116., 103.]) + self.kp_extend_range = [0.2, 0.3] + self.kp_shape = (160, 160, 3) + + self._graph = tf.Graph() + + with self._graph.as_default(): + + self._graph, self._sess = self.init_model(self.model_path) + self.img_input = tf.get_default_graph().get_tensor_by_name( + 'tower_0/images:0') + self.embeddings = tf.get_default_graph().get_tensor_by_name( + 'tower_0/prediction:0') + self.training = tf.get_default_graph().get_tensor_by_name( + 'training_flag:0') + + self.landmark = self.embeddings[:, :self.keypoint_num] + self.headpose = self.embeddings[:, -7:-4] * 90. 
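A side note on FaceDetector.preprocess above: it is essentially a letterbox resize, scaling both axes by the single factor target / long_side and filling the unused canvas with the per-channel pixel means. A minimal standalone sketch of the same idea (the helper name letterbox_to_square is ours, not part of this change):

import cv2
import numpy as np


def letterbox_to_square(image, target=512, pixel_means=(123., 116., 103.)):
    """Scale so the long side equals `target` (one shared factor), then pad
    the short side with the per-channel mean, as the detector expects."""
    h, w, c = image.shape
    scale = target / max(h, w)
    resized = cv2.resize(image, None, fx=scale, fy=scale)
    canvas = np.zeros((target, target, c), dtype=image.dtype)
    canvas += np.array(pixel_means, dtype=image.dtype)
    canvas[:resized.shape[0], :resized.shape[1], :] = resized
    return canvas, scale

The scaler built in __call__ then maps the network's normalized boxes back to original-image pixels via target / scale, which is only valid because a single scale factor is shared by both axes.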
+ self.state = tf.nn.sigmoid(self.embeddings[:, -4:]) + + def __call__(self, img, bboxes): + landmark_result = [] + state_result = [] + for i, bbox in enumerate(bboxes): + landmark, state = self._one_shot_run(img, bbox, i) + if landmark is not None: + landmark_result.append(landmark) + state_result.append(state) + return np.array(landmark_result), np.array(state_result) + + def simple_run(self, cropped_img): + with self._graph.as_default(): + + cropped_img = np.expand_dims(cropped_img, axis=0) + landmark, p, states = self._sess.run( + [self.landmark, self.headpose, self.state], + feed_dict={ + self.img_input: cropped_img, + self.training: False + }) + + return landmark, states + + def _one_shot_run(self, image, bbox, i): + + bbox_width = bbox[2] - bbox[0] + bbox_height = bbox[3] - bbox[1] + if (bbox_width <= self.min_face and bbox_height <= self.min_face): + return None, None + add = int(max(bbox_width, bbox_height)) + bimg = cv2.copyMakeBorder( + image, + add, + add, + add, + add, + borderType=cv2.BORDER_CONSTANT, + value=self.pixel_means) + bbox += add + + one_edge = (1 + 2 * self.kp_extend_range[0]) * bbox_width + center = [(bbox[0] + bbox[2]) // 2, (bbox[1] + bbox[3]) // 2] + + bbox[0] = center[0] - one_edge // 2 + bbox[1] = center[1] - one_edge // 2 + bbox[2] = center[0] + one_edge // 2 + bbox[3] = center[1] + one_edge // 2 + + bbox = bbox.astype(np.int) + crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :] + h, w, _ = crop_image.shape + crop_image = cv2.resize(crop_image, + (self.kp_shape[1], self.kp_shape[0])) + crop_image = crop_image.astype(np.float32) + + keypoints, state = self.simple_run(crop_image) + + res = keypoints[0][:self.keypoint_num].reshape((-1, 2)) + res[:, 0] = res[:, 0] * w / self.kp_shape[1] + res[:, 1] = res[:, 1] * h / self.kp_shape[0] + + landmark = [] + for _index in range(res.shape[0]): + x_y = res[_index] + landmark.append([ + int(x_y[0] * self.kp_shape[0] + bbox[0] - add), + int(x_y[1] * self.kp_shape[1] + bbox[1] - add) + ]) + + landmark = np.array(landmark, np.float32) + + return landmark, state + + def init_model(self, *args): + + if len(args) == 1: + use_pb = True + pb_path = args[0] + else: + use_pb = False + meta_path = args[0] + restore_model_path = args[1] + + def ini_ckpt(): + graph = tf.Graph() + graph.as_default() + configProto = tf.ConfigProto() + configProto.gpu_options.allow_growth = True + sess = tf.Session(config=configProto) + # load_model(model_path, sess) + saver = tf.train.import_meta_graph(meta_path) + saver.restore(sess, restore_model_path) + + print('Model restored!') + return (graph, sess) + + def init_pb(model_path): + config = tf.ConfigProto() + config.gpu_options.per_process_gpu_memory_fraction = 0.2 + compute_graph = tf.Graph() + compute_graph.as_default() + sess = tf.Session(config=config) + with tf.gfile.GFile(model_path, 'rb') as fid: + graph_def = tf.GraphDef() + graph_def.ParseFromString(fid.read()) + tf.import_graph_def(graph_def, name='') + + return (compute_graph, sess) + + if use_pb: + model = init_pb(pb_path) + else: + model = ini_ckpt() + + graph = model[0] + sess = model[1] + + return graph, sess diff --git a/modelscope/models/cv/face_detection/peppa_pig_face/facer.py b/modelscope/models/cv/face_detection/peppa_pig_face/facer.py new file mode 100644 index 00000000..16039d83 --- /dev/null +++ b/modelscope/models/cv/face_detection/peppa_pig_face/facer.py @@ -0,0 +1,138 @@ +# The implementation here is modified based on InsightFace_Pytorch, originally Apache License and publicly available +# at 
https://github.com/610265158/Peppa_Pig_Face_Engine +import cv2 +import numpy as np + +from .face_detector import FaceDetector +from .face_landmark import FaceLandmark +from .LK.lk import GroupTrack + + +class FaceAna(): + + def __init__(self, model_dir): + self.face_detector = FaceDetector(model_dir) + self.face_landmark = FaceLandmark(model_dir) + self.trace = GroupTrack() + + self.track_box = None + self.previous_image = None + self.previous_box = None + + self.diff_thres = 5 + self.top_k = 10 + self.iou_thres = 0.5 + self.alpha = 0.3 + + def run(self, image): + + boxes = self.face_detector(image) + + if boxes.shape[0] > self.top_k: + boxes = self.sort(boxes) + + boxes_return = np.array(boxes) + landmarks, states = self.face_landmark(image, boxes) + + if 1: + track = [] + for i in range(landmarks.shape[0]): + track.append([ + np.min(landmarks[i][:, 0]), + np.min(landmarks[i][:, 1]), + np.max(landmarks[i][:, 0]), + np.max(landmarks[i][:, 1]) + ]) + tmp_box = np.array(track) + + self.track_box = self.judge_boxs(boxes_return, tmp_box) + + self.track_box, landmarks = self.sort_res(self.track_box, landmarks) + return self.track_box, landmarks, states + + def sort_res(self, bboxes, points): + area = [] + for bbox in bboxes: + bbox_width = bbox[2] - bbox[0] + bbox_height = bbox[3] - bbox[1] + area.append(bbox_height * bbox_width) + + area = np.array(area) + picked = area.argsort()[::-1] + sorted_bboxes = [bboxes[x] for x in picked] + sorted_points = [points[x] for x in picked] + return np.array(sorted_bboxes), np.array(sorted_points) + + def diff_frames(self, previous_frame, image): + if previous_frame is None: + return True + else: + _diff = cv2.absdiff(previous_frame, image) + diff = np.sum( + _diff) / previous_frame.shape[0] / previous_frame.shape[1] / 3. 
+ return diff > self.diff_thres + + def sort(self, bboxes): + if self.top_k > 100: + return bboxes + area = [] + for bbox in bboxes: + + bbox_width = bbox[2] - bbox[0] + bbox_height = bbox[3] - bbox[1] + area.append(bbox_height * bbox_width) + + area = np.array(area) + + picked = area.argsort()[-self.top_k:][::-1] + sorted_bboxes = [bboxes[x] for x in picked] + return np.array(sorted_bboxes) + + def judge_boxs(self, previuous_bboxs, now_bboxs): + + def iou(rec1, rec2): + + S_rec1 = (rec1[2] - rec1[0]) * (rec1[3] - rec1[1]) + S_rec2 = (rec2[2] - rec2[0]) * (rec2[3] - rec2[1]) + + sum_area = S_rec1 + S_rec2 + + x1 = max(rec1[0], rec2[0]) + y1 = max(rec1[1], rec2[1]) + x2 = min(rec1[2], rec2[2]) + y2 = min(rec1[3], rec2[3]) + + intersect = max(0, x2 - x1) * max(0, y2 - y1) + + return intersect / (sum_area - intersect) + + if previuous_bboxs is None: + return now_bboxs + + result = [] + + for i in range(now_bboxs.shape[0]): + contain = False + for j in range(previuous_bboxs.shape[0]): + if iou(now_bboxs[i], previuous_bboxs[j]) > self.iou_thres: + result.append( + self.smooth(now_bboxs[i], previuous_bboxs[j])) + contain = True + break + if not contain: + result.append(now_bboxs[i]) + + return np.array(result) + + def smooth(self, now_box, previous_box): + + return self.do_moving_average(now_box[:4], previous_box[:4]) + + def do_moving_average(self, p_now, p_previous): + p = self.alpha * p_now + (1 - self.alpha) * p_previous + return p + + def reset(self): + self.track_box = None + self.previous_image = None + self.previous_box = None diff --git a/modelscope/models/cv/face_recognition/torchkit/__init__.py b/modelscope/models/cv/face_recognition/torchkit/__init__.py index e69de29b..5531670a 100755 --- a/modelscope/models/cv/face_recognition/torchkit/__init__.py +++ b/modelscope/models/cv/face_recognition/torchkit/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .rts_backbone import RTSBackbone +else: + _import_structure = {'rts_backbone': ['RTSBackbone']} + import sys + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__) diff --git a/modelscope/models/cv/face_recognition/torchkit/rts_backbone.py b/modelscope/models/cv/face_recognition/torchkit/rts_backbone.py new file mode 100644 index 00000000..6bd627eb --- /dev/null +++ b/modelscope/models/cv/face_recognition/torchkit/rts_backbone.py @@ -0,0 +1,223 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
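Stepping back to the face tracker above: GroupTrack.iou and FaceAna.judge_boxs rely on the same two ingredients, an axis-aligned IoU for matching and an exponential moving average for smoothing. A compact reference equivalent (the names box_iou and ema_box are ours; boxes are assumed to be [x1, y1, x2, y2]):

import numpy as np


def box_iou(a, b):
    # Intersection-over-union of two axis-aligned boxes.
    inter_w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    inter_h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = inter_w * inter_h
    union = ((a[2] - a[0]) * (a[3] - a[1])
             + (b[2] - b[0]) * (b[3] - b[1]) - inter)
    return inter / union if union > 0 else 0.0


def ema_box(now, prev, alpha=0.3):
    # Same blend as do_moving_average: a small alpha leans on the previous
    # box, which damps frame-to-frame jitter.
    return alpha * np.asarray(now, dtype=np.float32) \
        + (1 - alpha) * np.asarray(prev, dtype=np.float32)

A current box is kept as-is when no previous box exceeds the IoU threshold (0.5 here); otherwise it is blended toward its match, which is what judge_boxs does per detection.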
+import os +from collections import namedtuple +from math import lgamma + +import torch +import torch.nn as nn +from torch.nn import (AdaptiveAvgPool2d, BatchNorm1d, BatchNorm2d, Conv2d, + Dropout, Linear, MaxPool2d, Module, PReLU, ReLU, + Sequential, Sigmoid) +from torch.nn.modules.flatten import Flatten + +from modelscope.models import MODELS +from modelscope.models.base import TorchModel +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@MODELS.register_module('face-recognition-ood', 'rts-backbone') +class RTSBackbone(TorchModel): + + def __init__(self, *args, **kwargs): + super(RTSBackbone, self).__init__() + # model initialization + self.alpha = kwargs.get('alpha') + self.rts_plus = kwargs.get('rts_plus') + resnet = Backbone([112, 112], 64, mode='ir_se') + + self.features = nn.Sequential( + resnet.input_layer, resnet.body, + Sequential( + BatchNorm2d(512), + Dropout(), + Flatten(), + )) + + self.features_backbone = nn.Sequential( + Linear(512 * 7 * 7, 512), + BatchNorm1d(512), + ) + + self.logvar_rts_backbone = nn.Sequential( + Linear(512 * 7 * 7, 1), + BatchNorm1d(1), + ) + + self.logvar_rts_plus_backbone = nn.Sequential( + Linear(512 * 7 * 7, self.alpha), + BatchNorm1d(self.alpha), + ) + + def forward(self, img): + x = self.features(img) + image_features = self.features_backbone(x) + if not self.rts_plus: + logvar = self.logvar_rts_backbone(x) + else: + logvar = self.logvar_rts_plus_backbone(x) + return image_features, logvar + + @classmethod + def _instantiate(cls, **kwargs): + model_file = kwargs.get('am_model_name', ModelFile.TORCH_MODEL_FILE) + ckpt_path = os.path.join(kwargs['model_dir'], model_file) + logger.info(f'loading model from {ckpt_path}') + model_dir = kwargs.pop('model_dir') + model = cls(**kwargs) + ckpt_path = os.path.join(model_dir, model_file) + model.load_state_dict(torch.load(ckpt_path, map_location='cpu')) + return model + + +def l2_norm(input, axis=1): + norm = torch.norm(input, 2, axis, True) + output = torch.div(input, norm) + + return output + + +class SEModule(Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2d(1) + self.fc1 = Conv2d( + channels, + channels // reduction, + kernel_size=1, + padding=0, + bias=False) + + nn.init.xavier_uniform_(self.fc1.weight.data) + + self.relu = ReLU(inplace=True) + self.fc2 = Conv2d( + channels // reduction, + channels, + kernel_size=1, + padding=0, + bias=False) + + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + + return module_input * x + + +class bottleneck_IR_SE(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR_SE, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth), SEModule(depth, 16)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + '''A named tuple describing a ResNet block.''' + + +def get_block(in_channel, 
depth, num_units, stride=2): + + return [Bottleneck(in_channel, depth, stride) + ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 64: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=16), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + + return blocks + + +class Backbone(Module): + + def __init__(self, input_size, num_layers, mode='ir'): + super(Backbone, self).__init__() + assert input_size[0] in [ + 112, 224 + ], 'input_size should be [112, 112] or [224, 224]' + assert num_layers in [50, 64, 100, + 152], 'num_layers should be 50, 64, 100 or 152' + assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = bottleneck_IR + elif mode == 'ir_se': + unit_module = bottleneck_IR_SE + self.input_layer = Sequential( + Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), + PReLU(64)) + if input_size[0] == 112: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(), Flatten(), + Linear(512 * 7 * 7, 512), BatchNorm1d(512)) + else: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(), Flatten(), + Linear(512 * 14 * 14, 512), BatchNorm1d(512)) + + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, bottleneck.depth, + bottleneck.stride)) + self.body = Sequential(*modules) + + def forward(self, x): + x = self.input_layer(x) + x = self.body(x) + x = self.output_layer(x) + + return x diff --git a/modelscope/models/cv/image_binary_quant_classification/__init__.py b/modelscope/models/cv/image_binary_quant_classification/__init__.py new file mode 100644 index 00000000..88911773 --- /dev/null +++ b/modelscope/models/cv/image_binary_quant_classification/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
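For orientation on the IR-SE backbone above: get_blocks only expands per-stage unit counts into flat (in_channel, depth, stride) bottleneck specs, with a stride-2 unit opening each stage. A quick sketch of that expansion using plain tuples instead of the Bottleneck namedtuple (expand_stage is our name):

def expand_stage(in_channel, depth, num_units, stride=2):
    # The first unit downsamples and widens; the rest keep stride 1.
    return [(in_channel, depth, stride)] + [(depth, depth, 1)] * (num_units - 1)


# The num_layers=50 configuration from get_blocks: 3 + 4 + 14 + 3 = 24 units.
stages_50 = [
    expand_stage(64, 64, 3),
    expand_stage(64, 128, 4),
    expand_stage(128, 256, 14),
    expand_stage(256, 512, 3),
]
assert sum(len(stage) for stage in stages_50) == 24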
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .binary_quant_model import BinaryQuantClassificationModel + +else: + _import_structure = { + 'binary_quant_model': ['BinaryQuantClassificationModel'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_binary_quant_classification/binary_quant_model.py b/modelscope/models/cv/image_binary_quant_classification/binary_quant_model.py new file mode 100644 index 00000000..0afcbe1d --- /dev/null +++ b/modelscope/models/cv/image_binary_quant_classification/binary_quant_model.py @@ -0,0 +1,82 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from collections import OrderedDict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.hub import read_config +from modelscope.utils.logger import get_logger +from .bnext import BNext + +logger = get_logger() + +__all__ = ['BinaryQuantClassificationModel'] + + +@MODELS.register_module(Tasks.image_classification, module_name=Models.bnext) +class BinaryQuantClassificationModel(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + if torch.cuda.is_available(): + self._device = torch.device('cuda') + logger.info('Use GPU: {}'.format(self._device)) + else: + self._device = torch.device('cpu') + logger.info('Use CPU: {}'.format(self._device)) + + self.model = BNext(num_classes=1000) + self.model = self.model.to(self._device) + + self.model_dir = model_dir + + self._load_pretrained_checkpoint() + + def forward(self, inputs, return_loss=False): + + return self.model(**inputs) + + def _convert_state_dict(self, state_dict): + """Converts a state dict saved from a dataParallel module to normal + module state_dict inplace + :param state_dict is the loaded DataParallel model_state + """ + if not next(iter(state_dict)).startswith('module.'): + return state_dict # abort if dict is not a DataParallel model_state + new_state_dict = OrderedDict() + + split_index = 0 + for cur_key, _ in state_dict.items(): + if cur_key.startswith('module.model'): + split_index = 13 + elif cur_key.startswith('module'): + split_index = 7 + break + + for k, v in state_dict.items(): + name = k[split_index:] # remove `module.` + new_state_dict[name] = v + return new_state_dict + + def _load_pretrained_checkpoint(self): + model_path = os.path.join(self.model_dir, ModelFile.TORCH_MODEL_FILE) + logger.info(model_path) + if os.path.exists(model_path): + ckpt = torch.load(model_path, 'cpu') + model_state = self._convert_state_dict(ckpt['state_dict']) + + if ckpt.get('meta', None): + self.CLASSES = ckpt['meta'] + self.config_type = 'ms_config' + self.model.load_state_dict(model_state) + self.model.to(self._device) + + else: + logger.error( + '[checkModelPath]:model path dose not exits!!! 
model Path:' + + model_path) + raise Exception('[checkModelPath]:model path dose not exits!') diff --git a/modelscope/models/cv/image_binary_quant_classification/bnext.py b/modelscope/models/cv/image_binary_quant_classification/bnext.py new file mode 100644 index 00000000..fd1f7344 --- /dev/null +++ b/modelscope/models/cv/image_binary_quant_classification/bnext.py @@ -0,0 +1,664 @@ +# Part of the implementation is borrowed and modified from BNext, +# publicly available at https://github.com/hpi-xnor/BNext +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +# stage ratio: 1:1:3:1 +stage_out_channel_tiny = [32] + [ + 64 +] + [128] * 2 + [256] * 2 + [512] * 6 + [1024] * 2 + +# stage ratio 1:1:3:1 +stage_out_channel_small = [48] + [ + 96 +] + [192] * 2 + [384] * 2 + [768] * 6 + [1536] * 2 + +# stage ratio 2:2:4:2 +stage_out_channel_middle = [48] + [ + 96 +] + [192] * 4 + [384] * 4 + [768] * 8 + [1536] * 4 + +# stage ratio 2:2:8:2 +stage_out_channel_large = [64] + [ + 128 +] + [256] * 4 + [512] * 4 + [1024] * 16 + [2048] * 4 + + +def conv3x3(in_planes, + out_planes, + kernel_size=3, + stride=1, + groups=1, + dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + dilation=dilation, + groups=groups, + bias=False) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class HardSigmoid(nn.Module): + + def __init__(self, ): + super(HardSigmoid, self).__init__() + + def forward(self, x): + return F.relu6(x + 3) / 6 + + +class firstconv3x3(nn.Module): + + def __init__(self, inp, oup, stride): + super(firstconv3x3, self).__init__() + + self.conv1 = nn.Conv2d(inp, oup, 3, stride, 1, bias=False) + self.bn1 = nn.BatchNorm2d(oup) + self.prelu = nn.PReLU(oup, oup) + + def forward(self, x): + + out = self.conv1(x) + out = self.bn1(out) + + return out + + +class LearnableBias(nn.Module): + + def __init__(self, out_chn): + super(LearnableBias, self).__init__() + self.bias = nn.Parameter( + torch.zeros(1, out_chn, 1, 1), requires_grad=True) + + def forward(self, x): + out = x + self.bias.expand_as(x) + return out + + +class HardSign(nn.Module): + + def __init__(self, range=[-1, 1], progressive=False): + super(HardSign, self).__init__() + self.range = range + self.progressive = progressive + self.register_buffer('temperature', torch.ones(1)) + + def adjust(self, x, scale=0.1): + self.temperature.mul_(scale) + + def forward(self, x): + replace = x.clamp(self.range[0], self.range[1]) + x = x.div(self.temperature.clamp(min=1e-8)).clamp(-1, 1) + if not self.progressive: + sign = x.sign() + else: + sign = x + return (sign - replace).detach() + replace + + +class HardBinaryConv(nn.Module): + + def __init__(self, + in_chn, + out_chn, + kernel_size=3, + stride=1, + padding=1, + groups=1): + super(HardBinaryConv, self).__init__() + self.stride = stride + self.padding = kernel_size // 2 + self.groups = groups + self.number_of_weights = in_chn // groups * out_chn * kernel_size * kernel_size + self.shape = (out_chn, in_chn // groups, kernel_size, kernel_size) + self.weight = nn.Parameter( + torch.randn((self.shape)) * 0.001, requires_grad=True) + + self.register_buffer('temperature', torch.ones(1)) + + def forward(self, x): + if self.training: + self.weight.data.clamp_(-1.5, 1.5) + + real_weights = self.weight + + if self.temperature < 1e-7: + 
binary_weights_no_grad = real_weights.sign() + else: + binary_weights_no_grad = ( + real_weights / self.temperature.clamp(min=1e-8)).clamp(-1, 1) + cliped_weights = real_weights + + if self.training: + binary_weights = binary_weights_no_grad.detach( + ) - cliped_weights.detach() + cliped_weights + else: + binary_weights = binary_weights_no_grad + + y = F.conv2d( + x, + binary_weights, + stride=self.stride, + padding=self.padding, + groups=self.groups) + + return y + + +class SqueezeAndExpand(nn.Module): + + def __init__(self, + channels, + planes, + ratio=8, + attention_mode='hard_sigmoid'): + super(SqueezeAndExpand, self).__init__() + self.se = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + nn.Conv2d(channels, channels // ratio, kernel_size=1, padding=0), + nn.ReLU(channels // ratio), + nn.Conv2d(channels // ratio, planes, kernel_size=1, padding=0), + ) + + if attention_mode == 'sigmoid': + self.attention = nn.Sigmoid() + + elif attention_mode == 'hard_sigmoid': + self.attention = HardSigmoid() + + else: + self.attention = nn.Softmax(dim=1) + + def forward(self, x): + x = self.se(x) + x = self.attention(x) + return x + + +class Attention(nn.Module): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + drop_rate=0.1, + infor_recoupling=True, + groups=1): + super(Attention, self).__init__() + + self.inplanes = inplanes + self.planes = planes + self.infor_recoupling = infor_recoupling + + self.move = LearnableBias(inplanes) + self.binary_activation = HardSign(range=[-1.5, 1.5]) + self.binary_conv = HardBinaryConv( + inplanes, planes, kernel_size=3, stride=stride, groups=groups) + + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + + self.activation1 = nn.PReLU(inplanes) + self.activation2 = nn.PReLU(planes) + + self.downsample = downsample + self.stride = stride + if stride == 2: + self.pooling = nn.AvgPool2d(2, 2) + + if self.infor_recoupling: + self.se = SqueezeAndExpand( + planes, planes, attention_mode='sigmoid') + self.scale = nn.Parameter(torch.ones(1, planes, 1, 1) * 0.5) + + def forward(self, input): + + residual = self.activation1(input) + + if self.stride == 2: + residual = self.pooling(residual) + + x = self.move(input) + x = self.binary_activation(x) + x = self.binary_conv(x) + x = self.norm1(x) + x = self.activation2(x) + + if self.infor_recoupling: + if self.training: + self.scale.data.clamp_(0, 1) + if self.stride == 2: + input = self.pooling(input) + mix = self.scale * input + x * (1 - self.scale) + x = self.se(mix) * x + else: + pass + x = x * residual + x = self.norm2(x) + x = x + residual + + return x + + +class FFN_3x3(nn.Module): + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + drop_rate=0.1, + infor_recoupling=True, + groups=1): + super(FFN_3x3, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.infor_recoupling = infor_recoupling + + self.move = LearnableBias(inplanes) + self.binary_activation = HardSign(range=[-1.5, 1.5]) + self.binary_conv = HardBinaryConv( + inplanes, planes, kernel_size=3, stride=stride, groups=groups) + + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + + self.activation1 = nn.PReLU(inplanes) + self.activation2 = nn.PReLU(planes) + + if stride == 2: + self.pooling = nn.AvgPool2d(2, 2) + + if self.infor_recoupling: + self.se = SqueezeAndExpand( + inplanes, planes, attention_mode='sigmoid') + self.scale = nn.Parameter(torch.ones(1, planes, 1, 1) * 0.5) + + def forward(self, input): 
+ + residual = input + + if self.stride == 2: + residual = self.pooling(residual) + + x = self.move(input) + x = self.binary_activation(x) + x = self.binary_conv(x) + x = self.norm1(x) + x = self.activation2(x) + + if self.infor_recoupling: + if self.training: + self.scale.data.clamp_(0, 1) + if self.stride == 2: + input = self.pooling(input) + mix = self.scale * input + (1 - self.scale) * x + x = self.se(mix) * x + x = self.norm2(x) + else: + pass + + x = x + residual + + return x + + +class FFN_1x1(nn.Module): + + def __init__(self, + inplanes, + planes, + stride=1, + attention=True, + drop_rate=0.1, + infor_recoupling=True): + super(FFN_1x1, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.infor_recoupling = infor_recoupling + + self.move = LearnableBias(inplanes) + self.binary_activation = HardSign(range=[-1.5, 1.5]) + self.binary_conv = HardBinaryConv( + inplanes, planes, kernel_size=1, stride=stride, padding=0) + + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + + self.activation1 = nn.PReLU(inplanes) + self.activation2 = nn.PReLU(planes) + + if stride == 2: + self.pooling = nn.AvgPool2d(2, 2) + + if self.infor_recoupling: + self.se = SqueezeAndExpand( + inplanes, planes, attention_mode='sigmoid') + self.scale = nn.Parameter(torch.ones(1, planes, 1, 1) * 0.5) + + def forward(self, input): + + residual = input + + if self.stride == 2: + residual = self.pooling(residual) + + x = self.move(input) + x = self.binary_activation(x) + x = self.binary_conv(x) + x = self.norm1(x) + x = self.activation2(x) + if self.infor_recoupling: + self.scale.data.clamp_(0, 1) + mix = self.scale * input + (1 - self.scale) * x + x = self.se(mix) * x + x = self.norm2(x) + else: + pass + + x = x + residual + + return x + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + drop_rate=0.1, + mode='scale'): + super(BasicBlock, self).__init__() + self.inplanes = inplanes + self.planes = planes + + if mode == 'scale': + self.Attention = Attention( + inplanes, + inplanes, + stride, + None, + drop_rate=drop_rate, + groups=1) + else: + self.Attention = FFN_3x3( + inplanes, + inplanes, + stride, + None, + drop_rate=drop_rate, + groups=1) + + if inplanes == planes: + self.FFN = FFN_1x1(inplanes, inplanes, drop_rate=drop_rate) + + else: + self.FFN_1 = FFN_1x1(inplanes, inplanes, drop_rate=drop_rate) + + self.FFN_2 = FFN_1x1(inplanes, inplanes, drop_rate=drop_rate) + + def forward(self, input): + x = self.Attention(input) + + if self.inplanes == self.planes: + y = self.FFN(x) + + else: + y_1 = self.FFN_1(x) + y_2 = self.FFN_2(x) + y = torch.cat((y_1, y_2), dim=1) + + return y + + +class BasicBlock_No_ELM_Attention(nn.Module): + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + drop_rate=0.1, + mode='scale'): + super(BasicBlock_No_ELM_Attention, self).__init__() + self.inplanes = inplanes + self.planes = planes + + self.FFN_3x3 = FFN_3x3( + inplanes, inplanes, stride, None, drop_rate=drop_rate, groups=1) + + if self.inplanes == self.planes: + self.FFN = FFN_1x1(inplanes, inplanes, drop_rate=drop_rate) + else: + self.FFN_1 = FFN_1x1(inplanes, inplanes, drop_rate=drop_rate) + self.FFN_2 = FFN_1x1(inplanes, inplanes, drop_rate=drop_rate) + + def forward(self, input): + x = self.FFN_3x3(input) + if self.inplanes == self.planes: + y = self.FFN(x) + else: + y_1 = self.FFN_1(x) + y_2 = self.FFN_2(x) + y = torch.cat((y_1, y_2), dim=1) + + return y + + +class 
BasicBlock_No_Infor_Recoupling(nn.Module): + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + drop_rate=0.1, + mode='scale'): + super(BasicBlock_No_Infor_Recoupling, self).__init__() + self.inplanes = inplanes + self.planes = planes + + if mode == 'scale': + self.Attention = Attention( + inplanes, + inplanes, + stride, + None, + drop_rate, + infor_recoupling=False, + groups=1) + else: + self.Attention = FFN_3x3( + inplanes, + inplanes, + stride, + None, + drop_rate=drop_rate, + infor_recoupling=False, + groups=1) + + if self.inplanes == self.planes: + self.FFN = FFN_1x1( + inplanes, + inplanes, + drop_rate=drop_rate, + infor_recoupling=False) + else: + self.FFN_1 = FFN_1x1( + inplanes, + inplanes, + drop_rate=drop_rate, + infor_recoupling=False) + self.FFN_2 = FFN_1x1( + inplanes, + inplanes, + drop_rate=drop_rate, + infor_recoupling=False) + + def forward(self, input): + x = self.Attention(input) + if self.inplanes == self.planes: + y = self.FFN(x) + else: + y_1 = self.FFN_1(x) + y_2 = self.FFN_2(x) + y = torch.cat((y_1, y_2), dim=1) + + return y + + +class BasicBlock_No_Extra_Design(nn.Module): + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + drop_rate=0.1, + mode='scale'): + super(BasicBlock_No_Extra_Design, self).__init__() + self.inplanes = inplanes + self.planes = planes + + self.FFN_3x3 = FFN_3x3( + inplanes, + inplanes, + stride, + None, + drop_rate, + infor_recoupling=False, + groups=1) + if self.inplanes == self.planes: + self.FFN = FFN_1x1( + inplanes, + inplanes, + drop_rate=drop_rate, + infor_recoupling=False) + else: + self.FFN_1 = FFN_1x1( + inplanes, + inplanes, + drop_rate=drop_rate, + infor_recoupling=False) + self.FFN_2 = FFN_1x1( + inplanes, + inplanes, + drop_rate=drop_rate, + infor_recoupling=False) + + def forward(self, input): + x = self.FFN_3x3(input) + if self.inplanes == self.planes: + y = self.FFN(x) + else: + y_1 = self.FFN_1(x) + y_2 = self.FFN_2(x) + y = torch.cat((y_1, y_2), dim=1) + + return y + + +class BNext(nn.Module): + + def __init__(self, + num_classes=1000, + size='small', + ELM_Attention=True, + Infor_Recoupling=True): + super(BNext, self).__init__() + drop_rate = 0.2 if num_classes == 100 else 0.0 + + if size == 'tiny': + stage_out_channel = stage_out_channel_tiny + elif size == 'small': + stage_out_channel = stage_out_channel_small + elif size == 'middle': + stage_out_channel = stage_out_channel_middle + elif size == 'large': + stage_out_channel = stage_out_channel_large + else: + raise ValueError('The size is not defined!') + + if ELM_Attention and Infor_Recoupling: + basicblock = BasicBlock + print('Model with ELM Attention and Infor-Recoupling') + elif (ELM_Attention and not Infor_Recoupling): + basicblock = BasicBlock_No_Infor_Recoupling + print('Model with ELM Attention, No Infor-Recoupling') + elif (not ELM_Attention and Infor_Recoupling): + basicblock = BasicBlock_No_ELM_Attention + print('Model with Infor-Recoupling, No ELM Attention') + else: + basicblock = BasicBlock_No_Extra_Design + print('Model with no Extra Design') + + self.feature = nn.ModuleList() + drop_rates = [ + x.item() + for x in torch.linspace(0, drop_rate, (len(stage_out_channel))) + ] + + for i in range(len(stage_out_channel)): + if i == 0: + self.feature.append( + firstconv3x3(3, stage_out_channel[i], + 1 if num_classes != 1000 else 2)) + elif i == 1: + self.feature.append((basicblock( + stage_out_channel[i - 1], + stage_out_channel[i], + 1, + drop_rate=drop_rates[i], + mode='bias'))) + elif stage_out_channel[i - 1] 
!= stage_out_channel[ + i] and stage_out_channel[i] != stage_out_channel[1]: + self.feature.append( + basicblock( + stage_out_channel[i - 1], + stage_out_channel[i], + 2, + drop_rate=drop_rates[i], + mode='scale' if i % 2 == 0 else 'bias')) + else: + self.feature.append( + basicblock( + stage_out_channel[i - 1], + stage_out_channel[i], + 1, + drop_rate=drop_rates[i], + mode='scale' if i % 2 == 0 else 'bias')) + + self.prelu = nn.PReLU(stage_out_channel[-1]) + self.pool1 = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Linear(stage_out_channel[-1], num_classes) + + def forward(self, img, return_loss=False, img_metas=None): + x = img + for i, block in enumerate(self.feature): + x = block(x) + x = self.prelu(x) + x = self.pool1(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + x = list(x.detach().cpu().numpy()) + return x diff --git a/modelscope/models/cv/image_classification/backbones/__init__.py b/modelscope/models/cv/image_classification/backbones/__init__.py index 79a3a4ed..b25ff4c9 100644 --- a/modelscope/models/cv/image_classification/backbones/__init__.py +++ b/modelscope/models/cv/image_classification/backbones/__init__.py @@ -1,2 +1,3 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from .beit_v2 import BEiTv2 from .nextvit import NextViT diff --git a/modelscope/models/cv/image_classification/backbones/beit_v2.py b/modelscope/models/cv/image_classification/backbones/beit_v2.py new file mode 100644 index 00000000..eda11727 --- /dev/null +++ b/modelscope/models/cv/image_classification/backbones/beit_v2.py @@ -0,0 +1,529 @@ +# Part of the implementation is borrowed and modified from beit2, +# publicly available at https://github.com/microsoft/unilm/tree/master/beit2 +import collections.abc +import itertools +import math +import os +import warnings +from functools import partial +from typing import Dict, Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from mmcls.models.backbones.base_backbone import BaseBackbone +from mmcls.models.builder import BACKBONES +from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer +from mmcv.runner import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from ..utils import to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + window_size=None, + attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + 
self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] + - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, + coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer('relative_position_index', + relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, + x, + rel_pos_bias=None, + return_attention=False, + return_qkv=False): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat( + (self.q_bias, + torch.zeros_like(self.v_bias, + requires_grad=False), self.v_bias)) + # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) (B, H, N, C) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + if return_attention: + return attn + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + if return_qkv: + return x, qkv + + return x + + +class Block(nn.Module): + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + 
window_size=window_size, + attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + if init_values > 0: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((dim)), requires_grad=True) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, + x, + rel_pos_bias=None, + return_attention=False, + return_qkv=False): + if return_attention: + return self.attn( + self.norm1(x), + rel_pos_bias=rel_pos_bias, + return_attention=True) + if return_qkv: + y, qkv = self.attn( + self.norm1(x), + rel_pos_bias=rel_pos_bias, + return_qkv=return_qkv) + x = x + self.drop_path(self.gamma_1 * y) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x, qkv + + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * ( + img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + # assert H == self.img_size[0] and W == self.img_size[1], \ + # f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
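The relative-position bias used by the Attention module above (and by the shared RelativePositionBias further down) is driven entirely by a precomputed index into a table of size (2*Wh-1)*(2*Ww-1) + 3, where the three extra rows cover cls-to-token, token-to-cls and cls-to-cls. A standalone sketch of that index construction, mirroring the code in this change (the helper name is ours):

import torch


def relative_position_index(window_size):
    Wh, Ww = window_size
    num_rel = (2 * Wh - 1) * (2 * Ww - 1) + 3
    coords = torch.stack(
        torch.meshgrid(torch.arange(Wh), torch.arange(Ww), indexing='ij'))
    flat = torch.flatten(coords, 1)            # 2, Wh*Ww
    rel = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
    rel = rel.permute(1, 2, 0).contiguous()    # Wh*Ww, Wh*Ww, 2
    rel[:, :, 0] += Wh - 1                     # shift offsets to start at 0
    rel[:, :, 1] += Ww - 1
    rel[:, :, 0] *= 2 * Ww - 1
    index = torch.zeros((Wh * Ww + 1,) * 2, dtype=rel.dtype)
    index[1:, 1:] = rel.sum(-1)                # token-to-token offsets
    index[0, 0:] = num_rel - 3                 # cls attends to tokens
    index[0:, 0] = num_rel - 2                 # tokens attend to cls
    index[0, 0] = num_rel - 1                  # cls attends to itself
    return index, num_rel


idx, n = relative_position_index((14, 14))
assert int(idx.max()) == n - 1  # every entry addresses a row of the bias table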
+ x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +class RelativePositionBias(nn.Module): + + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] + - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(window_size[0]) + coords_w = torch.arange(window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, + 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer('relative_position_index', + relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +@BACKBONES.register_module() +class BEiTv2(BaseBackbone): + embed_dims = {'base': 768, 'large': 1024, 'huge': 1280, 'giant': 1408} + depths = {'base': 12, 'large': 24, 'huge': 32, 'giant': 40} + num_heads = {'base': 12, 'large': 16, 'huge': 16, 'giant': 16} + mlp_ratios = {'base': 4, 'large': 4, 'huge': 4, 'giant': 6144 / 1408} + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, + arch='base', + patch_size=16, + img_size=224, + in_chans=3, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=None, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + use_mean_pooling=True, + init_scale=0.001, + out_indices=-1, + frozen_stages=-1, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + embed_dim = self.embed_dims[arch] + depth = self.depths[arch] + num_heads = self.num_heads[arch] + mlp_ratio = self.mlp_ratios[arch] + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + if use_abs_pos_emb: + self.pos_embed = nn.Parameter( + torch.zeros(1, 
num_patches + 1, embed_dim)) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias else None) for i in range(depth) + ]) + self.norm = nn.Identity() if use_mean_pooling else norm_layer( + embed_dim) + self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + + def init_weights(self): + super(BEiTv2, self).init_weights() + if (isinstance(self.init_cfg, dict) + and self.init_cfg['type'] == 'Pretrained'): + # Suppress default init if use pretrained model. + return + + self.apply(self._init_weights) + self.fix_init_weight() + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def fix_init_weight(self): + + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def interpolate_pos_encoding(self, x, w, h): + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + class_pos_embed = self.pos_embed[:, 0] + patch_pos_embed = self.pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_embed.patch_size[0] + h0 = h // self.patch_embed.patch_size[0] + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), + dim).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode='bicubic', + ) + assert int(w0) == patch_pos_embed.shape[-2] and int( + h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), + dim=1) + + def forward_features(self, + x, + return_patch_tokens=False, + return_all_tokens=False, + **kwargs): + B, nc, w, h = x.shape + x = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand( + batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + + x = torch.cat((cls_tokens, x), dim=1) + if self.pos_embed is not None: + if x.shape[1] != self.pos_embed.shape[1]: + x = x + self.interpolate_pos_encoding(x, w, h) + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias( + ) if self.rel_pos_bias is not None else None + for blk in self.blocks: + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = 
self.norm(x) + if self.fc_norm is not None: + if return_all_tokens: + return self.fc_norm(x) + t = x[:, 1:, :] + if return_patch_tokens: + return self.fc_norm(t) + else: + return self.fc_norm(t.mean(1)) + else: + if return_all_tokens: + return x + elif return_patch_tokens: + return x[:, 1:] + else: + return x[:, 0] + + def forward(self, + x, + return_patch_tokens=False, + return_all_tokens=False, + **kwargs): + x = self.forward_features( + x, + return_patch_tokens=return_patch_tokens, + return_all_tokens=return_all_tokens, + **kwargs) + return tuple([x]) + + def _freeze_stages(self): + if self.frozen_stages > 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + for idx, layer in enumerate(self.blocks): + if idx <= self.frozen_stages - 1: + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(BEiTv2, self).train(mode) + self._freeze_stages() diff --git a/modelscope/models/cv/image_classification/backbones/nextvit.py b/modelscope/models/cv/image_classification/backbones/nextvit.py index ecf0d15e..0b69f40e 100644 --- a/modelscope/models/cv/image_classification/backbones/nextvit.py +++ b/modelscope/models/cv/image_classification/backbones/nextvit.py @@ -17,50 +17,11 @@ from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer from mmcv.runner import BaseModule from torch.nn.modules.batchnorm import _BatchNorm +from ..utils import trunc_normal_ + NORM_EPS = 1e-5 -def _no_grad_trunc_normal_(tensor, mean, std, a, b): - # Cut & paste from PyTorch official master until it's in a few official releases - RW - # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): - # Computes standard normal cumulative distribution function - return (1. + math.erf(x / math.sqrt(2.))) / 2. - - if (mean < a - 2 * std) or (mean > b + 2 * std): - warnings.warn( - 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' - 'The distribution of values may be incorrect.', - stacklevel=2) - - with torch.no_grad(): - # Values are generated by using a truncated uniform distribution and - # then using the inverse CDF for the normal distribution. - # Get upper and lower cdf values - ll = norm_cdf((a - mean) / std) - u = norm_cdf((b - mean) / std) - - # Uniformly fill tensor with values from [ll, u], then translate to - # [2ll-1, 2u-1]. - tensor.uniform_(2 * ll - 1, 2 * u - 1) - - # Use inverse cdf transform for normal distribution to get truncated - # standard normal - tensor.erfinv_() - - # Transform to proper mean, std - tensor.mul_(std * math.sqrt(2.)) - tensor.add_(mean) - - # Clamp to ensure it's in the proper range - tensor.clamp_(min=a, max=b) - return tensor - - -def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): - return _no_grad_trunc_normal_(tensor, mean, std, a, b) - - class ConvBNReLU(nn.Module): def __init__(self, diff --git a/modelscope/models/cv/image_classification/utils.py b/modelscope/models/cv/image_classification/utils.py index 32777b9b..5acb53ae 100644 --- a/modelscope/models/cv/image_classification/utils.py +++ b/modelscope/models/cv/image_classification/utils.py @@ -1,7 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import warnings  # used by _trunc_normal_ below
+import collections.abc +import math import os.path as osp +from itertools import repeat import numpy as np +import torch from mmcls.datasets.base_dataset import BaseDataset @@ -73,12 +77,17 @@ def get_classes(classes=None): class MmDataset(BaseDataset): - def __init__(self, ms_dataset, pipeline, classes=None, test_mode=False): + def __init__(self, + ms_dataset, + pipeline, + classes=None, + test_mode=False, + data_prefix=''): self.ms_dataset = ms_dataset if len(self.ms_dataset) < 1: raise ValueError('Dataset Error: dataset is empty') super(MmDataset, self).__init__( - data_prefix='', + data_prefix=data_prefix, pipeline=pipeline, classes=classes, test_mode=test_mode) @@ -98,3 +107,63 @@ class MmDataset(BaseDataset): data_infos.append(info) return data_infos + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + v = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [v, u], then translate to + # [2v-1, 2u-1]. + tensor.uniform_(2 * v - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + with torch.no_grad(): + return _trunc_normal_(tensor, mean, std, a, b) + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple diff --git a/modelscope/models/cv/image_color_enhance/image_color_enhance.py b/modelscope/models/cv/image_color_enhance/image_color_enhance.py index 0bd74197..5c17aa58 100644 --- a/modelscope/models/cv/image_color_enhance/image_color_enhance.py +++ b/modelscope/models/cv/image_color_enhance/image_color_enhance.py @@ -1,10 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp -from copy import deepcopy from typing import Dict, Union import torch -from torch.nn.parallel import DataParallel, DistributedDataParallel from modelscope.metainfo import Models from modelscope.models.base import Tensor, TorchModel @@ -40,37 +38,13 @@ class ImageColorEnhance(TorchModel): self._device = torch.device('cpu') self.model = self.model.to(self._device) - self.model = self.load_pretrained(self.model, model_path) + self.model = self._load_pretrained(self.model, model_path) if self.training: self.model.train() else: self.model.eval() - def load_pretrained(self, net, load_path, strict=True, param_key='params'): - if isinstance(net, (DataParallel, DistributedDataParallel)): - net = net.module - load_net = torch.load( - load_path, map_location=lambda storage, loc: storage) - if param_key is not None: - if param_key not in load_net and 'params' in load_net: - param_key = 'params' - logger.info( - f'Loading: {param_key} does not exist, use params.') - if param_key in load_net: - load_net = load_net[param_key] - logger.info( - f'Loading {net.__class__.__name__} model from {load_path}, with param key: [{param_key}].' - ) - # remove unnecessary 'module.' - for k, v in deepcopy(load_net).items(): - if k.startswith('module.'): - load_net[k[7:]] = v - load_net.pop(k) - net.load_state_dict(load_net, strict=strict) - logger.info('load model done.') - return net - def _evaluate_postprocess(self, src: Tensor, target: Tensor) -> Dict[str, list]: preds = self.model(src) diff --git a/modelscope/models/cv/image_colorization/__init__.py b/modelscope/models/cv/image_colorization/__init__.py index 9dbb07a5..a9facc8a 100644 --- a/modelscope/models/cv/image_colorization/__init__.py +++ b/modelscope/models/cv/image_colorization/__init__.py @@ -4,13 +4,13 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .unet import DynamicUnetWide, DynamicUnetDeep - from .utils import NormType + from .unet import DynamicUnetWide, DynamicUnetDeep, NormType + from .ddcolor import DDColorForImageColorization else: _import_structure = { - 'unet': ['DynamicUnetWide', 'DynamicUnetDeep'], - 'utils': ['NormType'] + 'unet': ['DynamicUnetWide', 'DynamicUnetDeep', 'NormType'], + 'ddcolor': ['DDColorForImageColorization'], } import sys diff --git a/modelscope/models/cv/image_colorization/ddcolor/__init__.py b/modelscope/models/cv/image_colorization/ddcolor/__init__.py new file mode 100644 index 00000000..a185e294 --- /dev/null +++ b/modelscope/models/cv/image_colorization/ddcolor/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .ddcolor_for_image_colorization import DDColorForImageColorization diff --git a/modelscope/models/cv/image_colorization/ddcolor/ddcolor.py b/modelscope/models/cv/image_colorization/ddcolor/ddcolor.py new file mode 100644 index 00000000..75ae44f2 --- /dev/null +++ b/modelscope/models/cv/image_colorization/ddcolor/ddcolor.py @@ -0,0 +1,283 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
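+# Overview of the modules below: Encoder wraps a ConvNeXt backbone and grabs
+# its four stage outputs ('norm0'..'norm3') via forward hooks; Decoder
+# upsamples the deepest feature through UnetBlockWide / CustomPixelShuffle_ICNR
+# blocks while MultiScaleColorDecoder refines a set of learnable color queries
+# against three feature scales with alternating cross-attention,
+# self-attention and FFN layers; refine_net finally fuses the query-derived
+# feature map with the input image into a 2-channel output (presumably the ab
+# chroma channels of the colorized result).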
+ +import torch +import torch.nn as nn + +from .utils.convnext import ConvNeXt +from .utils.position_encoding import PositionEmbeddingSine +from .utils.transformer_utils import (MLP, CrossAttentionLayer, FFNLayer, + SelfAttentionLayer) +from .utils.unet import (CustomPixelShuffle_ICNR, Hook, NormType, + UnetBlockWide, custom_conv_layer) + + +class DDColor(nn.Module): + + def __init__(self, + encoder_name='convnext-l', + input_size=(256, 256), + num_queries=100): + + super().__init__() + + self.encoder = Encoder(encoder_name, + ['norm0', 'norm1', 'norm2', 'norm3']) + self.encoder.eval() + test_input = torch.randn(1, 3, *input_size) + self.encoder(test_input) + + self.decoder = Decoder( + self.encoder.hooks, + nf=512, + last_norm='Spectral', + num_queries=num_queries, + num_scales=3, + dec_layers=9, + ) + self.refine_net = nn.Sequential( + custom_conv_layer( + num_queries + 3, + 2, + ks=1, + use_activ=False, + norm_type=NormType.Spectral)) + + self.register_buffer( + 'mean', + torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) + self.register_buffer( + 'std', + torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) + + def normalize(self, img): + return (img - self.mean) / self.std + + def forward(self, img): + if img.shape[1] == 3: + img = self.normalize(img) + + self.encoder(img) + out_feat = self.decoder() + coarse_input = torch.cat([out_feat, img], dim=1) + out = self.refine_net(coarse_input) + + return out + + +class Decoder(nn.Module): + + def __init__(self, + hooks, + nf=512, + blur=True, + last_norm='Spectral', + num_queries=100, + num_scales=3, + dec_layers=9): + super().__init__() + self.hooks = hooks + self.nf = nf + self.blur = blur + self.last_norm = getattr(NormType, last_norm) + + self.layers = self.make_layers() + embed_dim = nf // 2 + + self.last_shuf = CustomPixelShuffle_ICNR( + embed_dim, + embed_dim, + blur=self.blur, + norm_type=self.last_norm, + scale=4) + + self.color_decoder = MultiScaleColorDecoder( + in_channels=[512, 512, 256], + num_queries=num_queries, + num_scales=num_scales, + dec_layers=dec_layers, + ) + + def forward(self): + encode_feat = self.hooks[-1].feature + out0 = self.layers[0](encode_feat) + out1 = self.layers[1](out0) + out2 = self.layers[2](out1) + out3 = self.last_shuf(out2) + out = self.color_decoder([out0, out1, out2], out3) + + return out + + def make_layers(self): + decoder_layers = [] + + e_in_c = self.hooks[-1].feature.shape[1] + in_c = e_in_c + + out_c = self.nf + setup_hooks = self.hooks[-2::-1] + for layer_index, hook in enumerate(setup_hooks): + feature_c = hook.feature.shape[1] + if layer_index == len(setup_hooks) - 1: + out_c = out_c // 2 + decoder_layers.append( + UnetBlockWide( + in_c, + feature_c, + out_c, + hook, + blur=self.blur, + self_attention=False, + norm_type=NormType.Spectral)) + in_c = out_c + return nn.Sequential(*decoder_layers) + + +class Encoder(nn.Module): + + def __init__(self, encoder_name, hook_names, **kwargs): + super().__init__() + if encoder_name == 'convnext-t' or encoder_name == 'convnext': + self.arch = ConvNeXt() + elif encoder_name == 'convnext-s': + self.arch = ConvNeXt( + depths=[3, 3, 27, 3], dims=[96, 192, 384, 768]) + elif encoder_name == 'convnext-b': + self.arch = ConvNeXt( + depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024]) + elif encoder_name == 'convnext-l': + self.arch = ConvNeXt( + depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536]) + else: + raise NotImplementedError + + self.hook_names = hook_names + self.hooks = self.setup_hooks() + + def setup_hooks(self): + hooks = 
[Hook(self.arch._modules[name]) for name in self.hook_names] + return hooks + + def forward(self, img): + return self.arch(img) + + +class MultiScaleColorDecoder(nn.Module): + + def __init__(self, + in_channels, + hidden_dim=256, + num_queries=100, + nheads=8, + dim_feedforward=2048, + dec_layers=9, + pre_norm=False, + color_embed_dim=256, + enforce_input_project=True, + num_scales=3): + super().__init__() + + # positional encoding + N_steps = hidden_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + # define Transformer decoder + self.num_heads = nheads + self.num_layers = dec_layers + self.transformer_self_attention_layers = nn.ModuleList() + self.transformer_cross_attention_layers = nn.ModuleList() + self.transformer_ffn_layers = nn.ModuleList() + + for _ in range(self.num_layers): + self.transformer_self_attention_layers.append( + SelfAttentionLayer( + d_model=hidden_dim, + nhead=nheads, + dropout=0.0, + normalize_before=pre_norm, + )) + self.transformer_cross_attention_layers.append( + CrossAttentionLayer( + d_model=hidden_dim, + nhead=nheads, + dropout=0.0, + normalize_before=pre_norm, + )) + self.transformer_ffn_layers.append( + FFNLayer( + d_model=hidden_dim, + dim_feedforward=dim_feedforward, + dropout=0.0, + normalize_before=pre_norm, + )) + + self.decoder_norm = nn.LayerNorm(hidden_dim) + + self.num_queries = num_queries + # learnable color query features + self.query_feat = nn.Embedding(num_queries, hidden_dim) + # learnable color query p.e. + self.query_embed = nn.Embedding(num_queries, hidden_dim) + + # level embedding + self.num_feature_levels = num_scales + self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) + + # input projections + self.input_proj = nn.ModuleList() + for i in range(self.num_feature_levels): + if in_channels[i] != hidden_dim or enforce_input_project: + self.input_proj.append( + nn.Conv2d(in_channels[i], hidden_dim, kernel_size=1)) + nn.init.kaiming_uniform_(self.input_proj[-1].weight, a=1) + if self.input_proj[-1].bias is not None: + nn.init.constant_(self.input_proj[-1].bias, 0) + else: + self.input_proj.append(nn.Sequential()) + + # output FFNs + self.color_embed = MLP(hidden_dim, hidden_dim, color_embed_dim, 3) + + def forward(self, feature_pyramid, last_img_feature): + assert len(feature_pyramid) == self.num_feature_levels + src, pos = [], [] + + for i in range(self.num_feature_levels): + pos.append(self.pe_layer(feature_pyramid[i], None).flatten(2)) + src.append(self.input_proj[i](feature_pyramid[i]).flatten(2) + + self.level_embed.weight[i][None, :, None]) + + # flatten NxCxHxW to HWxNxC + pos[-1] = pos[-1].permute(2, 0, 1) + src[-1] = src[-1].permute(2, 0, 1) + + _, bs, _ = src[0].shape + + # QxNxC + query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) + output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) + + for i in range(self.num_layers): + level_index = i % self.num_feature_levels + # attention: cross-attention first + output = self.transformer_cross_attention_layers[i]( + output, + src[level_index], + memory_mask=None, + memory_key_padding_mask=None, + pos=pos[level_index], + query_pos=query_embed) + output = self.transformer_self_attention_layers[i]( + output, + tgt_mask=None, + tgt_key_padding_mask=None, + query_pos=query_embed) + # FFN + output = self.transformer_ffn_layers[i](output) + + decoder_output = self.decoder_norm(output) + decoder_output = decoder_output.transpose( + 0, 1) # [N, bs, C] -> [bs, N, C] + color_embed = self.color_embed(decoder_output) + out = 
torch.einsum('bqc,bchw->bqhw', color_embed, last_img_feature) + + return out diff --git a/modelscope/models/cv/image_colorization/ddcolor/ddcolor_for_image_colorization.py b/modelscope/models/cv/image_colorization/ddcolor/ddcolor_for_image_colorization.py new file mode 100644 index 00000000..0d2acbd2 --- /dev/null +++ b/modelscope/models/cv/image_colorization/ddcolor/ddcolor_for_image_colorization.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Dict, Union + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .ddcolor import DDColor + +logger = get_logger() + +__all__ = ['DDColorForImageColorization'] + + +@MODELS.register_module(Tasks.image_colorization, module_name=Models.ddcolor) +class DDColorForImageColorization(TorchModel): + + def __init__(self, + model_dir, + encoder_name='convnext-l', + input_size=(512, 512), + num_queries=100, + *args, + **kwargs): + """initialize the image colorization model from the `model_dir` path. + + Args: + model_dir (str): the model path. + encoder_name (str): the encoder name. + input_size (tuple): size of the model input image. + num_queries (int): number of decoder queries + """ + super().__init__(model_dir, *args, **kwargs) + + self.model = DDColor(encoder_name, input_size, num_queries) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model = self._load_pretrained(self.model, model_path) + + def forward(self, input: Dict[str, + Tensor]) -> Dict[str, Union[list, Tensor]]: + """return the result of the model + + Args: + inputs (Tensor): the preprocessed data + + Returns: + Dict[str, Tensor]: results + """ + return self.model(**input) diff --git a/modelscope/models/cv/image_colorization/ddcolor/utils/__init__.py b/modelscope/models/cv/image_colorization/ddcolor/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_colorization/ddcolor/utils/convnext.py b/modelscope/models/cv/image_colorization/ddcolor/utils/convnext.py new file mode 100644 index 00000000..3da14c48 --- /dev/null +++ b/modelscope/models/cv/image_colorization/ddcolor/utils/convnext.py @@ -0,0 +1,177 @@ +# The implementation here is modified based on ConvNeXt, originally MIT license +# and publicly available at https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath, trunc_normal_ + + +class Block(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
+ """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d( + dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, + 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter( + layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. + """ + + def __init__( + self, + in_chans=3, + num_classes=1000, + depths=[3, 3, 9, 3], + dims=[96, 192, 384, 768], + drop_path_rate=0., + layer_scale_init_value=1e-6, + head_init_scale=1., + ): + super().__init__() + + self.downsample_layers = nn.ModuleList( + ) # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm(dims[0], eps=1e-6, data_format='channels_first')) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format='channels_first'), + nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList( + ) # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + cur = 0 + for i in range(4): + stage = nn.Sequential(*[ + Block( + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + # add norm layers for each output + out_indices = (0, 1, 2, 3) + for i in out_indices: + layer = LayerNorm(dims[i], eps=1e-6, data_format='channels_first') + # layer = nn.Identity() + layer_name = f'norm{i}' + self.add_module(layer_name, layer) + + self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer + self.head_cls = nn.Linear(dims[-1], 4) + + self.apply(self._init_weights) + self.head_cls.weight.data.mul_(head_init_scale) + self.head_cls.bias.data.mul_(head_init_scale) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def forward_features(self, x): + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + + 
# add extra norm + norm_layer = getattr(self, f'norm{i}') + norm_layer(x) + + return self.norm(x.mean( + [-2, -1])) # global average pooling, (N, C, H, W) -> (N, C) + + def forward(self, x): + x = self.forward_features(x) + x = self.head_cls(x) + return x + + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + + def __init__(self, + normalized_shape, + eps=1e-6, + data_format='channels_last'): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ['channels_last', 'channels_first']: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == 'channels_last': # B H W C + return F.layer_norm(x, self.normalized_shape, self.weight, + self.bias, self.eps) + elif self.data_format == 'channels_first': # B C H W + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x diff --git a/modelscope/models/cv/image_colorization/ddcolor/utils/position_encoding.py b/modelscope/models/cv/image_colorization/ddcolor/utils/position_encoding.py new file mode 100644 index 00000000..e613d15b --- /dev/null +++ b/modelscope/models/cv/image_colorization/ddcolor/utils/position_encoding.py @@ -0,0 +1,57 @@ +# The implementation here is modified based on Mask2Former, originally MIT license and publicly available at +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/position_encoding.py + +import math + +import torch +from torch import nn + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__(self, + num_pos_feats=64, + temperature=10000, + normalize=False, + scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError('normalize should be True if scale is passed') + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask=None): + if mask is None: + mask = torch.zeros((x.size(0), x.size(2), x.size(3)), + device=x.device, + dtype=torch.bool) + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange( + self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos diff --git a/modelscope/models/cv/image_colorization/ddcolor/utils/transformer_utils.py b/modelscope/models/cv/image_colorization/ddcolor/utils/transformer_utils.py new file mode 100644 index 00000000..66f2d821 --- /dev/null +++ b/modelscope/models/cv/image_colorization/ddcolor/utils/transformer_utils.py @@ -0,0 +1,232 @@ +# The implementation here is modified based on Mask2Former, originally MIT license and publicly available at +# https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py + +from typing import Optional + +from torch import Tensor, nn +from torch.nn import functional as F + + +class SelfAttentionLayer(nn.Module): + + def __init__(self, + d_model, + nhead, + dropout=0.0, + activation='relu', + normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + tgt, + tgt_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre(self, + tgt, + tgt_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt2, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + + return tgt + + def forward(self, + tgt, + tgt_mask: Optional[Tensor] = None, + 
tgt_key_padding_mask: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, tgt_mask, tgt_key_padding_mask, + query_pos) + return self.forward_post(tgt, tgt_mask, tgt_key_padding_mask, + query_pos) + + +class CrossAttentionLayer(nn.Module): + + def __init__(self, + d_model, + nhead, + dropout=0.0, + activation='relu', + normalize_before=False): + super().__init__() + self.multihead_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + tgt, + memory, + memory_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre(self, + tgt, + memory, + memory_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + + return tgt + + def forward(self, + tgt, + memory, + memory_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, memory_mask, + memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, memory_mask, + memory_key_padding_mask, pos, query_pos) + + +class FFNLayer(nn.Module): + + def __init__(self, + d_model, + dim_feedforward=2048, + dropout=0.0, + activation='relu', + normalize_before=False): + super().__init__() + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm = nn.LayerNorm(d_model) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt): + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + return tgt + + def forward_pre(self, tgt): + tgt2 = self.norm(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout(tgt2) + return tgt + + def forward(self, tgt): + if self.normalize_before: + return self.forward_pre(tgt) + return self.forward_post(tgt) + + +def 
_get_activation_fn(activation): + """Return an activation function given a string""" + if activation == 'relu': + return F.relu + if activation == 'gelu': + return F.gelu + if activation == 'glu': + return F.glu + raise RuntimeError(F'activation should be relu/gelu, not {activation}.') + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x diff --git a/modelscope/models/cv/image_colorization/ddcolor/utils/unet.py b/modelscope/models/cv/image_colorization/ddcolor/utils/unet.py new file mode 100644 index 00000000..2b8b9c14 --- /dev/null +++ b/modelscope/models/cv/image_colorization/ddcolor/utils/unet.py @@ -0,0 +1,203 @@ +# The implementation here is modified based on DeOldify, originally MIT License +# and publicly available at https://github.com/jantic/DeOldify/blob/master/deoldify/unet.py + +import collections +from enum import Enum + +import torch +import torch.nn as nn +from torch.nn import functional as F + +NormType = Enum('NormType', 'Batch BatchZero Weight Spectral') + + +class Hook: + feature = None + + def __init__(self, module): + self.hook = module.register_forward_hook(self.hook_fn) + + def hook_fn(self, module, input, output): + if isinstance(output, torch.Tensor): + self.feature = output + elif isinstance(output, collections.OrderedDict): + self.feature = output['out'] + + def remove(self): + self.hook.remove() + + +class SelfAttention(nn.Module): + 'Self attention layer for nd.' + + def __init__(self, n_channels: int): + super().__init__() + self.query = conv1d(n_channels, n_channels // 8) + self.key = conv1d(n_channels, n_channels // 8) + self.value = conv1d(n_channels, n_channels) + self.gamma = nn.Parameter(torch.tensor([0.])) + + def forward(self, x): + # Notation from https://arxiv.org/pdf/1805.08318.pdf + size = x.size() + x = x.view(*size[:2], -1) + f, g, h = self.query(x), self.key(x), self.value(x) + beta = F.softmax(torch.bmm(f.permute(0, 2, 1).contiguous(), g), dim=1) + o = self.gamma * torch.bmm(h, beta) + x + return o.view(*size).contiguous() + + +def batchnorm_2d(nf: int, norm_type: NormType = NormType.Batch): + 'A batchnorm2d layer with `nf` features initialized depending on `norm_type`.' + bn = nn.BatchNorm2d(nf) + with torch.no_grad(): + bn.bias.fill_(1e-3) + bn.weight.fill_(0. if norm_type == NormType.BatchZero else 1.) + return bn + + +def init_default(m: nn.Module, func=nn.init.kaiming_normal_) -> None: + 'Initialize `m` weights with `func` and set `bias` to 0.' + if func: + if hasattr(m, 'weight'): + func(m.weight) + if hasattr(m, 'bias') and hasattr(m.bias, 'data'): + m.bias.data.fill_(0.) + return m + + +def icnr(x, scale=2, init=nn.init.kaiming_normal_): + 'ICNR init of `x`, with `scale` and `init` function.' 
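+    # Build ni // scale^2 distinct filters with `init` and tile each of them
+    # scale^2 times, so that every sub-pixel produced by the following
+    # nn.PixelShuffle starts from identical weights (equivalent to
+    # nearest-neighbour upsampling at initialization), which suppresses
+    # checkerboard artifacts.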
+ ni, nf, h, w = x.shape + ni2 = int(ni / (scale**2)) + k = init(torch.zeros([ni2, nf, h, w])).transpose(0, 1) + k = k.contiguous().view(ni2, nf, -1) + k = k.repeat(1, 1, scale**2) + k = k.contiguous().view([nf, ni, h, w]).transpose(0, 1) + x.data.copy_(k) + + +def conv1d(ni: int, + no: int, + ks: int = 1, + stride: int = 1, + padding: int = 0, + bias: bool = False): + 'Create and initialize a `nn.Conv1d` layer with spectral normalization.' + conv = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias) + nn.init.kaiming_normal_(conv.weight) + if bias: + conv.bias.data.zero_() + return nn.utils.spectral_norm(conv) + + +def custom_conv_layer( + ni: int, + nf: int, + ks: int = 3, + stride: int = 1, + padding: int = None, + bias: bool = None, + is_1d: bool = False, + norm_type=NormType.Batch, + use_activ: bool = True, + transpose: bool = False, + init=nn.init.kaiming_normal_, + self_attention: bool = False, + extra_bn: bool = False, +): + 'Create a sequence of convolutional (`ni` to `nf`), ReLU (if `use_activ`) and batchnorm (if `bn`) layers.' + if padding is None: + padding = (ks - 1) // 2 if not transpose else 0 + bn = norm_type in (NormType.Batch, NormType.BatchZero) or extra_bn + if bias is None: + bias = not bn + conv_func = nn.ConvTranspose2d if transpose else nn.Conv1d if is_1d else nn.Conv2d + conv = init_default( + conv_func( + ni, nf, kernel_size=ks, bias=bias, stride=stride, padding=padding), + init, + ) + + if norm_type == NormType.Weight: + conv = nn.utils.weight_norm(conv) + elif norm_type == NormType.Spectral: + conv = nn.utils.spectral_norm(conv) + layers = [conv] + if use_activ: + layers.append(nn.ReLU(True)) + if bn: + layers.append((nn.BatchNorm1d if is_1d else nn.BatchNorm2d)(nf)) + if self_attention: + layers.append(SelfAttention(nf)) + return nn.Sequential(*layers) + + +class CustomPixelShuffle_ICNR(nn.Module): + """ + Upsample by `scale` from `ni` filters to `nf` (default `ni`), + using `nn.PixelShuffle`, `icnr` init, and `weight_norm`. + """ + + def __init__(self, + ni: int, + nf: int = None, + scale: int = 2, + blur: bool = True, + norm_type=NormType.Spectral, + extra_bn=False): + super().__init__() + self.conv = custom_conv_layer( + ni, + nf * (scale**2), + ks=1, + use_activ=False, + norm_type=norm_type, + extra_bn=extra_bn) + icnr(self.conv[0].weight) + self.shuf = nn.PixelShuffle(scale) + self.do_blur = blur + # Blurring over (h*w) kernel + # "Super-Resolution using Convolutional Neural Networks without Any Checkerboard Artifacts" + # - https://arxiv.org/abs/1806.02658 + self.pad = nn.ReplicationPad2d((1, 0, 1, 0)) + self.blur = nn.AvgPool2d(2, stride=1) + self.relu = nn.ReLU(True) + + def forward(self, x): + x = self.shuf(self.relu(self.conv(x))) + return self.blur(self.pad(x)) if self.do_blur else x + + +class UnetBlockWide(nn.Module): + 'A quasi-UNet block, using `PixelShuffle_ICNR upsampling`.' 
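+    # forward(): upsample `up_in` with CustomPixelShuffle_ICNR, concatenate it
+    # with the batch-normed encoder feature captured by `hook`, then apply a
+    # spectral-norm conv (with extra BN) to produce `n_out` channels.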
+ + def __init__(self, + up_in_c: int, + x_in_c: int, + n_out: int, + hook, + blur: bool = False, + self_attention: bool = False, + norm_type=NormType.Spectral): + super().__init__() + + self.hook = hook + up_out = n_out + self.shuf = CustomPixelShuffle_ICNR( + up_in_c, up_out, blur=blur, norm_type=norm_type, extra_bn=True) + self.bn = batchnorm_2d(x_in_c) + ni = up_out + x_in_c + self.conv = custom_conv_layer( + ni, + n_out, + norm_type=norm_type, + self_attention=self_attention, + extra_bn=True) + self.relu = nn.ReLU() + + def forward(self, up_in): + s = self.hook.feature + up_out = self.shuf(up_in) + cat_x = self.relu(torch.cat([up_out, self.bn(s)], dim=1)) + return self.conv(cat_x) diff --git a/modelscope/models/cv/image_colorization/unet/__init__.py b/modelscope/models/cv/image_colorization/unet/__init__.py new file mode 100644 index 00000000..66879422 --- /dev/null +++ b/modelscope/models/cv/image_colorization/unet/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .unet import DynamicUnetDeep, DynamicUnetWide +from .utils import NormType diff --git a/modelscope/models/cv/image_colorization/unet.py b/modelscope/models/cv/image_colorization/unet/unet.py similarity index 100% rename from modelscope/models/cv/image_colorization/unet.py rename to modelscope/models/cv/image_colorization/unet/unet.py diff --git a/modelscope/models/cv/image_colorization/utils.py b/modelscope/models/cv/image_colorization/unet/utils.py similarity index 100% rename from modelscope/models/cv/image_colorization/utils.py rename to modelscope/models/cv/image_colorization/unet/utils.py diff --git a/modelscope/models/cv/image_deblur/__init__.py b/modelscope/models/cv/image_deblur/__init__.py new file mode 100644 index 00000000..36edb912 --- /dev/null +++ b/modelscope/models/cv/image_deblur/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .nafnet_for_image_deblur import NAFNetForImageDeblur + +else: + _import_structure = {'nafnet_for_image_deblur': ['NAFNetForImageDeblur']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_deblur/nafnet_for_image_deblur.py b/modelscope/models/cv/image_deblur/nafnet_for_image_deblur.py new file mode 100644 index 00000000..129e8061 --- /dev/null +++ b/modelscope/models/cv/image_deblur/nafnet_for_image_deblur.py @@ -0,0 +1,106 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict, Union + +import torch.cuda + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.image_denoise.nafnet.NAFNet_arch import (NAFNet, + PSNRLoss) +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() +__all__ = ['NAFNetForImageDeblur'] + + +@MODELS.register_module(Tasks.image_deblurring, module_name=Models.nafnet) +class NAFNetForImageDeblur(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the image deblur model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
+ + """ + super().__init__(model_dir, *args, **kwargs) + self.model_dir = model_dir + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model = NAFNet(**self.config.model.network_g) + self.loss = PSNRLoss() + self.model = self._load_pretrained(self.model, model_path) + + def crop_process(self, input): + output = torch.zeros_like(input) # [1, C, H, W] + # determine crop_h and crop_w + ih, iw = input.shape[-2:] + crop_rows, crop_cols = max(ih // 512, 1), max(iw // 512, 1) + overlap = 16 + + step_h, step_w = ih // crop_rows, iw // crop_cols + for y in range(crop_rows): + for x in range(crop_cols): + crop_y = step_h * y + crop_x = step_w * x + + crop_h = step_h if y < crop_rows - 1 else ih - crop_y + crop_w = step_w if x < crop_cols - 1 else iw - crop_x + + crop_frames = input[:, :, + max(0, crop_y - overlap + ):min(crop_y + crop_h + overlap, ih), + max(0, crop_x - overlap + ):min(crop_x + crop_w + + overlap, iw)].contiguous() + h_start = overlap if max(0, crop_y - overlap) > 0 else 0 + w_start = overlap if max(0, crop_x - overlap) > 0 else 0 + h_end = h_start + crop_h if min(crop_y + crop_h + + overlap, ih) < ih else ih + w_end = w_start + crop_w if min(crop_x + crop_w + + overlap, iw) < iw else iw + + output[:, :, crop_y:crop_y + crop_h, + crop_x:crop_x + crop_w] = self.model( + crop_frames)[:, :, h_start:h_end, + w_start:w_end].clamp(0, 1) + return output + + def _train_forward(self, input: Tensor, + target: Tensor) -> Dict[str, Tensor]: + preds = self.model(input) + return {'loss': self.loss(preds, target)} + + def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]: + return {'outputs': self.crop_process(input).cpu()} + + def _evaluate_postprocess(self, input: Tensor, + target: Tensor) -> Dict[str, list]: + preds = self.crop_process(input).cpu() + preds = list(torch.split(preds, 1, 0)) + targets = list(torch.split(target.cpu(), 1, 0)) + + return {'pred': preds, 'target': targets} + + def forward(self, inputs: Dict[str, + Tensor]) -> Dict[str, Union[list, Tensor]]: + """return the result by the model + + Args: + inputs (Tensor): the preprocessed data + + Returns: + Dict[str, Tensor]: results + """ + if self.training: + return self._train_forward(**inputs) + elif 'target' in inputs: + return self._evaluate_postprocess(**inputs) + else: + return self._inference_forward(**inputs) diff --git a/modelscope/models/cv/image_defrcn_fewshot/__init__.py b/modelscope/models/cv/image_defrcn_fewshot/__init__.py new file mode 100644 index 00000000..ef73351a --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .defrcn_for_fewshot import DeFRCNForFewShot + +else: + _import_structure = {'defrcn_for_fewshot': ['DeFRCNForFewShot']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py b/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py new file mode 100644 index 00000000..d42e59b2 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/defrcn_for_fewshot.py @@ -0,0 +1,80 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .models.defaults_config import _C +from .models.defrcn import DeFRCN +from .utils.requirements_check import requires_version + +logger = get_logger() +__all__ = ['DeFRCNForFewShot'] + + +@MODELS.register_module( + Tasks.image_fewshot_detection, module_name=Models.defrcn) +class DeFRCNForFewShot(TorchModel): + """ Few-shot object detection model DeFRCN. The model requires detectron2-0.3 and pytorch-1.11. + Model config params mainly from detectron2, you can use detectron2 config file to initialize model. + Detail configs can be visited on detectron2.config.defaults and .models.defaults_config. + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the few-shot defrcn model from the `model_dir` path. + + Args: + model_dir (str): the model path. + + """ + requires_version() + + super().__init__(model_dir, *args, **kwargs) + + self.model_dir = model_dir + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + if 'config_path' in kwargs: + self.config.merge_from_dict( + {'model.config_path': kwargs['config_path']}) + + self.model_cfg = _C.clone() + self.model_cfg.merge_from_file( + os.path.join(model_dir, self.config.model.config_path)) + + if 'model_weights' in kwargs: + self.model_cfg.merge_from_list( + ['MODEL.WEIGHTS', kwargs['model_weights']]) + + self.model_cfg.freeze() + + self.model = DeFRCN(self.model_cfg) + + def forward(self, inputs) -> Any: + """return the result by the model + + Args: + inputs (list): the preprocessed data + + Returns: + Any: results + """ + if self.training: + return self.model.forward(inputs) + else: + return self.model.inference(inputs) + + def inference(self, input: Dict[str, Any]) -> Any: + with torch.no_grad(): + results = self.model([input]) + return results[0] if len(results) > 0 else None + + def get_model_cfg(self): + return self.model_cfg diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/__init__.py b/modelscope/models/cv/image_defrcn_fewshot/models/__init__.py new file mode 100644 index 00000000..d463e460 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .defrcn import DeFRCN + +else: + _import_structure = {'defrcn': ['DeFRCN']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py b/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py new file mode 100644 index 00000000..55fcc43b --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/defaults_config.py @@ -0,0 +1,38 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/config/defaults.py + +from detectron2.config.defaults import _C + +_CC = _C + +# ----------- Backbone ----------- # +_CC.MODEL.BACKBONE.FREEZE = False +_CC.MODEL.BACKBONE.FREEZE_AT = 3 + +# ------------- RPN -------------- # +_CC.MODEL.RPN.FREEZE = False +_CC.MODEL.RPN.ENABLE_DECOUPLE = False +_CC.MODEL.RPN.BACKWARD_SCALE = 1.0 + +# ------------- ROI -------------- # +_CC.MODEL.ROI_HEADS.NAME = 'Res5ROIHeads' +_CC.MODEL.ROI_HEADS.FREEZE_FEAT = False +_CC.MODEL.ROI_HEADS.ENABLE_DECOUPLE = False +_CC.MODEL.ROI_HEADS.BACKWARD_SCALE = 1.0 +_CC.MODEL.ROI_HEADS.OUTPUT_LAYER = 'FastRCNNOutputLayers' +_CC.MODEL.ROI_HEADS.CLS_DROPOUT = False +_CC.MODEL.ROI_HEADS.DROPOUT_RATIO = 0.8 +_CC.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 7 # for faster + +# ------------- TEST ------------- # +_CC.TEST.PCB_ENABLE = False +_CC.TEST.PCB_MODELTYPE = 'resnet' # res-like +_CC.TEST.PCB_MODELPATH = '' +_CC.TEST.PCB_ALPHA = 0.50 +_CC.TEST.PCB_UPPER = 1.0 +_CC.TEST.PCB_LOWER = 0.05 + +# ------------ Other ------------- # +_CC.SOLVER.WEIGHT_DECAY = 5e-5 +_CC.MUTE_HEADER = True diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/defrcn.py b/modelscope/models/cv/image_defrcn_fewshot/models/defrcn.py new file mode 100644 index 00000000..a5258017 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/defrcn.py @@ -0,0 +1,179 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/rcnn.py + +import os +from typing import Dict + +import torch +from detectron2.layers import ShapeSpec +from detectron2.modeling.anchor_generator import DefaultAnchorGenerator +from detectron2.modeling.backbone.resnet import build_resnet_backbone +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.postprocessing import detector_postprocess +from detectron2.modeling.proposal_generator.rpn import RPN, StandardRPNHead +from detectron2.structures import ImageList +from torch import nn + +from .gdl import AffineLayer, decouple_layer +from .roi_heads import Res5ROIHeads + + +class DeFRCN(nn.Module): + + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + + self.device = torch.device(cfg.MODEL.DEVICE) + + self.backbone = build_resnet_backbone( + cfg, ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))) + self._SHAPE_ = self.backbone.output_shape() + + rpn_config = DeFRCN.from_rpn_config(cfg, self._SHAPE_) + self.proposal_generator = RPN(**rpn_config) + + self.roi_heads = Res5ROIHeads(cfg, self._SHAPE_) + self.normalizer = self.normalize_fn() + self.affine_rpn = AffineLayer( + 
num_channels=self._SHAPE_['res4'].channels, bias=True) + self.affine_rcnn = AffineLayer( + num_channels=self._SHAPE_['res4'].channels, bias=True) + self.to(self.device) + + if cfg.MODEL.BACKBONE.FREEZE: + for p in self.backbone.parameters(): + p.requires_grad = False + + if cfg.MODEL.RPN.FREEZE: + for p in self.proposal_generator.parameters(): + p.requires_grad = False + + if cfg.MODEL.ROI_HEADS.FREEZE_FEAT: + for p in self.roi_heads.res5.parameters(): + p.requires_grad = False + + def forward(self, batched_inputs): + if not self.training: + return self.inference(batched_inputs) + assert 'instances' in batched_inputs[0] + gt_instances = [x['instances'].to(self.device) for x in batched_inputs] + proposal_losses, detector_losses, _, _ = self._forward_once_( + batched_inputs, gt_instances) + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + return losses + + def inference(self, batched_inputs): + assert not self.training + _, _, results, image_sizes = self._forward_once_(batched_inputs, None) + processed_results = [] + for r, input, image_size in zip(results, batched_inputs, image_sizes): + height = input.get('height', image_size[0]) + width = input.get('width', image_size[1]) + r = detector_postprocess(r, height, width) + processed_results.append({'instances': r}) + return processed_results + + def _forward_once_(self, batched_inputs, gt_instances=None): + + images = self.preprocess_image(batched_inputs) + features = self.backbone(images.tensor) + + features_de_rpn = features + if self.cfg.MODEL.RPN.ENABLE_DECOUPLE: + scale = self.cfg.MODEL.RPN.BACKWARD_SCALE + features_de_rpn = { + k: self.affine_rpn(decouple_layer(features[k], scale)) + for k in features + } + proposals, proposal_losses = self.proposal_generator( + images, features_de_rpn, gt_instances) + + features_de_rcnn = features + if self.cfg.MODEL.ROI_HEADS.ENABLE_DECOUPLE: + scale = self.cfg.MODEL.ROI_HEADS.BACKWARD_SCALE + features_de_rcnn = { + k: self.affine_rcnn(decouple_layer(features[k], scale)) + for k in features + } + results, detector_losses = self.roi_heads(images, features_de_rcnn, + proposals, gt_instances) + + return proposal_losses, detector_losses, results, images.image_sizes + + def preprocess_image(self, batched_inputs): + images = [x['image'].to(self.device) for x in batched_inputs] + images = [self.normalizer(x) for x in images] + images = ImageList.from_tensors(images, + self.backbone.size_divisibility) + return images + + def normalize_fn(self): + assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD) + num_channels = len(self.cfg.MODEL.PIXEL_MEAN) + pixel_mean = ( + torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).to(self.device).view( + num_channels, 1, 1)) + pixel_std = ( + torch.Tensor(self.cfg.MODEL.PIXEL_STD).to(self.device).view( + num_channels, 1, 1)) + return lambda x: (x - pixel_mean) / pixel_std + + @classmethod + def from_rpn_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + in_features = cfg.MODEL.RPN.IN_FEATURES + ret = { + 'in_features': + in_features, + 'min_box_size': + cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE, + 'nms_thresh': + cfg.MODEL.RPN.NMS_THRESH, + 'batch_size_per_image': + cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, + 'positive_fraction': + cfg.MODEL.RPN.POSITIVE_FRACTION, + 'loss_weight': { + 'loss_rpn_cls': + cfg.MODEL.RPN.LOSS_WEIGHT, + 'loss_rpn_loc': + cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT, + }, + 'anchor_boundary_thresh': + cfg.MODEL.RPN.BOUNDARY_THRESH, + 'box2box_transform': + 
Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS), + 'box_reg_loss_type': + cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE, + 'smooth_l1_beta': + cfg.MODEL.RPN.SMOOTH_L1_BETA, + } + + ret['pre_nms_topk'] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, + cfg.MODEL.RPN.PRE_NMS_TOPK_TEST) + ret['post_nms_topk'] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, + cfg.MODEL.RPN.POST_NMS_TOPK_TEST) + + # ret["anchor_generator"] = build_anchor_generator(cfg, [input_shape[f] for f in in_features]) + anchor_cfg = DefaultAnchorGenerator.from_config( + cfg, [input_shape[f] for f in in_features]) + ret['anchor_generator'] = DefaultAnchorGenerator(**anchor_cfg) + ret['anchor_matcher'] = Matcher( + cfg.MODEL.RPN.IOU_THRESHOLDS, + cfg.MODEL.RPN.IOU_LABELS, + allow_low_quality_matches=True) + rpn_head_cfg = { + 'in_channels': + [s.channels for s in [input_shape[f] for f in in_features]][0], + 'num_anchors': + ret['anchor_generator'].num_anchors[0], + 'box_dim': + ret['anchor_generator'].box_dim + } + + ret['head'] = StandardRPNHead(**rpn_head_cfg) + return ret diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py b/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py new file mode 100644 index 00000000..9415b5a6 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/fast_rcnn.py @@ -0,0 +1,274 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/rcnn.py + +import numpy as np +import torch +from detectron2.layers import batched_nms, cat +from detectron2.modeling.roi_heads.fast_rcnn import \ + fast_rcnn_inference_single_image +from detectron2.utils.events import get_event_storage +from fvcore.nn import smooth_l1_loss +from torch import nn +from torch.nn import functional as F + + +def fast_rcnn_inference(boxes, scores, image_shapes, score_thresh, nms_thresh, + topk_per_image): + + result_per_image = [ + fast_rcnn_inference_single_image( + boxes_per_image, + scores_per_image, + image_shape, + score_thresh, + nms_thresh, + topk_per_image, + ) for scores_per_image, boxes_per_image, image_shape in zip( + scores, boxes, image_shapes) + ] + return tuple(list(x) for x in zip(*result_per_image)) + + +class FastRCNNOutputs(object): + """ + A class that stores information about outputs of a Fast R-CNN head. + """ + + def __init__( + self, + box2box_transform, + pred_class_logits, + pred_proposal_deltas, + proposals, + smooth_l1_beta, + ): + """ + Args: + box2box_transform (Box2BoxTransform/Box2BoxTransformRotated): + box2box transform instance for proposal-to-detection transformations. + pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class + logits for all R predicted object instances. + Each row corresponds to a predicted object instance. + pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for + class-specific or class-agnostic regression. It stores the predicted deltas that + transform proposals into final box detections. + B is the box dimension (4 or 5). + When B is 4, each row is [dx, dy, dw, dh (, ....)]. + When B is 5, each row is [dx, dy, dw, dh, da (, ....)]. + proposals (list[Instances]): A list of N Instances, where Instances i stores the + proposals for image i, in the field "proposal_boxes". + When training, each Instances must have ground-truth labels + stored in the field "gt_classes" and "gt_boxes". + smooth_l1_beta (float): The transition point between L1 and L2 loss in + the smooth L1 loss function. 
When set to 0, the loss becomes L1. When + set to +inf, the loss becomes constant 0. + """ + self.box2box_transform = box2box_transform + self.num_preds_per_image = [len(p) for p in proposals] + self.pred_class_logits = pred_class_logits + self.pred_proposal_deltas = pred_proposal_deltas + self.smooth_l1_beta = smooth_l1_beta + + box_type = type(proposals[0].proposal_boxes) + # cat(..., dim=0) concatenates over all images in the batch + self.proposals = box_type.cat([p.proposal_boxes for p in proposals]) + assert (not self.proposals.tensor.requires_grad + ), 'Proposals should not require gradients!' + self.image_shapes = [x.image_size for x in proposals] + + # The following fields should exist only when training. + if proposals[0].has('gt_boxes'): + self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals]) + assert proposals[0].has('gt_classes') + self.gt_classes = cat([p.gt_classes for p in proposals], dim=0) + + def _log_accuracy(self): + """ + Log the accuracy metrics to EventStorage. + """ + num_instances = self.gt_classes.numel() + pred_classes = self.pred_class_logits.argmax(dim=1) + bg_class_ind = self.pred_class_logits.shape[1] - 1 + + fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind) + num_fg = fg_inds.nonzero().numel() + fg_gt_classes = self.gt_classes[fg_inds] + fg_pred_classes = pred_classes[fg_inds] + + num_false_negative = (( + fg_pred_classes == bg_class_ind).nonzero().numel()) + num_accurate = (pred_classes == self.gt_classes).nonzero().numel() + fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel() + + storage = get_event_storage() + storage.put_scalar('fast_rcnn/cls_accuracy', + num_accurate / num_instances) + if num_fg > 0: + storage.put_scalar('fast_rcnn/fg_cls_accuracy', + fg_num_accurate / num_fg) + storage.put_scalar('fast_rcnn/false_negative', + num_false_negative / num_fg) + + def softmax_cross_entropy_loss(self): + """ + Compute the softmax cross entropy loss for box classification. + + Returns: + scalar Tensor + """ + self._log_accuracy() + return F.cross_entropy( + self.pred_class_logits, self.gt_classes, reduction='mean') + + def smooth_l1_loss(self): + """ + Compute the smooth L1 loss for box regression. + + Returns: + scalar Tensor + """ + gt_proposal_deltas = self.box2box_transform.get_deltas( + self.proposals.tensor, self.gt_boxes.tensor) + box_dim = gt_proposal_deltas.size(1) # 4 or 5 + cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim + device = self.pred_proposal_deltas.device + + bg_class_ind = self.pred_class_logits.shape[1] - 1 + + fg_inds = torch.nonzero((self.gt_classes >= 0) + & (self.gt_classes < bg_class_ind)).squeeze(1) + if cls_agnostic_bbox_reg: + # pred_proposal_deltas only corresponds to foreground class for agnostic + gt_class_cols = torch.arange(box_dim, device=device) + else: + fg_gt_classes = self.gt_classes[fg_inds] + gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange( + box_dim, device=device) + + loss_box_reg = smooth_l1_loss( + self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols], + gt_proposal_deltas[fg_inds], + self.smooth_l1_beta, + reduction='sum', + ) + + loss_box_reg = loss_box_reg / self.gt_classes.numel() + return loss_box_reg + + def losses(self): + """ + Compute the default losses for box head in Fast(er) R-CNN, + with softmax cross entropy loss and smooth L1 loss. + + Returns: + A dict of losses (scalar tensors) containing keys "loss_cls" and "loss_box_reg". 
+ """ + return { + 'loss_cls': self.softmax_cross_entropy_loss(), + 'loss_box_reg': self.smooth_l1_loss(), + } + + def predict_boxes(self): + """ + Returns: + list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes + for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is + the number of predicted objects for image i and B is the box dimension (4 or 5) + """ + num_pred = len(self.proposals) + B = self.proposals.tensor.shape[1] + K = self.pred_proposal_deltas.shape[1] // B + boxes = self.box2box_transform.apply_deltas( + self.pred_proposal_deltas.view(num_pred * K, B), + self.proposals.tensor.unsqueeze(1).expand(num_pred, K, + B).reshape(-1, B), + ) + return boxes.view(num_pred, K * B).split( + self.num_preds_per_image, dim=0) + + def predict_probs(self): + """ + Returns: + list[Tensor]: A list of Tensors of predicted class probabilities for each image. + Element i has shape (Ri, K + 1), where Ri is the number of predicted objects + for image i. + """ + probs = F.softmax(self.pred_class_logits, dim=-1) + return probs.split(self.num_preds_per_image, dim=0) + + def inference(self, score_thresh, nms_thresh, topk_per_image): + """ + Args: + score_thresh (float): same as fast_rcnn_inference. + nms_thresh (float): same as fast_rcnn_inference. + topk_per_image (int): same as fast_rcnn_inference. + Returns: + list[Instances]: same as fast_rcnn_inference. + list[Tensor]: same as fast_rcnn_inference. + """ + boxes = self.predict_boxes() + scores = self.predict_probs() + image_shapes = self.image_shapes + + return fast_rcnn_inference( + boxes, + scores, + image_shapes, + score_thresh, + nms_thresh, + topk_per_image, + ) + + +class FastRCNNOutputLayers(nn.Module): + """ + Two linear layers for predicting Fast R-CNN outputs: + (1) proposal-to-detection box regression deltas + (2) classification scores + """ + + def __init__(self, + cfg, + input_size, + num_classes, + cls_agnostic_bbox_reg, + box_dim=4): + """ + Args: + cfg: config + input_size (int): channels, or (channels, height, width) + num_classes (int): number of foreground classes + cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression + box_dim (int): the dimension of bounding boxes. 
+ Example box dimensions: 4 for regular XYXY boxes and 5 for rotated XYWHA boxes + """ + super(FastRCNNOutputLayers, self).__init__() + + if not isinstance(input_size, int): + input_size = np.prod(input_size) + + # The prediction layer for num_classes foreground classes and one + # background class + self.cls_score = nn.Linear(input_size, num_classes + 1) + num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes + self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim) + + nn.init.normal_(self.cls_score.weight, std=0.01) + nn.init.normal_(self.bbox_pred.weight, std=0.001) + for b in [self.cls_score, self.bbox_pred]: + nn.init.constant_(b.bias, 0) + + self._do_cls_dropout = cfg.MODEL.ROI_HEADS.CLS_DROPOUT + self._dropout_ratio = cfg.MODEL.ROI_HEADS.DROPOUT_RATIO + + def forward(self, x): + if x.dim() > 2: + x = torch.flatten(x, start_dim=1) + proposal_deltas = self.bbox_pred(x) + + if self._do_cls_dropout: + x = F.dropout(x, self._dropout_ratio, training=self.training) + scores = self.cls_score(x) + + return scores, proposal_deltas diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/gdl.py b/modelscope/models/cv/image_defrcn_fewshot/models/gdl.py new file mode 100644 index 00000000..0d228fa7 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/gdl.py @@ -0,0 +1,43 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/meta_arch/gdl.py + +import torch +import torch.nn as nn +from torch.autograd import Function + + +class GradientDecoupleLayer(Function): + + @staticmethod + def forward(ctx, x, _lambda): + ctx._lambda = _lambda + return x + + @staticmethod + def backward(ctx, grad_output): + grad_output = grad_output * ctx._lambda + return grad_output, None + + +class AffineLayer(nn.Module): + + def __init__(self, num_channels, bias=False): + super(AffineLayer, self).__init__() + weight = torch.FloatTensor(1, num_channels, 1, 1).fill_(1) + self.weight = nn.Parameter(weight, requires_grad=True) + + self.bias = None + if bias: + bias = torch.FloatTensor(1, num_channels, 1, 1).fill_(0) + self.bias = nn.Parameter(bias, requires_grad=True) + + def forward(self, X): + out = X * self.weight.expand_as(X) + if self.bias is not None: + out = out + self.bias.expand_as(X) + return out + + +def decouple_layer(x, _lambda): + return GradientDecoupleLayer.apply(x, _lambda) diff --git a/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py b/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py new file mode 100644 index 00000000..9ac78119 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/models/roi_heads.py @@ -0,0 +1,302 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/modeling/roi_heads/roi_heads.py + +from typing import Dict + +import numpy as np +import torch +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone.resnet import BottleneckBlock, make_stage +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.matcher import Matcher +from detectron2.modeling.poolers import ROIPooler +from detectron2.modeling.proposal_generator.proposal_utils import \ + add_ground_truth_to_proposals +from detectron2.modeling.roi_heads import select_foreground_proposals +from detectron2.modeling.sampling import subsample_labels +from detectron2.structures import 
Boxes, Instances, pairwise_iou +from detectron2.utils.events import get_event_storage +from torch import nn + +from .fast_rcnn import FastRCNNOutputLayers, FastRCNNOutputs + + +class ROIHeads(torch.nn.Module): + """ + ROIHeads perform all per-region computation in an R-CNN. + + It contains logic of cropping the regions, extract per-region features, + and make per-region predictions. + + It can have many variants, implemented as subclasses of this class. + """ + + def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): + super(ROIHeads, self).__init__() + + # fmt: off + self.batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE + self.positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + self.test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST + self.test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST + self.test_detections_per_img = cfg.TEST.DETECTIONS_PER_IMAGE + self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES + self.proposal_append_gt = cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT + self.feature_strides = {k: v.stride for k, v in input_shape.items()} + self.feature_channels = {k: v.channels for k, v in input_shape.items()} + self.cls_agnostic_bbox_reg = cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG + self.smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA + # fmt: on + + # Matcher to assign box proposals to gt boxes + self.proposal_matcher = Matcher( + cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS, + cfg.MODEL.ROI_HEADS.IOU_LABELS, + allow_low_quality_matches=False, + ) + + # Box2BoxTransform for bounding box regression + self.box2box_transform = Box2BoxTransform( + weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) + + def _sample_proposals(self, matched_idxs, matched_labels, gt_classes): + """ + Based on the matching between N proposals and M groundtruth, + sample the proposals and set their classification labels. + + Args: + matched_idxs (Tensor): a vector of length N, each is the best-matched + gt index in [0, M) for each proposal. + matched_labels (Tensor): a vector of length N, the matcher's label + (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. + gt_classes (Tensor): a vector of length M. + + Returns: + Tensor: a vector of indices of sampled proposals. Each is in [0, N). + Tensor: a vector of the same length, the classification label for + each sampled proposal. Each sample is labeled as either a category in + [0, num_classes) or the background (num_classes). + """ + has_gt = gt_classes.numel() > 0 + # Get the corresponding GT for each proposal + if has_gt: + gt_classes = gt_classes[matched_idxs] + # Label unmatched proposals (0 label from matcher) as background (label=num_classes) + gt_classes[matched_labels == 0] = self.num_classes + # Label ignore proposals (-1 label) + gt_classes[matched_labels == -1] = -1 + else: + gt_classes = torch.zeros_like(matched_idxs) + self.num_classes + + sampled_fg_idxs, sampled_bg_idxs = subsample_labels( + gt_classes, + self.batch_size_per_image, + self.positive_sample_fraction, + self.num_classes, + ) + + sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) + return sampled_idxs, gt_classes[sampled_idxs] + + @torch.no_grad() + def label_and_sample_proposals(self, proposals, targets): + """ + Prepare some proposals to be used to train the ROI heads. + It performs box matching between `proposals` and `targets`, and assigns + training labels to the proposals. 
+ It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes, + with a fraction of positives that is no larger than `self.positive_sample_fraction. + + Args: + See :meth:`ROIHeads.forward` + + Returns: + list[Instances]: + length `N` list of `Instances`s containing the proposals + sampled for training. Each `Instances` has the following fields: + - proposal_boxes: the proposal boxes + - gt_boxes: the ground-truth box that the proposal is assigned to + (this is only meaningful if the proposal has a label > 0; if label = 0 + then the ground-truth box is random) + Other fields such as "gt_classes" that's included in `targets`. + """ + gt_boxes = [x.gt_boxes for x in targets] + + if self.proposal_append_gt: + proposals = add_ground_truth_to_proposals(gt_boxes, proposals) + + proposals_with_gt = [] + + num_fg_samples = [] + num_bg_samples = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + has_gt = len(targets_per_image) > 0 + match_quality_matrix = pairwise_iou( + targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) + matched_idxs, matched_labels = self.proposal_matcher( + match_quality_matrix) + sampled_idxs, gt_classes = self._sample_proposals( + matched_idxs, matched_labels, targets_per_image.gt_classes) + + # Set target attributes of the sampled proposals: + proposals_per_image = proposals_per_image[sampled_idxs] + proposals_per_image.gt_classes = gt_classes + + # We index all the attributes of targets that start with "gt_" + # and have not been added to proposals yet (="gt_classes"). + if has_gt: + sampled_targets = matched_idxs[sampled_idxs] + + for ( + trg_name, + trg_value, + ) in targets_per_image.get_fields().items(): + if trg_name.startswith( + 'gt_') and not proposals_per_image.has(trg_name): + proposals_per_image.set(trg_name, + trg_value[sampled_targets]) + else: + gt_boxes = Boxes( + targets_per_image.gt_boxes.tensor.new_zeros( + (len(sampled_idxs), 4))) + proposals_per_image.gt_boxes = gt_boxes + + num_bg_samples.append( + (gt_classes == self.num_classes).sum().item()) + num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) + proposals_with_gt.append(proposals_per_image) + + # Log the number of fg/bg samples that are selected for training ROI heads + storage = get_event_storage() + storage.put_scalar('roi_head/num_fg_samples', np.mean(num_fg_samples)) + storage.put_scalar('roi_head/num_bg_samples', np.mean(num_bg_samples)) + + return proposals_with_gt + + def forward(self, images, features, proposals, targets=None): + """ + Args: + images (ImageList): + features (dict[str: Tensor]): input data as a mapping from feature + map name to tensor. Axis 0 represents the number of images `N` in + the input data; axes 1-3 are channels, height, and width, which may + vary between feature maps (e.g., if a feature pyramid is used). + proposals (list[Instances]): length `N` list of `Instances`s. The i-th + `Instances` contains object proposals for the i-th input image, + with fields "proposal_boxes" and "objectness_logits". + targets (list[Instances], optional): length `N` list of `Instances`s. The i-th + `Instances` contains the ground-truth per-instance annotations + for the i-th input image. Specify `targets` during training only. + It may have the following fields: + - gt_boxes: the bounding box of each instance. + - gt_classes: the label for each instance with a category ranging in [0, #class]. + + Returns: + results (list[Instances]): length `N` list of `Instances`s containing the + detected instances. 
Returned during inference only; may be [] + during training. + losses (dict[str: Tensor]): mapping from a named loss to a tensor + storing the loss. Used during training only. + """ + raise NotImplementedError() + + +class Res5ROIHeads(ROIHeads): + """ + The ROIHeads in a typical "C4" R-CNN model, where the heads share the + cropping and the per-region feature computation by a Res5 block. + """ + + def __init__(self, cfg, input_shape): + super().__init__(cfg, input_shape) + + assert len(self.in_features) == 1 + + # fmt: off + pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE + pooler_scales = (1.0 / self.feature_strides[self.in_features[0]], ) + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + + self.pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type=pooler_type, + ) + + self.res5, out_channels = self._build_res5_block(cfg) + self.box_predictor = FastRCNNOutputLayers(cfg, out_channels, + self.num_classes, + self.cls_agnostic_bbox_reg) + + def _build_res5_block(self, cfg): + # fmt: off + stage_channel_factor = 2**3 # res5 is 8x res2 + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + bottleneck_channels = num_groups * width_per_group * stage_channel_factor + out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor + stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 + norm = cfg.MODEL.RESNETS.NORM + assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \ + 'Deformable conv is not yet supported in res5 head.' + # fmt: on + + blocks = make_stage( + BottleneckBlock, + 3, + first_stride=2, + in_channels=out_channels // 2, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + norm=norm, + stride_in_1x1=stride_in_1x1, + ) + return nn.Sequential(*blocks), out_channels + + def _shared_roi_transform(self, features, boxes): + x = self.pooler(features, boxes) + x = self.res5(x) + return x + + def forward(self, images, features, proposals, targets=None): + """ + See :class:`ROIHeads.forward`. + """ + del images + + if self.training: + proposals = self.label_and_sample_proposals(proposals, targets) + del targets + + proposal_boxes = [x.proposal_boxes for x in proposals] + box_features = self._shared_roi_transform( + [features[f] for f in self.in_features], proposal_boxes) + feature_pooled = box_features.mean(dim=[2, 3]) # pooled to 1x1 + pred_class_logits, pred_proposal_deltas = self.box_predictor( + feature_pooled) + del feature_pooled + + outputs = FastRCNNOutputs( + self.box2box_transform, + pred_class_logits, + pred_proposal_deltas, + proposals, + self.smooth_l1_beta, + ) + + if self.training: + del features + losses = outputs.losses() + return [], losses + else: + pred_instances, _ = outputs.inference( + self.test_score_thresh, + self.test_nms_thresh, + self.test_detections_per_img, + ) + return pred_instances, {} diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/__init__.py b/modelscope/models/cv/image_defrcn_fewshot/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py b/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py new file mode 100644 index 00000000..bc118ff2 --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py @@ -0,0 +1,81 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
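The requirements_check module added below pins DeFRCN to detectron2 0.3.x and torch 1.11.x and raises an ImportError when either check fails. A minimal usage sketch follows (editorial illustration only; the call site is an assumption and is not shown in this patch):

    from modelscope.models.cv.image_defrcn_fewshot.utils.requirements_check import requires_version

    # Raises ImportError listing every unmet requirement; returns None when
    # detectron2 and torch are importable at the pinned (major, minor) versions
    # 0.3.x and 1.11.x. Intended to be called before the DeFRCN model is built.
    requires_version()
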
+ +import importlib +import sys +from collections import OrderedDict + +from packaging import version + +from modelscope.utils.import_utils import _torch_available + +if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata + +DETECTRON2_REQUIRED_VERSION = version.parse('0.3') + + +def is_detectron2_version_available(): + _detectron2_available = importlib.util.find_spec('detectron2') is not None + _detectron2_version_available = False + if _detectron2_available: + _detectron2_version = version.parse( + importlib_metadata.version('detectron2')) + _detectron2_version_available = (_detectron2_version.major, + _detectron2_version.minor) == ( + DETECTRON2_REQUIRED_VERSION.major, + DETECTRON2_REQUIRED_VERSION.minor) + + return _detectron2_version_available + + +TORCH_REQUIRED_VERSION = version.parse('1.11') + + +def is_torch_version_available(): + _torch_version_available = False + if _torch_available: + torch_version = version.parse(importlib_metadata.version('torch')) + _torch_version_available = (torch_version.major, + torch_version.minor) == ( + TORCH_REQUIRED_VERSION.major, + TORCH_REQUIRED_VERSION.minor) + + return _torch_version_available + + +DETECTRON2_IMPORT_ERROR = """ +{0} requires the detectron2-0.3 but it was not found in your environment. +You can install it from modelscope lib with pip: +`pip install detectron2==0.3` +""" + +TORCH_VERSION_IMPORT_ERROR = """ +{0} requires the torch-1.11 but it was not found in your environment. You can install it with pip: +`pip install torch==1.11` +""" + +REQUIREMENTS_MAAPING_VERSION = OrderedDict([ + ('detectron2-0.3', (is_detectron2_version_available, + DETECTRON2_IMPORT_ERROR)), + ('torch-1.11', (is_torch_version_available, TORCH_VERSION_IMPORT_ERROR)), +]) + +REQUIREMENTS = ['detectron2-0.3', 'torch-1.11'] + + +def requires_version(): + checks = [] + for req in REQUIREMENTS: + if req in REQUIREMENTS_MAAPING_VERSION: + check = REQUIREMENTS_MAAPING_VERSION[req] + else: + raise NotImplementedError('{} do not supported check'.format(req)) + checks.append(check) + + failed = [ + msg.format('DeFRCN') for available, msg in checks if not available() + ] + if failed: + raise ImportError(''.join(failed)) diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/voc_register.py b/modelscope/models/cv/image_defrcn_fewshot/utils/voc_register.py new file mode 100644 index 00000000..7a94066e --- /dev/null +++ b/modelscope/models/cv/image_defrcn_fewshot/utils/voc_register.py @@ -0,0 +1,342 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/data/meta_voc.py + +import os +import xml.etree.ElementTree as ET + +import numpy as np +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures import BoxMode +from fvcore.common.file_io import PathManager + +# PASCAL VOC categories +PASCAL_VOC_ALL_CATEGORIES = { + 1: [ + 'aeroplane', + 'bicycle', + 'boat', + 'bottle', + 'car', + 'cat', + 'chair', + 'diningtable', + 'dog', + 'horse', + 'person', + 'pottedplant', + 'sheep', + 'train', + 'tvmonitor', + 'bird', + 'bus', + 'cow', + 'motorbike', + 'sofa', + ], + 2: [ + 'bicycle', + 'bird', + 'boat', + 'bus', + 'car', + 'cat', + 'chair', + 'diningtable', + 'dog', + 'motorbike', + 'person', + 'pottedplant', + 'sheep', + 'train', + 'tvmonitor', + 'aeroplane', + 'bottle', + 'cow', + 'horse', + 'sofa', + ], + 3: [ + 'aeroplane', + 'bicycle', + 'bird', + 'bottle', + 'bus', + 
'car', + 'chair', + 'cow', + 'diningtable', + 'dog', + 'horse', + 'person', + 'pottedplant', + 'train', + 'tvmonitor', + 'boat', + 'cat', + 'motorbike', + 'sheep', + 'sofa', + ] +} + +PASCAL_VOC_NOVEL_CATEGORIES = { + 1: ['bird', 'bus', 'cow', 'motorbike', 'sofa'], + 2: ['aeroplane', 'bottle', 'cow', 'horse', 'sofa'], + 3: ['boat', 'cat', 'motorbike', 'sheep', 'sofa'] +} + +PASCAL_VOC_BASE_CATEGORIES = { + 1: [ + 'aeroplane', + 'bicycle', + 'boat', + 'bottle', + 'car', + 'cat', + 'chair', + 'diningtable', + 'dog', + 'horse', + 'person', + 'pottedplant', + 'sheep', + 'train', + 'tvmonitor', + ], + 2: [ + 'bicycle', + 'bird', + 'boat', + 'bus', + 'car', + 'cat', + 'chair', + 'diningtable', + 'dog', + 'motorbike', + 'person', + 'pottedplant', + 'sheep', + 'train', + 'tvmonitor', + ], + 3: [ + 'aeroplane', + 'bicycle', + 'bird', + 'bottle', + 'bus', + 'car', + 'chair', + 'cow', + 'diningtable', + 'dog', + 'horse', + 'person', + 'pottedplant', + 'train', + 'tvmonitor', + ] +} + + +def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str, + classnames: str): + """ + Load Pascal VOC detection annotations to Detectron2 format. + Args: + dirname: Contain "Annotations", "ImageSets", "JPEGImages" + split (str): one of "train", "test", "val", "trainval" + """ + is_shots = 'shot' in name + dicts = [] + if is_shots: + fileids = {} + # split_dir = os.path.join("datasets", "vocsplit") + split_dir = os.path.join(root, 'vocsplit') + shot = name.split('_')[-2].split('shot')[0] + seed = int(name.split('_seed')[-1]) + split_dir = os.path.join(split_dir, 'seed{}'.format(seed)) + for cls in classnames: + with PathManager.open( + os.path.join(split_dir, + 'box_{}shot_{}_train.txt'.format(shot, + cls))) as f: + fileids_ = np.loadtxt(f, dtype=np.str).tolist() + if isinstance(fileids_, str): + fileids_ = [fileids_] + fileids_ = [ + fid.split('/')[-1].split('.jpg')[0] for fid in fileids_ + ] + fileids[cls] = fileids_ + + for cls, fileids_ in fileids.items(): + dicts_ = [] + for fileid in fileids_: + year = '2012' if '_' in fileid else '2007' + # dirname = os.path.join("datasets", "VOC{}".format(year)) + # anno_file = os.path.join(dirname, "Annotations", fileid + ".xml") + # jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg") + + dir_voc = os.path.join(root, 'VOC{}'.format(year)) + anno_file = os.path.join(dir_voc, 'Annotations', + fileid + '.xml') + jpeg_file = os.path.join(dir_voc, 'JPEGImages', + fileid + '.jpg') + + tree = ET.parse(anno_file) + + for obj in tree.findall('object'): + r = { + 'file_name': jpeg_file, + 'image_id': fileid, + 'height': int(tree.findall('./size/height')[0].text), + 'width': int(tree.findall('./size/width')[0].text), + } + cls_ = obj.find('name').text + if cls != cls_: + continue + bbox = obj.find('bndbox') + bbox = [ + float(bbox.find(x).text) + for x in ['xmin', 'ymin', 'xmax', 'ymax'] + ] + bbox[0] -= 1.0 + bbox[1] -= 1.0 + + instances = [{ + 'category_id': classnames.index(cls), + 'bbox': bbox, + 'bbox_mode': BoxMode.XYXY_ABS, + }] + r['annotations'] = instances + dicts_.append(r) + if len(dicts_) > int(shot): + dicts_ = np.random.choice(dicts_, int(shot), replace=False) + dicts.extend(dicts_) + else: + with PathManager.open( + os.path.join(root, dirname, 'ImageSets', 'Main', + split + '.txt')) as f: + fileids = np.loadtxt(f, dtype=np.str) + + for fileid in fileids: + anno_file = os.path.join(root, dirname, 'Annotations', + fileid + '.xml') + jpeg_file = os.path.join(root, dirname, 'JPEGImages', + fileid + '.jpg') + + tree = ET.parse(anno_file) + + 
r = { + 'file_name': jpeg_file, + 'image_id': fileid, + 'height': int(tree.findall('./size/height')[0].text), + 'width': int(tree.findall('./size/width')[0].text), + } + instances = [] + + for obj in tree.findall('object'): + cls = obj.find('name').text + if not (cls in classnames): + continue + bbox = obj.find('bndbox') + bbox = [ + float(bbox.find(x).text) + for x in ['xmin', 'ymin', 'xmax', 'ymax'] + ] + bbox[0] -= 1.0 + bbox[1] -= 1.0 + + instances.append({ + 'category_id': classnames.index(cls), + 'bbox': bbox, + 'bbox_mode': BoxMode.XYXY_ABS, + }) + r['annotations'] = instances + dicts.append(r) + + return dicts + + +def register_meta_voc(name, root, dirname, split, year, keepclasses, sid): + if keepclasses.startswith('base_novel'): + thing_classes = PASCAL_VOC_ALL_CATEGORIES[sid] + elif keepclasses.startswith('base'): + thing_classes = PASCAL_VOC_BASE_CATEGORIES[sid] + elif keepclasses.startswith('novel'): + thing_classes = PASCAL_VOC_NOVEL_CATEGORIES[sid] + + DatasetCatalog.register( + name, + lambda: load_filtered_voc_instances(name, root, dirname, split, + thing_classes), + ) + + MetadataCatalog.get(name).set( + thing_classes=thing_classes, + dirname=os.path.join(root, dirname), + year=year, + split=split, + base_classes=PASCAL_VOC_BASE_CATEGORIES[sid], + novel_classes=PASCAL_VOC_NOVEL_CATEGORIES[sid], + ) + + +def register_all_voc(root='datasets'): + + METASPLITS = [ + ('voc_2007_trainval_base1', 'VOC2007', 'trainval', 'base1', 1), + ('voc_2007_trainval_base2', 'VOC2007', 'trainval', 'base2', 2), + ('voc_2007_trainval_base3', 'VOC2007', 'trainval', 'base3', 3), + ('voc_2012_trainval_base1', 'VOC2012', 'trainval', 'base1', 1), + ('voc_2012_trainval_base2', 'VOC2012', 'trainval', 'base2', 2), + ('voc_2012_trainval_base3', 'VOC2012', 'trainval', 'base3', 3), + ('voc_2007_trainval_all1', 'VOC2007', 'trainval', 'base_novel_1', 1), + ('voc_2007_trainval_all2', 'VOC2007', 'trainval', 'base_novel_2', 2), + ('voc_2007_trainval_all3', 'VOC2007', 'trainval', 'base_novel_3', 3), + ('voc_2012_trainval_all1', 'VOC2012', 'trainval', 'base_novel_1', 1), + ('voc_2012_trainval_all2', 'VOC2012', 'trainval', 'base_novel_2', 2), + ('voc_2012_trainval_all3', 'VOC2012', 'trainval', 'base_novel_3', 3), + ('voc_2007_test_base1', 'VOC2007', 'test', 'base1', 1), + ('voc_2007_test_base2', 'VOC2007', 'test', 'base2', 2), + ('voc_2007_test_base3', 'VOC2007', 'test', 'base3', 3), + ('voc_2007_test_novel1', 'VOC2007', 'test', 'novel1', 1), + ('voc_2007_test_novel2', 'VOC2007', 'test', 'novel2', 2), + ('voc_2007_test_novel3', 'VOC2007', 'test', 'novel3', 3), + ('voc_2007_test_all1', 'VOC2007', 'test', 'base_novel_1', 1), + ('voc_2007_test_all2', 'VOC2007', 'test', 'base_novel_2', 2), + ('voc_2007_test_all3', 'VOC2007', 'test', 'base_novel_3', 3), + ] + for prefix in ['all', 'novel']: + for sid in range(1, 4): + for shot in [1, 2, 3, 5, 10]: + for year in [2007, 2012]: + for seed in range(30): + seed = '_seed{}'.format(seed) + name = 'voc_{}_trainval_{}{}_{}shot{}'.format( + year, prefix, sid, shot, seed) + dirname = 'VOC{}'.format(year) + img_file = '{}_{}shot_split_{}_trainval'.format( + prefix, shot, sid) + keepclasses = ('base_novel_{}'.format(sid) if prefix + == 'all' else 'novel{}'.format(sid)) + METASPLITS.append( + (name, dirname, img_file, keepclasses, sid)) + + for name, dirname, split, keepclasses, sid in METASPLITS: + if name in DatasetCatalog: + continue + + year = 2007 if '2007' in name else 2012 + register_meta_voc( + name, + root, + dirname, + split, + year, + keepclasses, + sid, + ) + 
MetadataCatalog.get(name).evaluator_type = 'pascal_voc' diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index 4e8fc0ed..b2c43c4a 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -1,10 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os -from copy import deepcopy from typing import Any, Dict, Union import torch.cuda -from torch.nn.parallel import DataParallel, DistributedDataParallel from modelscope.metainfo import Models from modelscope.models.base import Tensor @@ -38,34 +36,6 @@ class NAFNetForImageDenoise(TorchModel): self.loss = PSNRLoss() self.model = self._load_pretrained(self.model, model_path) - def _load_pretrained(self, - net, - load_path, - strict=True, - param_key='params'): - if isinstance(net, (DataParallel, DistributedDataParallel)): - net = net.module - load_net = torch.load( - load_path, map_location=lambda storage, loc: storage) - if param_key is not None: - if param_key not in load_net and 'params' in load_net: - param_key = 'params' - logger.info( - f'Loading: {param_key} does not exist, use params.') - if param_key in load_net: - load_net = load_net[param_key] - logger.info( - f'Loading {net.__class__.__name__} model from {load_path}, with param key: [{param_key}].' - ) - # remove unnecessary 'module.' - for k, v in deepcopy(load_net).items(): - if k.startswith('module.'): - load_net[k[7:]] = v - load_net.pop(k) - net.load_state_dict(load_net, strict=strict) - logger.info('load model done.') - return net - def _train_forward(self, input: Tensor, target: Tensor) -> Dict[str, Tensor]: preds = self.model(input) diff --git a/modelscope/models/cv/image_face_fusion/__init__.py b/modelscope/models/cv/image_face_fusion/__init__.py new file mode 100644 index 00000000..5c77e231 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .image_face_fusion import ImageFaceFusion + +else: + _import_structure = {'image_face_fusion': ['ImageFaceFusion']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_face_fusion/facegan/__init__.py b/modelscope/models/cv/image_face_fusion/facegan/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_face_fusion/facegan/gan_wrap.py b/modelscope/models/cv/image_face_fusion/facegan/gan_wrap.py new file mode 100644 index 00000000..c46b17eb --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facegan/gan_wrap.py @@ -0,0 +1,93 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
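The gan_wrap module added below wraps the StyleGAN2-style FullGenerator (defined in facegan/model.py later in this patch) as a face-enhancement helper that runs at a fixed square resolution. A rough usage sketch (editorial illustration only; the checkpoint and image paths are assumptions, not taken from this patch):

    import cv2
    from modelscope.models.cv.image_face_fusion.facegan.gan_wrap import GANWrap

    # 'facegan.pth' and 'face_crop.jpg' are placeholder paths for illustration.
    gan = GANWrap(model_path='facegan.pth', size=256, device='cpu')
    face = cv2.imread('face_crop.jpg')       # uint8 BGR crop, roughly square
    enhanced, faces = gan.process([face])    # lists of uint8 images; 'enhanced' is
                                             # resized back to the input crop size,
                                             # 'faces' stays at the model resolution
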
+import os + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torchvision import transforms + +from .model import FullGenerator + + +class GANWrap(object): + + def __init__(self, + model_path, + size=256, + channel_multiplier=1, + device='cpu'): + self.device = device + self.mfile = model_path + self.transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), + inplace=True), + ]) + self.batchSize = 2 + self.n_mlp = 8 + self.resolution = size + self.load_model(channel_multiplier) + + def load_model(self, channel_multiplier=2): + self.model = FullGenerator(self.resolution, 512, self.n_mlp, + channel_multiplier).to(self.device) + pretrained_dict = torch.load( + self.mfile, map_location=torch.device('cpu')) + self.model.load_state_dict(pretrained_dict) + self.model.eval() + + def process_tensor(self, img_t, return_face=True): + b, c, h, w = img_t.shape + img_t = F.interpolate(img_t, (self.resolution, self.resolution)) + + with torch.no_grad(): + out, __ = self.model(img_t) + + out = F.interpolate(out, (w, h)) + return out + + def process(self, ims, return_face=True): + res = [] + faces = [] + for i in range(0, len(ims), self.batchSize): + sizes = [] + imt = None + for im in ims[i:i + self.batchSize]: + sizes.append(im.shape[0]) + im = cv2.resize(im, (self.resolution, self.resolution)) + im_pil = Image.fromarray(im) + imt = self.img2tensor(im_pil) if imt is None else torch.cat( + (imt, self.img2tensor(im_pil)), dim=0) + + imt = torch.flip(imt, [1]) + with torch.no_grad(): + img_outs, __ = self.model(imt) + + for sz, img_out in zip(sizes, img_outs): + img = self.tensor2img(img_out) + if return_face: + faces.append(img) + img = cv2.resize(img, (sz, sz), interpolation=cv2.INTER_AREA) + res.append(img) + + return res, faces + + def img2tensor(self, img): + img_t = self.transform(img).to(self.device) + img_t = torch.unsqueeze(img_t, 0) + return img_t + + def tensor2img(self, image_tensor, bytes=255.0, imtype=np.uint8): + if image_tensor.dim() == 3: + image_numpy = image_tensor.cpu().float().numpy() + else: + image_numpy = image_tensor[0].cpu().float().numpy() + image_numpy = np.transpose(image_numpy, (1, 2, 0)) + image_numpy = image_numpy[:, :, ::-1] + image_numpy = np.clip( + image_numpy * np.asarray([0.5, 0.5, 0.5]) + + np.asarray([0.5, 0.5, 0.5]), 0, 1) + image_numpy = image_numpy * bytes + return image_numpy.astype(imtype) diff --git a/modelscope/models/cv/image_face_fusion/facegan/model.py b/modelscope/models/cv/image_face_fusion/facegan/model.py new file mode 100644 index 00000000..eb142779 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facegan/model.py @@ -0,0 +1,788 @@ +# The implementation is adopted from stylegan2-pytorch, +# made public available under the MIT License at https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py +import math +import random + +import torch +from torch import nn +from torch.nn import functional as F + +from .op import FusedLeakyReLU, fused_leaky_relu, upfirdn2d + +isconcat = True +sss = 2 if isconcat else 1 +ratio = 2 + + +class PixelNorm(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, input): + return input * torch.rsqrt( + torch.mean(input**2, dim=1, keepdim=True) + 1e-8) + + +def make_kernel(k): + k = torch.tensor(k, dtype=torch.float32) + + if k.ndim == 1: + k = k[None, :] * k[:, None] + + k /= k.sum() + + return k + + +class Upsample(nn.Module): + + def __init__(self, kernel, factor=2): 
+ super().__init__() + + self.factor = factor + kernel = make_kernel(kernel) * (factor**2) + self.register_buffer('kernel', kernel) + + p = kernel.shape[0] - factor + + pad0 = (p + 1) // 2 + factor - 1 + pad1 = p // 2 + + self.pad = (pad0, pad1) + + def forward(self, input): + out = upfirdn2d( + input, self.kernel, up=self.factor, down=1, pad=self.pad) + + return out + + +class Downsample(nn.Module): + + def __init__(self, kernel, factor=2): + super().__init__() + + self.factor = factor + kernel = make_kernel(kernel) + self.register_buffer('kernel', kernel) + + p = kernel.shape[0] - factor + + pad0 = (p + 1) // 2 + pad1 = p // 2 + + self.pad = (pad0, pad1) + + def forward(self, input): + out = upfirdn2d( + input, self.kernel, up=1, down=self.factor, pad=self.pad) + + return out + + +class Blur(nn.Module): + + def __init__(self, kernel, pad, upsample_factor=1): + super().__init__() + + kernel = make_kernel(kernel) + + if upsample_factor > 1: + kernel = kernel * (upsample_factor**2) + + self.register_buffer('kernel', kernel) + + self.pad = pad + + def forward(self, input): + out = upfirdn2d(input, self.kernel, pad=self.pad) + + return out + + +class EqualConv2d(nn.Module): + + def __init__(self, + in_channel, + out_channel, + kernel_size, + stride=1, + padding=0, + bias=True): + super().__init__() + + self.weight = nn.Parameter( + torch.randn(out_channel, in_channel, kernel_size, kernel_size)) + self.scale = 1 / math.sqrt(in_channel * kernel_size**2) + + self.stride = stride + self.padding = padding + + if bias: + self.bias = nn.Parameter(torch.zeros(out_channel)) + + else: + self.bias = None + + def forward(self, input): + out = F.conv2d( + input, + self.weight * self.scale, + bias=self.bias, + stride=self.stride, + padding=self.padding, + ) + + return out + + def __repr__(self): + return ( + f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},' + f' {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})' + ) + + +class EqualLinear(nn.Module): + + def __init__(self, + in_dim, + out_dim, + bias=True, + bias_init=0, + lr_mul=1, + activation=None): + super().__init__() + + self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) + + else: + self.bias = None + + self.activation = activation + + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + if self.activation: + out = F.linear(input, self.weight * self.scale) + out = fused_leaky_relu(out, self.bias * self.lr_mul) + + else: + out = F.linear( + input, self.weight * self.scale, bias=self.bias * self.lr_mul) + + return out + + def __repr__(self): + return ( + f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})' + ) + + +class ScaledLeakyReLU(nn.Module): + + def __init__(self, negative_slope=0.2): + super().__init__() + + self.negative_slope = negative_slope + + def forward(self, input): + out = F.leaky_relu(input, negative_slope=self.negative_slope) + + return out * math.sqrt(2) + + +class ModulatedConv2d(nn.Module): + + def __init__( + self, + in_channel, + out_channel, + kernel_size, + style_dim, + demodulate=True, + upsample=False, + downsample=False, + blur_kernel=[1, 3, 3, 1], + ): + super().__init__() + + self.eps = 1e-8 + self.kernel_size = kernel_size + self.in_channel = in_channel + self.out_channel = out_channel + self.upsample = upsample + self.downsample = downsample + + if upsample: + factor = 2 + p = (len(blur_kernel) - factor) - 
(kernel_size - 1) + pad0 = (p + 1) // 2 + factor - 1 + pad1 = p // 2 + 1 + + self.blur = Blur( + blur_kernel, pad=(pad0, pad1), upsample_factor=factor) + + if downsample: + factor = 2 + p = (len(blur_kernel) - factor) + (kernel_size - 1) + pad0 = (p + 1) // 2 + pad1 = p // 2 + + self.blur = Blur(blur_kernel, pad=(pad0, pad1)) + + fan_in = in_channel * kernel_size**2 + self.scale = 1 / math.sqrt(fan_in) + self.padding = kernel_size // 2 + + self.weight = nn.Parameter( + torch.randn(1, out_channel, in_channel, kernel_size, kernel_size)) + + self.modulation = EqualLinear(style_dim, in_channel, bias_init=1) + + self.demodulate = demodulate + + def __repr__(self): + return ( + f'{self.__class__.__name__}({self.in_channel}, {self.out_channel}, {self.kernel_size}, ' + f'upsample={self.upsample}, downsample={self.downsample})') + + def forward(self, input, style): + batch, in_channel, height, width = input.shape + + style = self.modulation(style).view(batch, 1, in_channel, 1, 1) + weight = self.scale * self.weight * style + + if self.demodulate: + demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + 1e-8) + weight = weight * demod.view(batch, self.out_channel, 1, 1, 1) + + weight = weight.view(batch * self.out_channel, in_channel, + self.kernel_size, self.kernel_size) + + if self.upsample: + input = input.view(1, batch * in_channel, height, width) + weight = weight.view(batch, self.out_channel, in_channel, + self.kernel_size, self.kernel_size) + weight = weight.transpose(1, 2).reshape(batch * in_channel, + self.out_channel, + self.kernel_size, + self.kernel_size) + out = F.conv_transpose2d( + input, weight, padding=0, stride=2, groups=batch) + _, _, height, width = out.shape + out = out.view(batch, self.out_channel, height, width) + out = self.blur(out) + + elif self.downsample: + input = self.blur(input) + _, _, height, width = input.shape + input = input.view(1, batch * in_channel, height, width) + out = F.conv2d(input, weight, padding=0, stride=2, groups=batch) + _, _, height, width = out.shape + out = out.view(batch, self.out_channel, height, width) + + else: + input = input.view(1, batch * in_channel, height, width) + out = F.conv2d(input, weight, padding=self.padding, groups=batch) + _, _, height, width = out.shape + out = out.view(batch, self.out_channel, height, width) + + return out + + +class NoiseInjection(nn.Module): + + def __init__(self): + super().__init__() + + self.weight = nn.Parameter(torch.zeros(1)) + + def forward(self, image, noise=None): + + if noise is not None: + if isconcat: + return torch.cat((image, self.weight * noise), dim=1) # concat + return image + self.weight * noise + + if noise is None: + batch, _, height, width = image.shape + noise = image.new_empty(batch, 1, height, width).normal_() + + return image + self.weight * noise + + +class ConstantInput(nn.Module): + + def __init__(self, channel, size=4): + super().__init__() + + self.input = nn.Parameter(torch.randn(1, channel, size, size)) + + def forward(self, input): + batch = input.shape[0] + out = self.input.repeat(batch, 1, 1, 1) + + return out + + +class StyledConv(nn.Module): + + def __init__( + self, + in_channel, + out_channel, + kernel_size, + style_dim, + upsample=False, + blur_kernel=[1, 3, 3, 1], + demodulate=True, + ): + super().__init__() + + self.conv = ModulatedConv2d( + in_channel, + out_channel, + kernel_size, + style_dim, + upsample=upsample, + blur_kernel=blur_kernel, + demodulate=demodulate, + ) + + self.noise = NoiseInjection() + self.activate = FusedLeakyReLU(out_channel * sss) + + def 
forward(self, input, style, noise=None): + out = self.conv(input, style) + out = self.noise(out, noise=noise) + # out = out + self.bias + out = self.activate(out) + + return out + + +class ToRGB(nn.Module): + + def __init__(self, + in_channel, + style_dim, + upsample=True, + blur_kernel=[1, 3, 3, 1]): + super().__init__() + + if upsample: + self.upsample = Upsample(blur_kernel) + + self.conv = ModulatedConv2d( + in_channel, 3, 1, style_dim, demodulate=False) + self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1)) + + def forward(self, input, style, skip=None): + out = self.conv(input, style) + out = out + self.bias + + if skip is not None: + skip = self.upsample(skip) + + out = out + skip + + return out + + +class Generator(nn.Module): + + def __init__( + self, + size, + style_dim, + n_mlp, + channel_multiplier=2, + blur_kernel=[1, 3, 3, 1], + lr_mlp=0.01, + ): + super().__init__() + + self.size = size + self.n_mlp = n_mlp + self.style_dim = style_dim + + layers = [PixelNorm()] + + for i in range(n_mlp): + layers.append( + EqualLinear( + style_dim, + style_dim, + lr_mul=lr_mlp, + activation='fused_lrelu')) + + self.style = nn.Sequential(*layers) + + self.channels = { + 4: 512 // ratio, + 8: 512 // ratio, + 16: 512 // ratio, + 32: 512 // ratio, + 64: 256 // ratio * channel_multiplier, + 128: 128 // ratio * channel_multiplier, + 256: 64 // ratio * channel_multiplier, + 512: 32 // ratio * channel_multiplier, + 1024: 16 // ratio * channel_multiplier, + } + + self.input = ConstantInput(self.channels[4]) + self.conv1 = StyledConv( + self.channels[4], + self.channels[4], + 3, + style_dim, + blur_kernel=blur_kernel) + self.to_rgb1 = ToRGB(self.channels[4] * sss, style_dim, upsample=False) + + self.log_size = int(math.log(size, 2)) + + self.convs = nn.ModuleList() + self.upsamples = nn.ModuleList() + self.to_rgbs = nn.ModuleList() + + in_channel = self.channels[4] + + for i in range(3, self.log_size + 1): + out_channel = self.channels[2**i] + + self.convs.append( + StyledConv( + in_channel * sss, + out_channel, + 3, + style_dim, + upsample=True, + blur_kernel=blur_kernel, + )) + + self.convs.append( + StyledConv( + out_channel * sss, + out_channel, + 3, + style_dim, + blur_kernel=blur_kernel)) + + self.to_rgbs.append(ToRGB(out_channel * sss, style_dim)) + + in_channel = out_channel + + self.n_latent = self.log_size * 2 - 2 + + def make_noise(self): + device = self.input.input.device + + noises = [torch.randn(1, 1, 2**2, 2**2, device=device)] + + for i in range(3, self.log_size + 1): + for _ in range(2): + noises.append(torch.randn(1, 1, 2**i, 2**i, device=device)) + + return noises + + def mean_latent(self, n_latent): + latent_in = torch.randn( + n_latent, self.style_dim, device=self.input.input.device) + latent = self.style(latent_in).mean(0, keepdim=True) + + return latent + + def get_latent(self, input): + return self.style(input) + + def forward( + self, + styles, + return_latents=False, + inject_index=None, + truncation=1, + truncation_latent=None, + input_is_latent=False, + noise=None, + ): + if not input_is_latent: + styles = [self.style(s) for s in styles] + + if noise is None: + noise = [] + batch = styles[0].shape[0] + for i in range(self.n_mlp + 1): + size = 2**(i + 2) + noise.append( + torch.randn( + batch, + self.channels[size], + size, + size, + device=styles[0].device)) + + if truncation < 1: + style_t = [] + + for style in styles: + style_t.append(truncation_latent + + truncation * (style - truncation_latent)) + + styles = style_t + + if len(styles) < 2: + inject_index = self.n_latent + 
+ latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + + else: + if inject_index is None: + inject_index = random.randint(1, self.n_latent - 1) + + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = styles[1].unsqueeze(1).repeat( + 1, self.n_latent - inject_index, 1) + + latent = torch.cat([latent, latent2], 1) + + out = self.input(latent) + out = self.conv1(out, latent[:, 0], noise=noise[0]) + + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + noise_i = 1 + + for conv1, conv2, to_rgb in zip(self.convs[::2], self.convs[1::2], + self.to_rgbs): + out = conv1(out, latent[:, i], noise=noise[(noise_i + 1) // 2]) + out = conv2(out, latent[:, i + 1], noise=noise[(noise_i + 2) // 2]) + skip = to_rgb(out, latent[:, i + 2], skip) + + i += 2 + noise_i += 2 + + image = skip + + if return_latents: + return image, latent + + else: + return image, None + + +class ConvLayer(nn.Sequential): + + def __init__( + self, + in_channel, + out_channel, + kernel_size, + downsample=False, + blur_kernel=[1, 3, 3, 1], + bias=True, + activate=True, + ): + layers = [] + + if downsample: + factor = 2 + p = (len(blur_kernel) - factor) + (kernel_size - 1) + pad0 = (p + 1) // 2 + pad1 = p // 2 + + layers.append(Blur(blur_kernel, pad=(pad0, pad1))) + + stride = 2 + self.padding = 0 + + else: + stride = 1 + self.padding = kernel_size // 2 + + layers.append( + EqualConv2d( + in_channel, + out_channel, + kernel_size, + padding=self.padding, + stride=stride, + bias=bias and not activate, + )) + + if activate: + if bias: + layers.append(FusedLeakyReLU(out_channel)) + + else: + layers.append(ScaledLeakyReLU(0.2)) + + super().__init__(*layers) + + +class ResBlock(nn.Module): + + def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]): + super().__init__() + + self.conv1 = ConvLayer(in_channel, in_channel, 3) + self.conv2 = ConvLayer(in_channel, out_channel, 3, downsample=True) + + self.skip = ConvLayer( + in_channel, + out_channel, + 1, + downsample=True, + activate=False, + bias=False) + + def forward(self, input): + out = self.conv1(input) + out = self.conv2(out) + + skip = self.skip(input) + out = (out + skip) / math.sqrt(2) + + return out + + +class Discriminator(nn.Module): + + def __init__(self, size, channel_multiplier=2, blur_kernel=[1, 3, 3, 1]): + super().__init__() + + channels = { + 4: 512, + 8: 512, + 16: 512, + 32: 512, + 64: 256 * channel_multiplier, + 128: 128 * channel_multiplier, + 256: 64 * channel_multiplier, + 512: 32 * channel_multiplier, + 1024: 16 * channel_multiplier, + } + + convs = [ConvLayer(3, channels[size], 1)] + + log_size = int(math.log(size, 2)) + + in_channel = channels[size] + + for i in range(log_size, 2, -1): + out_channel = channels[2**(i - 1)] + + convs.append(ResBlock(in_channel, out_channel, blur_kernel)) + + in_channel = out_channel + + self.convs = nn.Sequential(*convs) + + self.stddev_group = 4 + self.stddev_feat = 1 + + self.final_conv = ConvLayer(in_channel + 1, channels[4], 3) + self.final_linear = nn.Sequential( + EqualLinear( + channels[4] * 4 * 4, channels[4], activation='fused_lrelu'), + EqualLinear(channels[4], 1), + ) + + def forward(self, input): + out = self.convs(input) + + batch, channel, height, width = out.shape + group = min(batch, self.stddev_group) + stddev = out.view(group, -1, self.stddev_feat, + channel // self.stddev_feat, height, width) + stddev = torch.sqrt(stddev.var(0, unbiased=False) + 1e-8) + stddev = stddev.mean([2, 3, 4], keepdims=True).squeeze(2) + stddev = stddev.repeat(group, 1, height, width) + out = 
torch.cat([out, stddev], 1) + + out = self.final_conv(out) + + out = out.view(batch, -1) + out = self.final_linear(out) + return out + + +class FullGenerator(nn.Module): + + def __init__( + self, + size, + style_dim, + n_mlp, + channel_multiplier=2, + blur_kernel=[1, 3, 3, 1], + lr_mlp=0.01, + ): + super().__init__() + channels = { + 4: 512 // ratio, + 8: 512 // ratio, + 16: 512 // ratio, + 32: 512 // ratio, + 64: 256 // ratio * channel_multiplier, + 128: 128 // ratio * channel_multiplier, + 256: 64 // ratio * channel_multiplier, + 512: 32 // ratio * channel_multiplier, + 1024: 16 // ratio * channel_multiplier, + } + + self.log_size = int(math.log(size, 2)) + self.generator = Generator( + size, + style_dim, + n_mlp, + channel_multiplier=channel_multiplier, + blur_kernel=blur_kernel, + lr_mlp=lr_mlp) + + conv = [ConvLayer(3, channels[size], 1)] + self.ecd0 = nn.Sequential(*conv) + in_channel = channels[size] + + self.names = ['ecd%d' % i for i in range(self.log_size - 1)] + for i in range(self.log_size, 2, -1): + out_channel = channels[2**(i - 1)] + conv = [ConvLayer(in_channel, out_channel, 3, downsample=True)] + setattr(self, self.names[self.log_size - i + 1], + nn.Sequential(*conv)) + in_channel = out_channel + self.final_linear = nn.Sequential( + EqualLinear( + channels[4] * 4 * 4, style_dim, activation='fused_lrelu')) + + def forward( + self, + inputs, + return_latents=False, + inject_index=None, + truncation=1, + truncation_latent=None, + input_is_latent=False, + ): + noise = [] + for i in range(self.log_size - 1): + ecd = getattr(self, self.names[i]) + inputs = ecd(inputs) + noise.append(inputs) + inputs = inputs.view(inputs.shape[0], -1) + outs = self.final_linear(inputs) + outs = self.generator([outs], + return_latents, + inject_index, + truncation, + truncation_latent, + input_is_latent, + noise=noise[::-1]) + + return outs diff --git a/modelscope/models/cv/image_face_fusion/facegan/op/__init__.py b/modelscope/models/cv/image_face_fusion/facegan/op/__init__.py new file mode 100644 index 00000000..74477cfb --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facegan/op/__init__.py @@ -0,0 +1,4 @@ +# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License +# at https://github.com/rosinality/stylegan2-pytorch +from .fused_act import FusedLeakyReLU, fused_leaky_relu +from .upfirdn2d import upfirdn2d diff --git a/modelscope/models/cv/image_face_fusion/facegan/op/conv2d_gradfix.py b/modelscope/models/cv/image_face_fusion/facegan/op/conv2d_gradfix.py new file mode 100644 index 00000000..a3aba91f --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facegan/op/conv2d_gradfix.py @@ -0,0 +1,228 @@ +# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License +# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/conv2d_gradfix.py +import contextlib +import warnings + +import torch +from torch import autograd +from torch.nn import functional as F + +enabled = True +weight_gradients_disabled = False + + +@contextlib.contextmanager +def no_weight_gradients(): + global weight_gradients_disabled + + old = weight_gradients_disabled + weight_gradients_disabled = True + yield + weight_gradients_disabled = old + + +def conv2d(input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1): + if could_use_op(input): + return conv2d_gradfix( + transpose=False, + weight_shape=weight.shape, + stride=stride, + padding=padding, + output_padding=0, + dilation=dilation, + groups=groups, 
+ ).apply(input, weight, bias) + + return F.conv2d( + input=input, + weight=weight, + bias=bias, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + ) + + +def conv_transpose2d( + input, + weight, + bias=None, + stride=1, + padding=0, + output_padding=0, + groups=1, + dilation=1, +): + if could_use_op(input): + return conv2d_gradfix( + transpose=True, + weight_shape=weight.shape, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + dilation=dilation, + ).apply(input, weight, bias) + + return F.conv_transpose2d( + input=input, + weight=weight, + bias=bias, + stride=stride, + padding=padding, + output_padding=output_padding, + dilation=dilation, + groups=groups, + ) + + +def could_use_op(input): + if (not enabled) or (not torch.backends.cudnn.enabled): + return False + + if input.device.type != 'cuda': + return False + + warnings.warn( + f'conv2d_gradfix not supported on PyTorch {torch.__version__}. Falling back to torch.nn.functional.conv2d().' + ) + + return False + + +def ensure_tuple(xs, ndim): + xs = tuple(xs) if isinstance(xs, (tuple, list)) else (xs, ) * ndim + + return xs + + +conv2d_gradfix_cache = dict() + + +def conv2d_gradfix(transpose, weight_shape, stride, padding, output_padding, + dilation, groups): + ndim = 2 + weight_shape = tuple(weight_shape) + stride = ensure_tuple(stride, ndim) + padding = ensure_tuple(padding, ndim) + output_padding = ensure_tuple(output_padding, ndim) + dilation = ensure_tuple(dilation, ndim) + + key = (transpose, weight_shape, stride, padding, output_padding, dilation, + groups) + if key in conv2d_gradfix_cache: + return conv2d_gradfix_cache[key] + + common_kwargs = dict( + stride=stride, padding=padding, dilation=dilation, groups=groups) + + def calc_output_padding(input_shape, output_shape): + if transpose: + return [0, 0] + + a = input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] + return [ + a - (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1) + for i in range(ndim) + ] + + class Conv2d(autograd.Function): + + @staticmethod + def forward(ctx, input, weight, bias): + if not transpose: + out = F.conv2d( + input=input, weight=weight, bias=bias, **common_kwargs) + + else: + out = F.conv_transpose2d( + input=input, + weight=weight, + bias=bias, + output_padding=output_padding, + **common_kwargs, + ) + + ctx.save_for_backward(input, weight) + + return out + + @staticmethod + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + grad_input, grad_weight, grad_bias = None, None, None + + if ctx.needs_input_grad[0]: + p = calc_output_padding( + input_shape=input.shape, output_shape=grad_output.shape) + grad_input = conv2d_gradfix( + transpose=(not transpose), + weight_shape=weight_shape, + output_padding=p, + **common_kwargs, + ).apply(grad_output, weight, None) + + if ctx.needs_input_grad[1] and not weight_gradients_disabled: + grad_weight = Conv2dGradWeight.apply(grad_output, input) + + if ctx.needs_input_grad[2]: + grad_bias = grad_output.sum((0, 2, 3)) + + return grad_input, grad_weight, grad_bias + + class Conv2dGradWeight(autograd.Function): + + @staticmethod + def forward(ctx, grad_output, input): + op = torch._C._jit_get_operation( + 'aten::cudnn_convolution_backward_weight' if not transpose else + 'aten::cudnn_convolution_transpose_backward_weight') + flags = [ + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic, + torch.backends.cudnn.allow_tf32, + ] + grad_weight = op( + weight_shape, + grad_output, + input, + padding, + stride, + 
dilation, + groups, + *flags, + ) + ctx.save_for_backward(grad_output, input) + + return grad_weight + + @staticmethod + def backward(ctx, grad_grad_weight): + grad_output, input = ctx.saved_tensors + grad_grad_output, grad_grad_input = None, None + + if ctx.needs_input_grad[0]: + grad_grad_output = Conv2d.apply(input, grad_grad_weight, None) + + if ctx.needs_input_grad[1]: + p = calc_output_padding( + input_shape=input.shape, output_shape=grad_output.shape) + grad_grad_input = conv2d_gradfix( + transpose=(not transpose), + weight_shape=weight_shape, + output_padding=p, + **common_kwargs, + ).apply(grad_output, grad_grad_weight, None) + + return grad_grad_output, grad_grad_input + + conv2d_gradfix_cache[key] = Conv2d + + return Conv2d diff --git a/modelscope/models/cv/image_face_fusion/facegan/op/fused_act.py b/modelscope/models/cv/image_face_fusion/facegan/op/fused_act.py new file mode 100644 index 00000000..7db45d3c --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facegan/op/fused_act.py @@ -0,0 +1,113 @@ +# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License +# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py +import torch +from torch import nn +from torch.autograd import Function +from torch.nn import functional as F + +def_lib = False + + +class FusedLeakyReLUFunctionBackward(Function): + + @staticmethod + def forward(ctx, grad_output, out, bias, negative_slope, scale): + ctx.save_for_backward(out) + ctx.negative_slope = negative_slope + ctx.scale = scale + + empty = grad_output.new_empty(0) + + grad_input = fused.fused_bias_act(grad_output.contiguous(), empty, out, + 3, 1, negative_slope, scale) + + dim = [0] + + if grad_input.ndim > 2: + dim += list(range(2, grad_input.ndim)) + + if bias: + grad_bias = grad_input.sum(dim).detach() + + else: + grad_bias = empty + + return grad_input, grad_bias + + @staticmethod + def backward(ctx, gradgrad_input, gradgrad_bias): + out, = ctx.saved_tensors + gradgrad_out = fused.fused_bias_act( + gradgrad_input.contiguous(), + gradgrad_bias, + out, + 3, + 1, + ctx.negative_slope, + ctx.scale, + ) + + return gradgrad_out, None, None, None, None + + +class FusedLeakyReLUFunction(Function): + + @staticmethod + def forward(ctx, input, bias, negative_slope, scale): + empty = input.new_empty(0) + + ctx.bias = bias is not None + + if bias is None: + bias = empty + + out = fused.fused_bias_act(input, bias, empty, 3, 0, negative_slope, + scale) + ctx.save_for_backward(out) + ctx.negative_slope = negative_slope + ctx.scale = scale + + return out + + @staticmethod + def backward(ctx, grad_output): + out, = ctx.saved_tensors + + grad_input, grad_bias = FusedLeakyReLUFunctionBackward.apply( + grad_output, out, ctx.bias, ctx.negative_slope, ctx.scale) + + if not ctx.bias: + grad_bias = None + + return grad_input, grad_bias, None, None + + +class FusedLeakyReLU(nn.Module): + + def __init__(self, channel, bias=True, negative_slope=0.2, scale=2**0.5): + super().__init__() + + if bias: + self.bias = nn.Parameter(torch.zeros(channel)) + + else: + self.bias = None + + self.negative_slope = negative_slope + self.scale = scale + + def forward(self, input): + return fused_leaky_relu(input, self.bias, self.negative_slope, + self.scale) + + +def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5): + if not def_lib: + if bias is not None: + rest_dim = [1] * (input.ndim - bias.ndim - 1) + return (F.leaky_relu( + input + bias.view(1, bias.shape[0], *rest_dim), + 
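The Conv2d and Conv2dGradWeight Functions above exist so that convolutions stay twice differentiable, which StyleGAN2-style gradient regularizers need. A minimal sketch of that double-backward pattern using the stock F.conv2d fallback; tensor shapes are illustrative only.

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 16, 16, requires_grad=True)
w = torch.randn(8, 3, 3, 3, requires_grad=True)

y = F.conv2d(x, w, padding=1)

# Gradient of the output w.r.t. the input, kept in the graph (create_graph=True)
# so that a penalty on it can itself be differentiated: the double-backward path
# that the custom Functions above implement explicitly.
grad_x, = torch.autograd.grad(y.sum(), x, create_graph=True)
penalty = grad_x.pow(2).sum()
penalty.backward()

print(w.grad.shape)  # torch.Size([8, 3, 3, 3])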
negative_slope=0.2) * scale) + + else: + return F.leaky_relu(input, negative_slope=0.2) * scale diff --git a/modelscope/models/cv/image_face_fusion/facegan/op/upfirdn2d.py b/modelscope/models/cv/image_face_fusion/facegan/op/upfirdn2d.py new file mode 100644 index 00000000..3b93c082 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facegan/op/upfirdn2d.py @@ -0,0 +1,198 @@ +# The implementation is adopted from stylegan2-pytorch, made public available under the MIT License +# at https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py +from collections import abc + +import torch +from torch.autograd import Function +from torch.nn import functional as F + +def_lib = False + + +class UpFirDn2dBackward(Function): + + @staticmethod + def forward(ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, + in_size, out_size): + + up_x, up_y = up + down_x, down_y = down + g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad + + grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1) + + grad_input = upfirdn2d_op.upfirdn2d( + grad_output, + grad_kernel, + down_x, + down_y, + up_x, + up_y, + g_pad_x0, + g_pad_x1, + g_pad_y0, + g_pad_y1, + ) + grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], + in_size[3]) + + ctx.save_for_backward(kernel) + + pad_x0, pad_x1, pad_y0, pad_y1 = pad + + ctx.up_x = up_x + ctx.up_y = up_y + ctx.down_x = down_x + ctx.down_y = down_y + ctx.pad_x0 = pad_x0 + ctx.pad_x1 = pad_x1 + ctx.pad_y0 = pad_y0 + ctx.pad_y1 = pad_y1 + ctx.in_size = in_size + ctx.out_size = out_size + + return grad_input + + @staticmethod + def backward(ctx, gradgrad_input): + kernel, = ctx.saved_tensors + + gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], + ctx.in_size[3], 1) + + gradgrad_out = upfirdn2d_op.upfirdn2d( + gradgrad_input, + kernel, + ctx.up_x, + ctx.up_y, + ctx.down_x, + ctx.down_y, + ctx.pad_x0, + ctx.pad_x1, + ctx.pad_y0, + ctx.pad_y1, + ) + # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], ctx.out_size[1], ctx.in_size[3]) + gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.in_size[1], + ctx.out_size[0], ctx.out_size[1]) + + return gradgrad_out, None, None, None, None, None, None, None, None + + +class UpFirDn2d(Function): + + @staticmethod + def forward(ctx, input, kernel, up, down, pad): + up_x, up_y = up + down_x, down_y = down + pad_x0, pad_x1, pad_y0, pad_y1 = pad + + kernel_h, kernel_w = kernel.shape + batch, channel, in_h, in_w = input.shape + ctx.in_size = input.shape + + input = input.reshape(-1, in_h, in_w, 1) + + ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1])) + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h + down_y) // down_y + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w + down_x) // down_x + ctx.out_size = (out_h, out_w) + + ctx.up = (up_x, up_y) + ctx.down = (down_x, down_y) + ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1) + + g_pad_x0 = kernel_w - pad_x0 - 1 + g_pad_y0 = kernel_h - pad_y0 - 1 + g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1 + g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1 + + ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1) + + out = upfirdn2d_op.upfirdn2d(input, kernel, up_x, up_y, down_x, down_y, + pad_x0, pad_x1, pad_y0, pad_y1) + # out = out.view(major, out_h, out_w, minor) + out = out.view(-1, channel, out_h, out_w) + + return out + + @staticmethod + def backward(ctx, grad_output): + kernel, grad_kernel = ctx.saved_tensors + + grad_input = None + + if ctx.needs_input_grad[0]: + grad_input = UpFirDn2dBackward.apply( + 
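When the fused CUDA extension is unavailable (def_lib is False), fused_leaky_relu above reduces to a per-channel bias add, LeakyReLU(0.2) and a sqrt(2) gain. A quick equivalence check of that fallback, with toy tensor sizes that are not taken from the patch.

import torch
import torch.nn.functional as F

x = torch.randn(4, 32, 8, 8)
bias = torch.randn(32)
scale = 2 ** 0.5

# Fallback path: broadcast the bias over the spatial dims, apply LeakyReLU(0.2),
# then rescale to keep activation magnitudes roughly constant.
rest_dim = [1] * (x.ndim - bias.ndim - 1)                       # [1, 1]
out = F.leaky_relu(x + bias.view(1, -1, *rest_dim), negative_slope=0.2) * scale

# Same computation without the view gymnastics, as a sanity check.
ref = F.leaky_relu(x + bias[None, :, None, None], 0.2) * scale
print(torch.allclose(out, ref))  # True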
grad_output, + kernel, + grad_kernel, + ctx.up, + ctx.down, + ctx.pad, + ctx.g_pad, + ctx.in_size, + ctx.out_size, + ) + + return grad_input, None, None, None, None + + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + if not isinstance(up, abc.Iterable): + up = (up, up) + + if not isinstance(down, abc.Iterable): + down = (down, down) + + if len(pad) == 2: + pad = (pad[0], pad[1], pad[0], pad[1]) + + if not def_lib: + out = upfirdn2d_native(input, kernel, *up, *down, *pad) + + return out + + +def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, + pad_y0, pad_y1): + _, channel, in_h, in_w = input.shape + input = input.reshape(-1, in_h, in_w, 1) + + _, in_h, in_w, minor = input.shape + kernel_h, kernel_w = kernel.shape + + out = input.view(-1, in_h, 1, in_w, 1, minor) + out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1]) + out = out.view(-1, in_h * up_y, in_w * up_x, minor) + + out = F.pad( + out, + [0, 0, + max(pad_x0, 0), + max(pad_x1, 0), + max(pad_y0, 0), + max(pad_y1, 0)]) + out = out[:, + max(-pad_y0, 0):out.shape[1] - max(-pad_y1, 0), + max(-pad_x0, 0):out.shape[2] - max(-pad_x1, 0)] + + out = out.permute(0, 3, 1, 2) + out = out.reshape( + [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape( + -1, + minor, + in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, + ) + out = out.permute(0, 2, 3, 1) + out = out[:, ::down_y, ::down_x, :] + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h + down_y) // down_y + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w + down_x) // down_x + + return out.view(-1, channel, out_h, out_w) diff --git a/modelscope/models/cv/image_face_fusion/facelib/__init__.py b/modelscope/models/cv/image_face_fusion/facelib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_face_fusion/facelib/align_trans.py b/modelscope/models/cv/image_face_fusion/facelib/align_trans.py new file mode 100644 index 00000000..554b0e7c --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facelib/align_trans.py @@ -0,0 +1,301 @@ +# The implementation here is modified based on InsightFace_Pytorch, originally MIT License and publicly available +# at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/mtcnn_pytorch/src/align_trans.py +import cv2 +import numpy as np + +from .matlab_cp2tform import get_similarity_transform_for_cv2 + +# reference facial points, a list of coordinates (x,y) +REFERENCE_FACIAL_POINTS = [[30.29459953, 51.69630051], + [65.53179932, 51.50139999], + [48.02519989, + 71.73660278], [33.54930115, 92.3655014], + [62.72990036, 92.20410156]] + +DEFAULT_CROP_SIZE = (96, 112) + + +class FaceWarpException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def get_reference_facial_points(output_size=None, + inner_padding_factor=0.0, + outer_padding=(0, 0), + default_square=False): + """ + Function: + ---------- + get reference 5 key points according to crop settings: + 0. Set default crop_size: + if default_square: + crop_size = (112, 112) + else: + crop_size = (96, 112) + 1. Pad the crop_size by inner_padding_factor in each side; + 2. Resize crop_size into (output_size - outer_padding*2), + pad into output_size with outer_padding; + 3. 
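upfirdn2d above performs upsampling by zero insertion, FIR filtering and downsampling in one pass. A usage sketch for 2x upsampling with a 4-tap binomial blur, assuming the package is importable under the path this patch adds; the factor**2 gain and the (2, 1) padding follow the usual StyleGAN2 upsampling recipe rather than anything stated in the patch.

import torch
# Import path taken from the file location added above; adjust if the layout differs.
from modelscope.models.cv.image_face_fusion.facegan.op.upfirdn2d import upfirdn2d

k = torch.tensor([1., 3., 3., 1.])
kernel = torch.outer(k, k)
kernel = kernel / kernel.sum()

x = torch.randn(1, 3, 16, 16)

# Upsample by 2 with zero insertion, then low-pass filter.  The gain of 4 (factor**2)
# compensates for the inserted zeros, and with a 4-tap kernel the padding (2, 1)
# keeps the output at exactly twice the input resolution:
# out = (16*2 + 2 + 1 - 4 + 1) // 1 = 32.
y = upfirdn2d(x, kernel * 4, up=2, down=1, pad=(2, 1))
print(y.shape)  # torch.Size([1, 3, 32, 32])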
Output reference_5point; + Parameters: + ---------- + @output_size: (w, h) or None + size of aligned face image + @inner_padding_factor: (w_factor, h_factor) + padding factor for inner (w, h) + @outer_padding: (w_pad, h_pad) + each row is a pair of coordinates (x, y) + @default_square: True or False + if True: + default crop_size = (112, 112) + else: + default crop_size = (96, 112); + !!! make sure, if output_size is not None: + (output_size - outer_padding) + = some_scale * (default crop_size * (1.0 + inner_padding_factor)) + Returns: + ---------- + @reference_5point: 5x2 np.array + each row is a pair of transformed coordinates (x, y) + """ + + tmp_5pts = np.array(REFERENCE_FACIAL_POINTS) + tmp_crop_size = np.array(DEFAULT_CROP_SIZE) + + # 0) make the inner region a square + if default_square: + size_diff = max(tmp_crop_size) - tmp_crop_size + tmp_5pts += size_diff / 2 + tmp_crop_size += size_diff + + if (output_size and output_size[0] == tmp_crop_size[0] + and output_size[1] == tmp_crop_size[1]): + return tmp_5pts + + if (inner_padding_factor == 0 and outer_padding == (0, 0)): + if output_size is None: + return tmp_5pts + else: + raise FaceWarpException( + 'No paddings to do, output_size must be None or {}'.format( + tmp_crop_size)) + + if not (0 <= inner_padding_factor <= 1.0): + raise FaceWarpException('Not (0 <= inner_padding_factor <= 1.0)') + + if ((inner_padding_factor > 0 or outer_padding[0] > 0 + or outer_padding[1] > 0) and output_size is None): + output_size = tmp_crop_size * \ + (1 + inner_padding_factor * 2).astype(np.int32) + output_size += np.array(outer_padding) + + if not (outer_padding[0] < output_size[0] + and outer_padding[1] < output_size[1]): + raise FaceWarpException('Not (outer_padding[0] < output_size[0]' + 'and outer_padding[1] < output_size[1])') + + # 1) pad the inner region according inner_padding_factor + if inner_padding_factor > 0: + size_diff = tmp_crop_size * inner_padding_factor * 2 + tmp_5pts += size_diff / 2 + tmp_crop_size += np.round(size_diff).astype(np.int32) + + # 2) resize the padded inner region + size_bf_outer_pad = np.array(output_size) - np.array(outer_padding) * 2 + + if size_bf_outer_pad[0] * tmp_crop_size[1] != size_bf_outer_pad[ + 1] * tmp_crop_size[0]: + raise FaceWarpException( + 'Must have (output_size - outer_padding)' + '= some_scale * (crop_size * (1.0 + inner_padding_factor)') + + scale_factor = size_bf_outer_pad[0].astype(np.float32) / tmp_crop_size[0] + tmp_5pts = tmp_5pts * scale_factor + tmp_crop_size = size_bf_outer_pad + + # 3) add outer_padding to make output_size + reference_5point = tmp_5pts + np.array(outer_padding) + tmp_crop_size = output_size + + return reference_5point + + +def get_affine_transform_matrix(src_pts, dst_pts): + """ + Function: + ---------- + get affine transform matrix 'tfm' from src_pts to dst_pts + Parameters: + ---------- + @src_pts: Kx2 np.array + source points matrix, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points matrix, each row is a pair of coordinates (x, y) + Returns: + ---------- + @tfm: 2x3 np.array + transform matrix from src_pts to dst_pts + """ + + tfm = np.float32([[1, 0, 0], [0, 1, 0]]) + n_pts = src_pts.shape[0] + ones = np.ones((n_pts, 1), src_pts.dtype) + src_pts_ = np.hstack([src_pts, ones]) + dst_pts_ = np.hstack([dst_pts, ones]) + + A, res, rank, s = np.linalg.lstsq(src_pts_, dst_pts_) + + if rank == 3: + tfm = np.float32([[A[0, 0], A[1, 0], A[2, 0]], + [A[0, 1], A[1, 1], A[2, 1]]]) + elif rank == 2: + tfm = np.float32([[A[0, 0], A[1, 0], 
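In practice get_reference_facial_points above is called with output_size left as None, and the 112x112 template is rescaled by the caller; the rescaling lines here mirror what warp_and_crop_face below does for the 256x256 crops used by the face-fusion model. The import path is taken from the file location in this patch.

from modelscope.models.cv.image_face_fusion.facelib.align_trans import \
    get_reference_facial_points

# 5-point template (eye centers, nose tip, mouth corners) for a square 112x112 crop.
ref_112 = get_reference_facial_points(default_square=True)
print(ref_112.shape)                    # (5, 2)

# Shrink the template slightly around the crop center, then rescale to 256x256,
# mirroring the two lines inside warp_and_crop_face.
ref_256 = (ref_112 - 112 / 2) * 0.85 + 112 / 2
ref_256 *= 256 / 112.
print(ref_256.min(), ref_256.max())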
0], [A[0, 1], A[1, 1], 0]]) + + return tfm + + +def warp_and_crop_face(src_img, + facial_pts, + reference_pts=None, + crop_size=(96, 112), + align_type='smilarity', + return_trans_inv=False): + """ + Function: + ---------- + apply affine transform 'trans' to uv + Parameters: + ---------- + @src_img: 3x3 np.array + input image + @facial_pts: could be + 1)a list of K coordinates (x,y) + or + 2) Kx2 or 2xK np.array + each row or col is a pair of coordinates (x, y) + @reference_pts: could be + 1) a list of K coordinates (x,y) + or + 2) Kx2 or 2xK np.array + each row or col is a pair of coordinates (x, y) + or + 3) None + if None, use default reference facial points + @crop_size: (w, h) + output face image size + @align_type: transform type, could be one of + 1) 'similarity': use similarity transform + 2) 'cv2_affine': use the first 3 points to do affine transform, + by calling cv2.getAffineTransform() + 3) 'affine': use all points to do affine transform + Returns: + ---------- + @face_img: output face image with size (w, h) = @crop_size + """ + + if reference_pts is None: + if crop_size[0] == 96 and crop_size[1] == 112: + reference_pts = REFERENCE_FACIAL_POINTS + else: + default_square = False + inner_padding_factor = 0 + outer_padding = (0, 0) + output_size = crop_size + + reference_pts = get_reference_facial_points( + output_size, inner_padding_factor, outer_padding, + default_square) + + ref_pts = np.float32(reference_pts) + ref_pts = (ref_pts - 112 / 2) * 0.85 + 112 / 2 + ref_pts *= crop_size[0] / 112. + ref_pts_shp = ref_pts.shape + if max(ref_pts_shp) < 3 or min(ref_pts_shp) != 2: + raise FaceWarpException( + 'reference_pts.shape must be (K,2) or (2,K) and K>2') + + if ref_pts_shp[0] == 2: + ref_pts = ref_pts.T + + src_pts = np.float32(facial_pts) + src_pts_shp = src_pts.shape + if max(src_pts_shp) < 3 or min(src_pts_shp) != 2: + raise FaceWarpException( + 'facial_pts.shape must be (K,2) or (2,K) and K>2') + + if src_pts_shp[0] == 2: + src_pts = src_pts.T + + if src_pts.shape != ref_pts.shape: + raise FaceWarpException( + 'facial_pts and reference_pts must have the same shape') + + if align_type == 'cv2_affine': + tfm = cv2.getAffineTransform(src_pts[0:3], ref_pts[0:3]) + elif align_type == 'affine': + tfm = get_affine_transform_matrix(src_pts, ref_pts) + else: + tfm, tfm_inv = get_similarity_transform_for_cv2(src_pts, ref_pts) + + face_img = cv2.warpAffine(src_img, tfm, (crop_size[0], crop_size[1])) + + if return_trans_inv: + return face_img, tfm_inv + else: + return face_img + + +def get_f5p(landmarks, np_img): + eye_left = find_pupil(landmarks[36:41], np_img) + eye_right = find_pupil(landmarks[42:47], np_img) + if eye_left is None or eye_right is None: + print('cannot find 5 points with find_pupil, used mean instead.!') + eye_left = landmarks[36:41].mean(axis=0) + eye_right = landmarks[42:47].mean(axis=0) + nose = landmarks[30] + mouth_left = landmarks[48] + mouth_right = landmarks[54] + f5p = [[eye_left[0], eye_left[1]], [eye_right[0], eye_right[1]], + [nose[0], nose[1]], [mouth_left[0], mouth_left[1]], + [mouth_right[0], mouth_right[1]]] + return f5p + + +def find_pupil(landmarks, np_img): + h, w, _ = np_img.shape + xmax = int(landmarks[:, 0].max()) + xmin = int(landmarks[:, 0].min()) + ymax = int(landmarks[:, 1].max()) + ymin = int(landmarks[:, 1].min()) + + if ymin >= ymax or xmin >= xmax or ymin < 0 or xmin < 0 or ymax > h or xmax > w: + return None + eye_img_bgr = np_img[ymin:ymax, xmin:xmax, :] + eye_img = cv2.cvtColor(eye_img_bgr, cv2.COLOR_BGR2GRAY) + eye_img = 
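get_affine_transform_matrix, completed just above, is an ordinary least-squares fit in homogeneous coordinates. A standalone check with synthetic correspondences; none of these values come from the patch.

import numpy as np

# Known affine map: rotation plus scale plus translation.
theta, s, t = np.deg2rad(10.0), 1.2, np.array([5.0, -3.0])
A_true = s * np.array([[np.cos(theta), -np.sin(theta)],
                       [np.sin(theta),  np.cos(theta)]])

src = np.random.rand(5, 2) * 100
dst = src @ A_true.T + t

# Same setup as get_affine_transform_matrix: solve [src, 1] @ X = [dst, 1] in the
# least-squares sense and read off a 2x3 matrix.
ones = np.ones((src.shape[0], 1), src.dtype)
X, _, _, _ = np.linalg.lstsq(np.hstack([src, ones]),
                             np.hstack([dst, ones]), rcond=None)
tfm = np.array([[X[0, 0], X[1, 0], X[2, 0]],
                [X[0, 1], X[1, 1], X[2, 1]]])

mapped = src @ tfm[:, :2].T + tfm[:, 2]
print(np.allclose(mapped, dst))   # True: the known transform is recovered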
cv2.equalizeHist(eye_img) + n_marks = landmarks - np.array([xmin, ymin]).reshape([1, 2]) + eye_mask = cv2.fillConvexPoly( + np.zeros_like(eye_img), n_marks.astype(np.int32), 1) + ret, thresh = cv2.threshold(eye_img, 100, 255, + cv2.THRESH_BINARY | cv2.THRESH_OTSU) + thresh = (1 - thresh / 255.) * eye_mask + cnt = 0 + xm = [] + ym = [] + for i in range(thresh.shape[0]): + for j in range(thresh.shape[1]): + if thresh[i, j] > 0.5: + xm.append(j) + ym.append(i) + cnt += 1 + if cnt != 0: + xm.sort() + ym.sort() + xm = xm[cnt // 2] + ym = ym[cnt // 2] + else: + xm = thresh.shape[1] / 2 + ym = thresh.shape[0] / 2 + + return xm + xmin, ym + ymin diff --git a/modelscope/models/cv/image_face_fusion/facelib/matlab_cp2tform.py b/modelscope/models/cv/image_face_fusion/facelib/matlab_cp2tform.py new file mode 100644 index 00000000..87b9fd1e --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/facelib/matlab_cp2tform.py @@ -0,0 +1,230 @@ +# The implementation is adopted from InsightFace_Pytorch, made publicly available under the MIT License +# at https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/mtcnn_pytorch/src/matlab_cp2tform.py + +import numpy as np +from numpy.linalg import inv, lstsq +from numpy.linalg import matrix_rank as rank +from numpy.linalg import norm + + +class MatlabCp2tormException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def tformfwd(trans, uv): + """ + Function: + ---------- + apply affine transform 'trans' to uv + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of transformed coordinates (x, y) + """ + uv = np.hstack((uv, np.ones((uv.shape[0], 1)))) + xy = np.dot(uv, trans) + xy = xy[:, 0:-1] + return xy + + +def tforminv(trans, uv): + """ + Function: + ---------- + apply the inverse of affine transform 'trans' to uv + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of inverse-transformed coordinates (x, y) + """ + Tinv = inv(trans) + xy = tformfwd(Tinv, uv) + return xy + + +def findNonreflectiveSimilarity(uv, xy, options=None): + + options = {'K': 2} + + K = options['K'] + M = xy.shape[0] + x = xy[:, 0].reshape((-1, 1)) + y = xy[:, 1].reshape((-1, 1)) + + tmp1 = np.hstack((x, y, np.ones((M, 1)), np.zeros((M, 1)))) + tmp2 = np.hstack((y, -x, np.zeros((M, 1)), np.ones((M, 1)))) + X = np.vstack((tmp1, tmp2)) + + u = uv[:, 0].reshape((-1, 1)) + v = uv[:, 1].reshape((-1, 1)) + U = np.vstack((u, v)) + + if rank(X) >= 2 * K: + r, _, _, _ = lstsq(X, U) + r = np.squeeze(r) + else: + raise Exception('cp2tform:twoUniquePointsReq') + + sc = r[0] + ss = r[1] + tx = r[2] + ty = r[3] + + Tinv = np.array([[sc, -ss, 0], [ss, sc, 0], [tx, ty, 1]]) + + T = inv(Tinv) + T[:, 2] = np.array([0, 0, 1]) + + return T, Tinv + + +def findSimilarity(uv, xy, options=None): + + options = {'K': 2} + + trans1, trans1_inv = findNonreflectiveSimilarity(uv, xy, options) + + xyR = xy + xyR[:, 0] = -1 * xyR[:, 0] + + trans2r, trans2r_inv = findNonreflectiveSimilarity(uv, xyR, options) + + TreflectY = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + trans2 = np.dot(trans2r, TreflectY) + + xy1 = tformfwd(trans1, uv) + norm1 = norm(xy1 - xy) + + xy2 = tformfwd(trans2, uv) + norm2 = norm(xy2 - xy) + + if norm1 <= norm2: + return trans1, trans1_inv + 
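The cp2tform port above solves for a similarity in the row-vector convention xy = [uv, 1] @ T. A round-trip check on synthetic points; the import path is the one added by this patch and the transform parameters are arbitrary.

import numpy as np

from modelscope.models.cv.image_face_fusion.facelib.matlab_cp2tform import (
    findNonreflectiveSimilarity, tformfwd)

# A known similarity: scale 0.8, rotation 30 degrees, translation (10, 4).
s, theta, tx, ty = 0.8, np.deg2rad(30.0), 10.0, 4.0
sc, ss = s * np.cos(theta), s * np.sin(theta)
T_true = np.array([[sc, ss, 0.0], [-ss, sc, 0.0], [tx, ty, 1.0]])

uv = np.array([[0., 0.], [96., 0.], [0., 112.], [96., 112.], [48., 56.]])
xy = tformfwd(T_true, uv)

T, T_inv = findNonreflectiveSimilarity(uv, xy)
print(np.allclose(T, T_true))                  # True: the transform is recovered
print(np.allclose(tformfwd(T_inv, xy), uv))    # True: T_inv maps xy back to uv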
else: + trans2_inv = inv(trans2) + return trans2, trans2_inv + + +def get_similarity_transform(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'trans': + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y, 1] = [u, v, 1] * trans + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + @reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + trans_inv: 3x3 np.array + inverse of trans, transform matrix from xy to uv + """ + + if reflective: + trans, trans_inv = findSimilarity(src_pts, dst_pts) + else: + trans, trans_inv = findNonreflectiveSimilarity(src_pts, dst_pts) + + return trans, trans_inv + + +def cvt_tform_mat_for_cv2(trans): + """ + Function: + ---------- + Convert Transform Matrix 'trans' into 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + cv2_trans = trans[:, 0:2].T + + return cv2_trans + + +def get_similarity_transform_for_cv2(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + trans, trans_inv = get_similarity_transform(src_pts, dst_pts, reflective) + cv2_trans = cvt_tform_mat_for_cv2(trans) + cv2_trans_inv = cvt_tform_mat_for_cv2(trans_inv) + + return cv2_trans, cv2_trans_inv diff --git a/modelscope/models/cv/image_face_fusion/image_face_fusion.py b/modelscope/models/cv/image_face_fusion/image_face_fusion.py new file mode 100644 index 00000000..24907ceb --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/image_face_fusion.py @@ -0,0 +1,253 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +from collections import OrderedDict +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL.Image as Image +import torch +import torch.nn.functional as F +import torchvision.transforms as transforms + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.face_detection.peppa_pig_face.facer import FaceAna +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .facegan.gan_wrap import GANWrap +from .facelib.align_trans import (get_f5p, get_reference_facial_points, + warp_and_crop_face) +from .network.aei_flow_net import AEI_Net +from .network.bfm import ParametricFaceModel +from .network.facerecon_model import ReconNetWrapper +from .network.model_irse import Backbone +from .network.ops import warp_affine_torch + +logger = get_logger() + +__all__ = ['ImageFaceFusion'] + + +@MODELS.register_module( + Tasks.image_face_fusion, module_name=Models.image_face_fusion) +class ImageFaceFusion(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the image face fusion model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + if torch.cuda.is_available(): + self.device = torch.device('cuda') + else: + self.device = torch.device('cpu') + + self.num_kp = 17 + self.id_dim = 512 + + self.netG = AEI_Net( + c_id=self.id_dim, num_kp=self.num_kp, device=self.device) + model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoints = torch.load(model_path, map_location='cpu') + model_state = self.convert_state_dict(checkpoints['state_dict']) + self.netG.load_state_dict(model_state) + self.netG = self.netG.to(self.device) + self.netG.eval() + + self.arcface = Backbone([112, 112], 100, 'ir') + arcface_path = os.path.join(model_dir, 'faceRecog', + 'CurricularFace_Backbone.pth') + self.arcface.load_state_dict( + torch.load(arcface_path, map_location='cpu'), strict=False) + self.arcface = self.arcface.to(self.device) + self.arcface.eval() + + self.f_3d = ReconNetWrapper(net_recon='resnet50', use_last_fc=False) + f_3d_path = os.path.join(model_dir, '3dRecon', 'face_3d.pth') + self.f_3d.load_state_dict( + torch.load(f_3d_path, map_location='cpu')['net_recon']) + self.f_3d = self.f_3d.to(self.device) + self.f_3d.eval() + + bfm_dir = os.path.join(model_dir, 'BFM') + self.face_model = ParametricFaceModel(bfm_folder=bfm_dir) + self.face_model.to(self.device) + + face_enhance_path = os.path.join(model_dir, 'faceEnhance', + '350000-Ns256.pt') + self.ganwrap = GANWrap( + model_path=face_enhance_path, + size=256, + channel_multiplier=1, + device=self.device) + + self.facer = FaceAna(model_dir) + + logger.info('load facefusion models done') + + self.mask_init = cv2.imread(os.path.join(model_dir, 'alpha.jpg')) + self.mask_init = cv2.resize(self.mask_init, (256, 256)) + self.mask = self.image_transform(self.mask_init, is_norm=False) + + self.test_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + ]) + + logger.info('init done') + + def convert_state_dict(self, state_dict): + if not next(iter(state_dict)).startswith('module.'): + return state_dict + new_state_dict = OrderedDict() + + split_index = 0 + for cur_key, cur_value in state_dict.items(): + if cur_key.startswith('module.model'): + split_index = 13 + elif cur_key.startswith('module'): + 
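For context, this model is normally consumed through the ModelScope pipeline API rather than instantiated directly. The sketch below is an assumption-heavy outline: the model id is a placeholder, and the input dict keys and output key should be checked against the published model card and the pipeline registered elsewhere in this PR.

import cv2

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<face-fusion-model-id>' is a placeholder, not a real model id.
face_fusion = pipeline(Tasks.image_face_fusion, model='<face-fusion-model-id>')
result = face_fusion(dict(template='data/test/images/facefusion_template.jpg',
                          user='data/test/images/facefusion_user.jpg'))
cv2.imwrite('fused.jpg', result[OutputKeys.OUTPUT_IMG])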
split_index = 7 + + break + + for k, v in state_dict.items(): + name = k[split_index:] + new_state_dict[name] = v + return new_state_dict + + def image_transform(self, + image, + is_norm=True, + mean=(0.5, 0.5, 0.5), + std=(0.5, 0.5, 0.5)): + image = image.astype(np.float32) + image = image / 255.0 + if is_norm: + image -= mean + image /= std + + image = image.transpose((2, 0, 1)) + image = np.expand_dims(image, axis=0) + image = torch.from_numpy(image) + image = image.to(self.device) + return image + + def extract_id(self, np_source, f5p): + Xs = warp_and_crop_face( + np_source, + f5p, + reference_pts=get_reference_facial_points(default_square=True), + crop_size=(256, 256)) + + Xs = Image.fromarray(Xs) + Xs = self.test_transform(Xs) + Xs = Xs.unsqueeze(0).to(self.device) + with torch.no_grad(): + embeds, Xs_feats = self.arcface( + F.interpolate( + Xs, (112, 112), mode='bilinear', align_corners=True)) + return embeds, Xs + + def detect_face(self, img): + src_h, src_w, _ = img.shape + boxes, landmarks, _ = self.facer.run(img) + if boxes.shape[0] == 0: + return None + elif boxes.shape[0] > 1: + max_area = 0 + max_index = 0 + for i in range(boxes.shape[0]): + bbox_width = boxes[i][2] - boxes[i][0] + bbox_height = boxes[i][3] - boxes[i][1] + area = int(bbox_width) * int(bbox_height) + if area > max_area: + max_index = i + max_area = area + return landmarks[max_index] + else: + return landmarks[0] + + def compute_3d_params(self, Xs, Xt): + kp_fuse = {} + kp_t = {} + + c_s = self.f_3d( + F.interpolate(Xs * 0.5 + 0.5, size=224, mode='bilinear')) + c_t = self.f_3d( + F.interpolate(Xt * 0.5 + 0.5, size=224, mode='bilinear')) + c_fuse = torch.cat(((c_s[:, :80] + c_t[:, :80]) / 2, c_t[:, 80:]), + dim=1) + _, _, _, q_fuse = self.face_model.compute_for_render(c_fuse) + q_fuse = q_fuse / 224 + q_fuse[..., 1] = 1 - q_fuse[..., 1] + q_fuse = q_fuse * 2 - 1 + delta = int((17 - self.num_kp) / 2) + + _, _, _, q_t = self.face_model.compute_for_render(c_t) + q_t = q_t / 224 + q_t[..., 1] = 1 - q_t[..., 1] + q_t = q_t * 2 - 1 + + kp_fuse['value'] = q_fuse[:, delta:17 - delta, :] + kp_t['value'] = q_t[:, delta:17 - delta, :] + + return kp_fuse, kp_t + + def inference(self, template_img, user_img): + ori_h, ori_w, _ = template_img.shape + + template_img = template_img.cpu().numpy() + user_img = user_img.cpu().numpy() + + user_img_bgr = user_img[:, :, ::-1] + landmark_source = self.detect_face(user_img) + if landmark_source is None: + logger.warning('No face detected in user image!') + return template_img + f5p_user = get_f5p(landmark_source, user_img_bgr) + + template_img_bgr = template_img[:, :, ::-1] + landmark_template = self.detect_face(template_img) + if landmark_template is None: + logger.warning('No face detected in template image!') + return template_img + f5p_template = get_f5p(landmark_template, template_img_bgr) + + Xs_embeds, Xs = self.extract_id(user_img, f5p_user) + Xt, trans_inv = warp_and_crop_face( + template_img, + f5p_template, + reference_pts=get_reference_facial_points(default_square=True), + crop_size=(256, 256), + return_trans_inv=True) + + trans_inv = trans_inv.astype(np.float32) + trans_inv = torch.from_numpy(trans_inv) + trans_inv = trans_inv.to(self.device) + Xt_raw = self.image_transform(template_img, is_norm=False) + Xt = self.image_transform(Xt) + + with torch.no_grad(): + kp_fuse, kp_t = self.compute_3d_params(Xs, Xt) + Yt, _, _ = self.netG(Xt, Xs_embeds, kp_fuse, kp_t) + Yt = self.ganwrap.process_tensor(Yt) + Yt = Yt * 0.5 + 0.5 + Yt = torch.clamp(Yt, 0, 1) + + Yt_trans_inv = 
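convert_state_dict above strips the 'module.' (or 'module.model.') prefix that nn.DataParallel adds to every key. A generic, minimal version of the same idea, runnable on CPU.

from collections import OrderedDict

import torch
import torch.nn as nn

# Checkpoints saved from nn.DataParallel carry a 'module.' prefix on every key;
# stripping it lets the weights load into the bare module.
net = nn.Linear(4, 2)
wrapped = nn.DataParallel(net)
saved = wrapped.state_dict()             # keys: 'module.weight', 'module.bias'

stripped = OrderedDict((k[len('module.'):], v) for k, v in saved.items())
net.load_state_dict(stripped)
print(list(stripped))                    # ['weight', 'bias']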
warp_affine_torch(Yt, trans_inv, (ori_h, ori_w)) + mask_ = warp_affine_torch(self.mask, trans_inv, (ori_h, ori_w)) + + Yt_trans_inv = mask_ * Yt_trans_inv + (1 - mask_) * Xt_raw + Yt_trans_inv = Yt_trans_inv.squeeze().permute(1, 2, + 0).cpu().numpy() + Yt_trans_inv = Yt_trans_inv.astype(np.float32) + out_img = Yt_trans_inv[:, :, ::-1] * 255. + + logger.info('model inference done') + + return out_img.astype(np.uint8) diff --git a/modelscope/models/cv/image_face_fusion/network/__init__.py b/modelscope/models/cv/image_face_fusion/network/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_face_fusion/network/aad_layer.py b/modelscope/models/cv/image_face_fusion/network/aad_layer.py new file mode 100644 index 00000000..adb68518 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/network/aad_layer.py @@ -0,0 +1,99 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn as nn + +from .ops import SpectralNorm + + +class AADLayer(nn.Module): + + def __init__(self, c_x, attr_c, c_id=256): + super(AADLayer, self).__init__() + self.attr_c = attr_c + self.c_id = c_id + self.c_x = c_x + + ks = 3 + pw = ks // 2 + + nhidden = 128 + + self.mlp_shared = nn.Sequential( + nn.ReflectionPad2d(pw), + nn.Conv2d(attr_c, nhidden, kernel_size=ks, padding=0), nn.ReLU()) + self.pad = nn.ReflectionPad2d(pw) + self.conv1 = nn.Conv2d( + nhidden, c_x, kernel_size=ks, stride=1, padding=0) + self.conv2 = nn.Conv2d( + nhidden, c_x, kernel_size=ks, stride=1, padding=0) + self.fc1 = nn.Linear(c_id, c_x) + self.fc2 = nn.Linear(c_id, c_x) + + self.norm = PositionalNorm2d + + self.pad_h = nn.ReflectionPad2d(pw) + self.conv_h = nn.Conv2d(c_x, 1, kernel_size=ks, stride=1, padding=0) + + def forward(self, h_in, z_attr, z_id): + + h = self.norm(h_in) + actv = self.mlp_shared(z_attr) + gamma_attr = self.conv1(self.pad(actv)) + beta_attr = self.conv2(self.pad(actv)) + + gamma_id = self.fc1(z_id) + beta_id = self.fc2(z_id) + A = gamma_attr * h + beta_attr + gamma_id = gamma_id.reshape(h.shape[0], self.c_x, 1, 1).expand_as(h) + beta_id = beta_id.reshape(h.shape[0], self.c_x, 1, 1).expand_as(h) + B = gamma_id * h + beta_id + + M = torch.sigmoid(self.conv_h(self.pad_h(h))) + + out = (torch.ones_like(M).to(M.device) - M) * A + M * B + + return out + + +def PositionalNorm2d(x, epsilon=1e-5): + mean = x.mean(dim=1, keepdim=True) + std = x.var(dim=1, keepdim=True).add(epsilon).sqrt() + output = (x - mean) / std + return output + + +class AAD_ResBlk(nn.Module): + + def __init__(self, cin, cout, c_attr, c_id=256): + super(AAD_ResBlk, self).__init__() + self.cin = cin + self.cout = cout + self.learned_shortcut = (self.cin != self.cout) + fmiddle = min(self.cin, self.cout) + + self.AAD1 = AADLayer(cin, c_attr, c_id) + self.AAD2 = AADLayer(fmiddle, c_attr, c_id) + self.pad = nn.ReflectionPad2d(1) + self.conv1 = SpectralNorm( + nn.Conv2d(cin, fmiddle, kernel_size=3, stride=1, padding=0)) + self.conv2 = SpectralNorm( + nn.Conv2d(fmiddle, cout, kernel_size=3, stride=1, padding=0)) + + self.relu1 = nn.LeakyReLU(2e-1) + self.relu2 = nn.LeakyReLU(2e-1) + + if self.learned_shortcut: + self.AAD3 = AADLayer(cin, c_attr, c_id) + self.conv3 = SpectralNorm( + nn.Conv2d(cin, cout, kernel_size=1, bias=False)) + + def forward(self, h, z_attr, z_id): + x = self.conv1(self.pad(self.relu1(self.AAD1(h, z_attr, z_id)))) + x = self.conv2(self.pad(self.relu2(self.AAD2(x, z_attr, z_id)))) + + if self.learned_shortcut: + h = self.conv3(self.AAD3(h, z_attr, z_id)) + + x = x + h + + return x 
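PositionalNorm2d above whitens each spatial position across channels before the attribute and identity styles are injected in the AAD layer. A quick numerical check that the statistic behaves as intended, with toy tensor sizes.

import torch

def positional_norm_2d(x, epsilon=1e-5):
    # Same statistic as PositionalNorm2d: mean and std over the channel axis,
    # computed independently for every (batch, h, w) position.
    mean = x.mean(dim=1, keepdim=True)
    std = x.var(dim=1, keepdim=True).add(epsilon).sqrt()
    return (x - mean) / std

x = torch.randn(2, 64, 8, 8)
y = positional_norm_2d(x)
print(y.mean(dim=1).abs().max() < 1e-5)                        # per-position mean ~ 0
print((y.std(dim=1, unbiased=True) - 1).abs().max() < 1e-2)    # per-position std ~ 1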
diff --git a/modelscope/models/cv/image_face_fusion/network/aei_flow_net.py b/modelscope/models/cv/image_face_fusion/network/aei_flow_net.py new file mode 100644 index 00000000..a2047671 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/network/aei_flow_net.py @@ -0,0 +1,251 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .aad_layer import AAD_ResBlk +from .dense_motion import DenseMotionNetwork +from .ops import SpectralNorm, init_func + + +class Conv4x4(nn.Module): + + def __init__(self, in_c, out_c): + super(Conv4x4, self).__init__() + self.conv = nn.Conv2d( + in_channels=in_c, + out_channels=out_c, + kernel_size=4, + stride=2, + padding=1, + bias=False) + self.norm = nn.BatchNorm2d(out_c) + self.lrelu = nn.LeakyReLU(0.1) + + def forward(self, feat): + x = self.conv(feat) + x = self.norm(x) + x = self.lrelu(x) + return x + + +class DeConv4x4(nn.Module): + + def __init__(self, in_c, out_c): + super(DeConv4x4, self).__init__() + self.deconv = nn.ConvTranspose2d( + in_channels=in_c, + out_channels=out_c, + kernel_size=4, + stride=2, + padding=1, + bias=False) + self.bn = nn.BatchNorm2d(out_c) + self.lrelu = nn.LeakyReLU(0.1) + + def forward(self, input, skip): + x = self.deconv(input) + x = self.bn(x) + x = self.lrelu(x) + return torch.cat((x, skip), dim=1) + + +class Attention(nn.Module): + + def __init__(self, ch, use_sn=True): + super(Attention, self).__init__() + self.ch = ch + self.theta = nn.Conv2d( + self.ch, self.ch // 8, kernel_size=1, padding=0, bias=False) + self.phi = nn.Conv2d( + self.ch, self.ch // 8, kernel_size=1, padding=0, bias=False) + self.g = nn.Conv2d( + self.ch, self.ch // 2, kernel_size=1, padding=0, bias=False) + self.o = nn.Conv2d( + self.ch // 2, self.ch, kernel_size=1, padding=0, bias=False) + if use_sn: + self.theta = SpectralNorm(self.theta) + self.phi = SpectralNorm(self.phi) + self.g = SpectralNorm(self.g) + self.o = SpectralNorm(self.o) + self.gamma = nn.Parameter(torch.tensor(0.), requires_grad=True) + + def forward(self, x, y=None): + theta = self.theta(x) + phi = F.max_pool2d(self.phi(x), [2, 2]) + g = F.max_pool2d(self.g(x), [2, 2]) + theta = theta.view(-1, self.ch // 8, x.shape[2] * x.shape[3]) + phi = phi.view(-1, self.ch // 8, x.shape[2] * x.shape[3] // 4) + g = g.view(-1, self.ch // 2, x.shape[2] * x.shape[3] // 4) + beta = F.softmax(torch.bmm(theta.transpose(1, 2), phi), -1) + o = self.o( + torch.bmm(g, beta.transpose(1, 2)).view(-1, self.ch // 2, + x.shape[2], x.shape[3])) + return self.gamma * o + x + + +class MLAttrEncoder(nn.Module): + + def __init__(self): + super(MLAttrEncoder, self).__init__() + self.conv1 = Conv4x4(3, 32) + self.conv2 = Conv4x4(32, 64) + self.conv3 = Conv4x4(64, 128) + self.conv4 = Conv4x4(128, 256) + self.conv5 = Conv4x4(256, 512) + self.conv6 = Conv4x4(512, 1024) + self.conv7 = Conv4x4(1024, 1024) + + self.deconv1 = DeConv4x4(1024, 1024) + self.deconv2 = DeConv4x4(2048, 512) + self.deconv3 = DeConv4x4(1024, 256) + self.deconv4 = DeConv4x4(512, 128) + self.deconv5 = DeConv4x4(256, 64) + self.deconv6 = DeConv4x4(128, 32) + + self.apply(init_func) + + def forward(self, Xt): + feat1 = self.conv1(Xt) + feat2 = self.conv2(feat1) + feat3 = self.conv3(feat2) + feat4 = self.conv4(feat3) + feat5 = self.conv5(feat4) + feat6 = self.conv6(feat5) + z_attr1 = self.conv7(feat6) + + z_attr2 = self.deconv1(z_attr1, feat6) + z_attr3 = self.deconv2(z_attr2, feat5) + z_attr4 = self.deconv3(z_attr3, feat4) + z_attr5 = self.deconv4(z_attr4, feat3) + 
z_attr6 = self.deconv5(z_attr5, feat2) + z_attr7 = self.deconv6(z_attr6, feat1) + z_attr8 = F.interpolate( + z_attr7, scale_factor=2, mode='bilinear', align_corners=True) + + return z_attr1, z_attr2, z_attr3, z_attr4, z_attr5, z_attr6, z_attr7, z_attr8 + + +class AADGenerator(nn.Module): + + def __init__(self, c_id=256): + super(AADGenerator, self).__init__() + self.up1 = nn.ConvTranspose2d( + c_id, 1024, kernel_size=2, stride=1, padding=0) + self.AADBlk1 = AAD_ResBlk(1024, 1024, 1024, c_id) + self.AADBlk2 = AAD_ResBlk(1024, 1024, 2048, c_id) + self.AADBlk3 = AAD_ResBlk(1024, 1024, 1024, c_id) + self.AADBlk4 = AAD_ResBlk(1024, 512, 512, c_id) + self.AADBlk5 = AAD_ResBlk(512, 256, 256, c_id) + self.AADBlk6 = AAD_ResBlk(256, 128, 128, c_id) + self.AADBlk7 = AAD_ResBlk(128, 64, 64, c_id) + self.AADBlk8 = AAD_ResBlk(64, 3, 64, c_id) + + self.sa = Attention(512, use_sn=True) + + self.apply(init_func) + + def forward(self, z_attr, z_id, deformation): + + m = self.up1(z_id.reshape(z_id.shape[0], -1, 1, 1)) + m2 = F.interpolate( + self.AADBlk1(m, z_attr[0], z_id), + scale_factor=2, + mode='bilinear', + align_corners=True) + m3 = F.interpolate( + self.AADBlk2(m2, z_attr[1], z_id), + scale_factor=2, + mode='bilinear', + align_corners=True) + m4 = F.interpolate( + self.AADBlk3(m3, z_attr[2], z_id), + scale_factor=2, + mode='bilinear', + align_corners=True) + m5 = F.interpolate( + self.AADBlk4(m4, z_attr[3], z_id), + scale_factor=2, + mode='bilinear', + align_corners=True) + m5 = self.sa(m5) + m6 = F.interpolate( + self.AADBlk5(m5, z_attr[4], z_id), + scale_factor=2, + mode='bilinear', + align_corners=True) + m7 = F.interpolate( + self.AADBlk6(m6, z_attr[5], z_id), + scale_factor=2, + mode='bilinear', + align_corners=True) + m8 = F.interpolate( + self.AADBlk7(m7, z_attr[6], z_id), + scale_factor=2, + mode='bilinear', + align_corners=True) + + y = self.AADBlk8(m8, z_attr[7], z_id) + + return torch.tanh(y) + + def deform_input(self, inp, deformation): + _, h_old, w_old, _ = deformation.shape + _, _, h, w = inp.shape + if h_old != h or w_old != w: + deformation = deformation.permute(0, 3, 1, 2) + deformation = F.interpolate( + deformation, size=(h, w), mode='bilinear') + deformation = deformation.permute(0, 2, 3, 1) + return F.grid_sample(inp, deformation), deformation + + +class AEI_Net(nn.Module): + + def __init__(self, c_id=256, num_kp=17, device=torch.device('cuda')): + super(AEI_Net, self).__init__() + self.device = device + self.encoder = MLAttrEncoder() + self.generator = AADGenerator(c_id) + self.dense_motion_network = DenseMotionNetwork( + num_kp=num_kp, num_channels=3, estimate_occlusion_map=False) + + def deform_input(self, inp, deformation): + _, h_old, w_old, _ = deformation.shape + _, _, h, w = inp.shape + if h_old != h or w_old != w: + deformation = deformation.permute(0, 3, 1, 2) + deformation = F.interpolate( + deformation, size=(h, w), mode='bilinear') + deformation = deformation.permute(0, 2, 3, 1) + return F.grid_sample(inp, deformation), deformation + + def flow_change(self, x, flow): + n, c, h, w = x.size() + yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)]) + xv = xv.float() / (w - 1) * 2.0 - 1 + yv = yv.float() / (h - 1) * 2.0 - 1 + grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), + -1).unsqueeze(0).to(self.device) + flow_delta = flow - grid + return flow_delta + + def forward(self, Xt, z_id, kp_fuse, kp_t): + output_flow = {} + dense_motion = self.dense_motion_network( + source_image=Xt, kp_driving=kp_fuse, kp_source=kp_t) + deformation = 
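deform_input above warps a feature map with a dense flow through F.grid_sample, using the normalized [-1, 1] grid convention with shape (N, H, W, 2). A sanity check, with illustrative sizes, that an identity grid leaves the input untouched.

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 64, 64)

# Identity sampling grid in the convention grid_sample expects: (N, H, W, 2),
# last dim = (x, y) in [-1, 1].
theta = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]])
grid = F.affine_grid(theta, size=(1, 3, 64, 64), align_corners=False)

warped = F.grid_sample(x, grid, align_corners=False)
print(torch.allclose(warped, x, atol=1e-5))   # True: an identity flow is a no-op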
dense_motion['deformation'] + + with torch.no_grad(): + Xt_warp, _ = self.deform_input(Xt, deformation) + attr = self.encoder(Xt_warp) + + Y = self.generator(attr, z_id, deformation) + + output_flow['deformed'], flow = self.deform_input(Xt, deformation) + output_flow['flow'] = self.flow_change(Xt, flow) + + return Y, attr, output_flow + + def get_attr(self, X): + return self.encoder(X) diff --git a/modelscope/models/cv/image_face_fusion/network/bfm.py b/modelscope/models/cv/image_face_fusion/network/bfm.py new file mode 100644 index 00000000..fe6a42e5 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/network/bfm.py @@ -0,0 +1,249 @@ +# The implementation is adopted from Deep3DFaceRecon_pytorch, made publicly available under the MIT License +# at https://github.com/sicxu/Deep3DFaceRecon_pytorch/blob/master/models/bfm.py +import os + +import numpy as np +import torch +import torch.nn.functional as F +from scipy.io import loadmat + + +def perspective_projection(focal, center): + return np.array([focal, 0, center, 0, focal, center, 0, 0, + 1]).reshape([3, 3]).astype(np.float32).transpose() + + +class SH: + + def __init__(self): + self.a = [np.pi, 2 * np.pi / np.sqrt(3.), 2 * np.pi / np.sqrt(8.)] + self.c = [ + 1 / np.sqrt(4 * np.pi), + np.sqrt(3.) / np.sqrt(4 * np.pi), + 3 * np.sqrt(5.) / np.sqrt(12 * np.pi) + ] + + +class ParametricFaceModel(): + + def __init__(self, + bfm_folder='./BFM', + recenter=True, + camera_distance=10., + init_lit=np.array([0.8, 0, 0, 0, 0, 0, 0, 0, 0]), + focal=1015., + center=112., + is_train=True, + default_name='BFM_model_front.mat'): + + model = loadmat(os.path.join(bfm_folder, default_name)) + # mean face shape. [3*N,1] + self.mean_shape = model['meanshape'].astype(np.float32) + # identity basis. [3*N,80] + self.id_base = model['idBase'].astype(np.float32) + # expression basis. [3*N,64] + self.exp_base = model['exBase'].astype(np.float32) + # mean face texture. [3*N,1] (0-255) + self.mean_tex = model['meantex'].astype(np.float32) + # texture basis. [3*N,80] + self.tex_base = model['texBase'].astype(np.float32) + # face indices for each vertex that lies in. starts from 0. [N,8] + self.point_buf = model['point_buf'].astype(np.int64) - 1 + # vertex indices for each face. starts from 0. [F,3] + self.face_buf = model['tri'].astype(np.int64) - 1 + # vertex indices for 68 landmarks. starts from 0. [68,1] + self.keypoints = np.squeeze(model['keypoints']).astype(np.int64) - 1 + + if is_train: + # vertex indices for small face region to compute photometric error. starts from 0. + self.front_mask = np.squeeze(model['frontmask2_idx']).astype( + np.int64) - 1 + # vertex indices for each face from small face region. starts from 0. 
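perspective_projection above packs the pinhole intrinsics (focal 1015, principal point 112) into a matrix used with row vectors, so projecting camera-space vertices is a matmul followed by a divide by depth. A tiny worked example with hand-picked points.

import numpy as np

focal, center = 1015.0, 112.0
P = np.array([focal, 0, center, 0, focal, center, 0, 0, 1],
             dtype=np.float32).reshape(3, 3).T

pts = np.array([[0.0, 0.0, 10.0],      # on the optical axis, depth 10
                [1.0, -1.0, 10.0]], dtype=np.float32)
proj = pts @ P
proj = proj[:, :2] / proj[:, 2:]       # divide by depth, as the face model does
print(proj)                            # [[112.  112. ] [213.5  10.5]]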
[f,3] + self.front_face_buf = model['tri_mask2'].astype(np.int64) - 1 + # vertex indices for pre-defined skin region to compute reflectance loss + self.skin_mask = np.squeeze(model['skinmask']) + + if recenter: + mean_shape = self.mean_shape.reshape([-1, 3]) + mean_shape = mean_shape - np.mean( + mean_shape, axis=0, keepdims=True) + self.mean_shape = mean_shape.reshape([-1, 1]) + + self.persc_proj = perspective_projection(focal, center) + self.device = 'cpu' + self.camera_distance = camera_distance + self.SH = SH() + self.init_lit = init_lit.reshape([1, 1, -1]).astype(np.float32) + + def to(self, device): + self.device = device + for key, value in self.__dict__.items(): + if type(value).__module__ == np.__name__: + setattr(self, key, torch.tensor(value).to(device)) + + def compute_shape(self, id_coeff, exp_coeff): + """ + Return: + face_shape -- torch.tensor, size (B, N, 3) + + Parameters: + id_coeff -- torch.tensor, size (B, 80), identity coeffs + exp_coeff -- torch.tensor, size (B, 64), expression coeffs + """ + batch_size = id_coeff.shape[0] + id_part = torch.einsum('ij,aj->ai', self.id_base, id_coeff) + exp_part = torch.einsum('ij,aj->ai', self.exp_base, exp_coeff) + face_shape = id_part + exp_part + self.mean_shape.reshape([1, -1]) + return face_shape.reshape([batch_size, -1, 3]) + + def compute_texture(self, tex_coeff, normalize=True): + """ + Return: + face_texture -- torch.tensor, size (B, N, 3), in RGB order, range (0, 1.) + + Parameters: + tex_coeff -- torch.tensor, size (B, 80) + """ + batch_size = tex_coeff.shape[0] + face_texture = torch.einsum('ij,aj->ai', self.tex_base, + tex_coeff) + self.mean_tex + if normalize: + face_texture = face_texture / 255. + return face_texture.reshape([batch_size, -1, 3]) + + def compute_norm(self, face_shape): + """ + Return: + vertex_norm -- torch.tensor, size (B, N, 3) + + Parameters: + face_shape -- torch.tensor, size (B, N, 3) + """ + + v1 = face_shape[:, self.face_buf[:, 0]] + v2 = face_shape[:, self.face_buf[:, 1]] + v3 = face_shape[:, self.face_buf[:, 2]] + e1 = v1 - v2 + e2 = v2 - v3 + face_norm = torch.cross(e1, e2, dim=-1) + face_norm = F.normalize(face_norm, dim=-1, p=2) + face_norm = torch.cat( + [face_norm, + torch.zeros(face_norm.shape[0], 1, 3).to(self.device)], + dim=1) + + vertex_norm = torch.sum(face_norm[:, self.point_buf], dim=2) + vertex_norm = F.normalize(vertex_norm, dim=-1, p=2) + return vertex_norm + + def compute_color(self, face_texture, face_norm, gamma): + batch_size = gamma.shape[0] + a, c = self.SH.a, self.SH.c + gamma = gamma.reshape([batch_size, 3, 9]) + gamma = gamma + self.init_lit + gamma = gamma.permute(0, 2, 1) + face_norm_p1 = face_norm[..., :1] + face_norm_p2 = face_norm[..., 1:2] + face_norm_p3 = face_norm[..., 2:] + face_norm_diff = face_norm_p1**2 - face_norm_p2**2 + temp = [ + a[0] * c[0] * torch.ones_like(face_norm_p1).to(self.device), + -a[1] * c[1] * face_norm_p2, a[1] * c[1] * face_norm_p3, + -a[1] * c[1] * face_norm_p1, + a[2] * c[2] * face_norm_p1 * face_norm_p2, + -a[2] * c[2] * face_norm_p2 * face_norm_p3, + 0.5 * a[2] * c[2] / np.sqrt(3.) 
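compute_shape above evaluates the linear 3DMM: mean shape plus identity and expression bases weighted by the regressed coefficients, with einsum('ij,aj->ai') doing a batched matrix-vector product. A shape-level check with random toy bases; the real bases come from BFM_model_front.mat.

import torch

# Toy dimensions: N = 5 vertices, 80 identity and 64 expression coefficients, batch 2.
N, n_id, n_exp, B = 5, 80, 64, 2
id_base = torch.randn(3 * N, n_id)
exp_base = torch.randn(3 * N, n_exp)
mean_shape = torch.randn(3 * N, 1)

id_coeff = torch.randn(B, n_id)
exp_coeff = torch.randn(B, n_exp)

face_shape = (torch.einsum('ij,aj->ai', id_base, id_coeff)
              + torch.einsum('ij,aj->ai', exp_base, exp_coeff)
              + mean_shape.reshape(1, -1)).reshape(B, -1, 3)

# einsum('ij,aj->ai', base, coeff) is just coeff @ base.T:
ref = (id_coeff @ id_base.T + exp_coeff @ exp_base.T
       + mean_shape.reshape(1, -1)).reshape(B, -1, 3)
print(face_shape.shape, torch.allclose(face_shape, ref, atol=1e-5))  # (2, 5, 3) True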
* (3 * face_norm_p3**2 - 1), + -a[2] * c[2] * face_norm_p1 * face_norm_p3, + 0.5 * a[2] * c[2] * face_norm_diff + ] + + Y = torch.cat(temp, dim=-1) + r = Y @ gamma[..., :1] + g = Y @ gamma[..., 1:2] + b = Y @ gamma[..., 2:] + face_color = torch.cat([r, g, b], dim=-1) * face_texture + return face_color + + def compute_rotation(self, angles): + batch_size = angles.shape[0] + ones = torch.ones([batch_size, 1]).to(self.device) + zeros = torch.zeros([batch_size, 1]).to(self.device) + x, y, z = angles[:, :1], angles[:, 1:2], angles[:, 2:], + + temp_x = [ + ones, zeros, zeros, zeros, + torch.cos(x), -torch.sin(x), zeros, + torch.sin(x), + torch.cos(x) + ] + rot_x = torch.cat(temp_x, dim=1).reshape([batch_size, 3, 3]) + + temp_y = [ + torch.cos(y), zeros, + torch.sin(y), zeros, ones, zeros, -torch.sin(y), zeros, + torch.cos(y) + ] + rot_y = torch.cat(temp_y, dim=1).reshape([batch_size, 3, 3]) + + temp_z = [ + torch.cos(z), -torch.sin(z), zeros, + torch.sin(z), + torch.cos(z), zeros, zeros, zeros, ones + ] + rot_z = torch.cat(temp_z, dim=1).reshape([batch_size, 3, 3]) + + rot = rot_z @ rot_y @ rot_x + return rot.permute(0, 2, 1) + + def to_camera(self, face_shape): + face_shape[..., -1] = self.camera_distance - face_shape[..., -1] + return face_shape + + def to_image(self, face_shape): + # to image_plane + face_proj = face_shape @ self.persc_proj + face_proj = face_proj[..., :2] / face_proj[..., 2:] + + return face_proj + + def transform(self, face_shape, rot, trans): + return face_shape @ rot + trans.unsqueeze(1) + + def get_landmarks(self, face_proj): + return face_proj[:, self.keypoints] + + def split_coeff(self, coeffs): + id_coeffs = coeffs[:, :80] + exp_coeffs = coeffs[:, 80:144] + tex_coeffs = coeffs[:, 144:224] + angles = coeffs[:, 224:227] + gammas = coeffs[:, 227:254] + translations = coeffs[:, 254:] + return { + 'id': id_coeffs, + 'exp': exp_coeffs, + 'tex': tex_coeffs, + 'angle': angles, + 'gamma': gammas, + 'trans': translations + } + + def compute_for_render(self, coeffs): + coef_dict = self.split_coeff(coeffs) + face_shape = self.compute_shape(coef_dict['id'], coef_dict['exp']) + rotation = self.compute_rotation(coef_dict['angle']) + + face_shape_transformed = self.transform(face_shape, rotation, + coef_dict['trans']) + face_vertex = self.to_camera(face_shape_transformed) + + face_proj = self.to_image(face_vertex) + landmark = self.get_landmarks(face_proj) + + face_texture = self.compute_texture(coef_dict['tex']) + face_norm = self.compute_norm(face_shape) + face_norm_roted = face_norm @ rotation + face_color = self.compute_color(face_texture, face_norm_roted, + coef_dict['gamma']) + + return face_vertex, face_texture, face_color, landmark diff --git a/modelscope/models/cv/image_face_fusion/network/dense_motion.py b/modelscope/models/cv/image_face_fusion/network/dense_motion.py new file mode 100644 index 00000000..ba5cc2db --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/network/dense_motion.py @@ -0,0 +1,376 @@ +# The implementation is adopted from first-order-model, made publicly available under the MIT License +# at https://github.com/AliaksandrSiarohin/first-order-model/blob/master/modules/dense_motion.py + +import torch +import torch.nn.functional as F +from torch import nn + + +def kp2gaussian(kp, spatial_size, kp_variance): + """ + Transform a keypoint into gaussian like representation + """ + mean = kp['value'] + + coordinate_grid = make_coordinate_grid(spatial_size, mean.type()) + number_of_leading_dimensions = len(mean.shape) - 1 + shape = (1, ) * 
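split_coeff above relies on a fixed layout of the 257-dim code produced by the 3D reconstruction network (the fc_dim of ReconNetWrapper later in this patch). The bookkeeping, spelled out with a random stand-in tensor.

import torch

# 80 identity + 64 expression + 80 texture + 3 angles + 27 SH gamma + 3 translation = 257.
coeffs = torch.randn(1, 257)
layout = dict(id=(0, 80), exp=(80, 144), tex=(144, 224),
              angle=(224, 227), gamma=(227, 254), trans=(254, 257))
parts = {k: coeffs[:, a:b] for k, (a, b) in layout.items()}

print({k: v.shape[1] for k, v in parts.items()})
# {'id': 80, 'exp': 64, 'tex': 80, 'angle': 3, 'gamma': 27, 'trans': 3}
print(sum(v.shape[1] for v in parts.values()))   # 257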
number_of_leading_dimensions + coordinate_grid.shape + coordinate_grid = coordinate_grid.view(*shape) + repeats = mean.shape[:number_of_leading_dimensions] + (1, 1, 1) + coordinate_grid = coordinate_grid.repeat(*repeats) + + # Preprocess kp shape + shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 2) + mean = mean.view(*shape) + + mean_sub = (coordinate_grid - mean) + + out = torch.exp(-0.5 * (mean_sub**2).sum(-1) / kp_variance) + + return out + + +def make_coordinate_grid(spatial_size, type): + """ + Create a meshgrid [-1,1] x [-1,1] of given spatial_size. + """ + h, w = spatial_size + x = torch.arange(w).type(type) + y = torch.arange(h).type(type) + + x = (2 * (x / (w - 1)) - 1) + y = (2 * (y / (h - 1)) - 1) + + yy = y.view(-1, 1).repeat(1, w) + xx = x.view(1, -1).repeat(h, 1) + + meshed = torch.cat([xx.unsqueeze_(2), yy.unsqueeze_(2)], 2) + + return meshed + + +class UpBlock2d(nn.Module): + """ + Upsampling block for use in decoder. + """ + + def __init__(self, + in_features, + out_features, + kernel_size=3, + padding=1, + groups=1): + super(UpBlock2d, self).__init__() + + self.conv = nn.Conv2d( + in_channels=in_features, + out_channels=out_features, + kernel_size=kernel_size, + padding=padding, + groups=groups) + self.norm = nn.BatchNorm2d(out_features, affine=True) + + def forward(self, x): + out = F.interpolate(x, scale_factor=2) + out = self.conv(out) + out = self.norm(out) + out = F.relu(out) + return out + + +class DownBlock2d(nn.Module): + """ + Downsampling block for use in encoder. + """ + + def __init__(self, + in_features, + out_features, + kernel_size=3, + padding=1, + groups=1): + super(DownBlock2d, self).__init__() + self.conv = nn.Conv2d( + in_channels=in_features, + out_channels=out_features, + kernel_size=kernel_size, + padding=padding, + groups=groups) + self.norm = nn.BatchNorm2d(out_features, affine=True) + self.pool = nn.AvgPool2d(kernel_size=(2, 2)) + + def forward(self, x): + out = self.conv(x) + out = self.norm(out) + out = F.relu(out) + out = self.pool(out) + return out + + +class Encoder(nn.Module): + """ + Hourglass Encoder + """ + + def __init__(self, + block_expansion, + in_features, + num_blocks=3, + max_features=256): + super(Encoder, self).__init__() + + down_blocks = [] + for i in range(num_blocks): + down_blocks.append( + DownBlock2d( + in_features if i == 0 else min(max_features, + block_expansion * (2**i)), + min(max_features, block_expansion * (2**(i + 1))), + kernel_size=3, + padding=1)) + self.down_blocks = nn.ModuleList(down_blocks) + + def forward(self, x): + outs = [x] + for down_block in self.down_blocks: + outs.append(down_block(outs[-1])) + return outs + + +class Decoder(nn.Module): + """ + Hourglass Decoder + """ + + def __init__(self, + block_expansion, + in_features, + num_blocks=3, + max_features=256): + super(Decoder, self).__init__() + + up_blocks = [] + + for i in range(num_blocks)[::-1]: + in_filters = (1 if i == num_blocks - 1 else 2) * min( + max_features, block_expansion * (2**(i + 1))) + out_filters = min(max_features, block_expansion * (2**i)) + up_blocks.append( + UpBlock2d(in_filters, out_filters, kernel_size=3, padding=1)) + + self.up_blocks = nn.ModuleList(up_blocks) + self.out_filters = block_expansion + in_features + + def forward(self, x): + out = x.pop() + for up_block in self.up_blocks: + out = up_block(out) + skip = x.pop() + out = torch.cat([out, skip], dim=1) + return out + + +class Hourglass(nn.Module): + """ + Hourglass architecture. 
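make_coordinate_grid and kp2gaussian above turn a keypoint given in normalized [-1, 1] coordinates into a Gaussian heatmap on that grid. A standalone reconstruction of the same convention, checking that the heatmap peaks at the keypoint; grid size and keypoint are arbitrary.

import torch

def coord_grid(h, w):
    # Same [-1, 1] meshgrid convention as make_coordinate_grid: last dim = (x, y).
    x = 2 * torch.arange(w, dtype=torch.float32) / (w - 1) - 1
    y = 2 * torch.arange(h, dtype=torch.float32) / (h - 1) - 1
    return torch.stack([x.view(1, -1).expand(h, w),
                        y.view(-1, 1).expand(h, w)], dim=-1)    # (h, w, 2)

h = w = 64
grid = coord_grid(h, w)
kp = torch.tensor([0.5, -0.25])                   # normalized keypoint (x, y)

heat = torch.exp(-0.5 * ((grid - kp) ** 2).sum(-1) / 0.01)      # kp_variance = 0.01
iy, ix = divmod(int(heat.argmax()), w)
print(ix / (w - 1) * 2 - 1, iy / (h - 1) * 2 - 1)  # ~(0.49, -0.24): peak sits on the keypoint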
+ """ + + def __init__(self, + block_expansion, + in_features, + num_blocks=3, + max_features=256): + super(Hourglass, self).__init__() + self.encoder = Encoder(block_expansion, in_features, num_blocks, + max_features) + self.decoder = Decoder(block_expansion, in_features, num_blocks, + max_features) + self.out_filters = self.decoder.out_filters + + def forward(self, x): + return self.decoder(self.encoder(x)) + + +class AntiAliasInterpolation2d(nn.Module): + """ + Band-limited downsampling, for better preservation of the input signal. + """ + + def __init__(self, channels, scale): + super(AntiAliasInterpolation2d, self).__init__() + sigma = (1 / scale - 1) / 2 + kernel_size = 2 * round(sigma * 4) + 1 + self.ka = kernel_size // 2 + self.kb = self.ka - 1 if kernel_size % 2 == 0 else self.ka + + kernel_size = [kernel_size, kernel_size] + sigma = [sigma, sigma] + # The gaussian kernel is the product of the + # gaussian function of each dimension. + kernel = 1 + meshgrids = torch.meshgrid( + [torch.arange(size, dtype=torch.float32) for size in kernel_size]) + for size, std, mgrid in zip(kernel_size, sigma, meshgrids): + mean = (size - 1) / 2 + kernel *= torch.exp(-(mgrid - mean)**2 / (2 * std**2)) + + # Make sure sum of values in gaussian kernel equals 1. + kernel = kernel / torch.sum(kernel) + # Reshape to depthwise convolutional weight + kernel = kernel.view(1, 1, *kernel.size()) + kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1)) + + self.register_buffer('weight', kernel) + self.groups = channels + self.scale = scale + inv_scale = 1 / scale + self.int_inv_scale = int(inv_scale) + + def forward(self, input): + if self.scale == 1.0: + return input + + out = F.pad(input, (self.ka, self.kb, self.ka, self.kb)) + out = F.conv2d(out, weight=self.weight, groups=self.groups) + out = out[:, :, ::self.int_inv_scale, ::self.int_inv_scale] + + return out + + +class DenseMotionNetwork(nn.Module): + """ + Module that predicting a dense motion from sparse motion representation given by kp_source and kp_driving + """ + + def __init__(self, + num_kp, + num_channels, + estimate_occlusion_map=False, + kp_variance=0.01): + super(DenseMotionNetwork, self).__init__() + block_expansion = 64 + num_blocks = 5 + max_features = 1024 + scale_factor = 0.25 + + self.hourglass = Hourglass( + block_expansion=block_expansion, + in_features=(num_kp + 1) * (num_channels + 1), + max_features=max_features, + num_blocks=num_blocks) + + self.mask = nn.Conv2d( + self.hourglass.out_filters, + num_kp + 1, + kernel_size=(7, 7), + padding=(3, 3)) + + if estimate_occlusion_map: + self.occlusion = nn.Conv2d( + self.hourglass.out_filters, + 1, + kernel_size=(7, 7), + padding=(3, 3)) + else: + self.occlusion = None + + self.num_kp = num_kp + self.scale_factor = scale_factor + self.kp_variance = kp_variance + + if self.scale_factor != 1: + self.down = AntiAliasInterpolation2d(num_channels, + self.scale_factor) + + def create_heatmap_representations(self, source_image, kp_driving, + kp_source): + """ + Eq 6. 
in the paper H_k(z) + """ + spatial_size = source_image.shape[2:] + gaussian_driving = kp2gaussian( + kp_driving, + spatial_size=spatial_size, + kp_variance=self.kp_variance) + gaussian_source = kp2gaussian( + kp_source, spatial_size=spatial_size, kp_variance=self.kp_variance) + heatmap = gaussian_driving - gaussian_source + + zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], + spatial_size[1]).type(heatmap.type()) + heatmap = torch.cat([zeros, heatmap], dim=1) + heatmap = heatmap.unsqueeze(2) + return heatmap + + def create_sparse_motions(self, source_image, kp_driving, kp_source): + """ + Eq 4. in the paper T_{s<-d}(z) + """ + bs, _, h, w = source_image.shape + identity_grid = make_coordinate_grid((h, w), + type=kp_source['value'].type()) + identity_grid = identity_grid.view(1, 1, h, w, 2) + coordinate_grid = identity_grid - kp_driving['value'].view( + bs, self.num_kp, 1, 1, 2) + if 'jacobian' in kp_driving: + jacobian = torch.matmul(kp_source['jacobian'], + torch.inverse(kp_driving['jacobian'])) + jacobian = jacobian.unsqueeze(-3).unsqueeze(-3) + jacobian = jacobian.repeat(1, 1, h, w, 1, 1) + coordinate_grid = torch.matmul(jacobian, + coordinate_grid.unsqueeze(-1)) + coordinate_grid = coordinate_grid.squeeze(-1) + + driving_to_source = coordinate_grid + kp_source['value'].view( + bs, self.num_kp, 1, 1, 2) + + identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1) + sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1) + return sparse_motions + + def create_deformed_source_image(self, source_image, sparse_motions): + bs, _, h, w = source_image.shape + source_repeat = source_image.unsqueeze(1).unsqueeze(1).repeat( + 1, self.num_kp + 1, 1, 1, 1, 1) + temp_dim = bs * (self.num_kp + 1) + source_repeat = source_repeat.view(temp_dim, -1, h, w) + sparse_motions = sparse_motions.view((temp_dim, h, w, -1)) + sparse_deformed = F.grid_sample(source_repeat, sparse_motions) + sparse_deformed = sparse_deformed.view((bs, self.num_kp + 1, -1, h, w)) + return sparse_deformed + + def forward(self, source_image, kp_driving, kp_source): + if self.scale_factor != 1: + source_image = self.down(source_image) + + bs, _, h, w = source_image.shape + + out_dict = dict() + heatmap_representation = self.create_heatmap_representations( + source_image, kp_driving, kp_source) + sparse_motion = self.create_sparse_motions(source_image, kp_driving, + kp_source) + deformed_source = self.create_deformed_source_image( + source_image, sparse_motion) + out_dict['sparse_deformed'] = deformed_source + + input = torch.cat([heatmap_representation, deformed_source], dim=2) + input = input.view(bs, -1, h, w) + + prediction = self.hourglass(input) + + mask = self.mask(prediction) + mask = F.softmax(mask, dim=1) + out_dict['mask'] = mask + mask = mask.unsqueeze(2) + sparse_motion = sparse_motion.permute(0, 1, 4, 2, 3) + deformation = (sparse_motion * mask).sum(dim=1) + deformation = deformation.permute(0, 2, 3, 1) + + out_dict['deformation'] = deformation + + if self.occlusion: + occlusion_map = torch.sigmoid(self.occlusion(prediction)) + out_dict['occlusion_map'] = occlusion_map + + return out_dict diff --git a/modelscope/models/cv/image_face_fusion/network/facerecon_model.py b/modelscope/models/cv/image_face_fusion/network/facerecon_model.py new file mode 100644 index 00000000..53b34d3c --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/network/facerecon_model.py @@ -0,0 +1,559 @@ +# The implementation is adopted from Deep3DFaceRecon_pytorch, made publicly available under the MIT License +# at 
https://github.com/sicxu/Deep3DFaceRecon_pytorch/blob/master/models/networks.py + +import os +from typing import Any, Callable, List, Optional, Type, Union + +import torch +import torch.nn as nn +from torch import Tensor +from torch.optim import lr_scheduler + +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url + + +def filter_state_dict(state_dict, remove_name='fc'): + new_state_dict = {} + for key in state_dict: + if remove_name in key: + continue + new_state_dict[key] = state_dict[key] + return new_state_dict + + +def get_scheduler(optimizer, opt): + """Return a learning rate scheduler + + Parameters: + optimizer -- the optimizer of the network + opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.  + opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine + + For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers. + See https://pytorch.org/docs/stable/optim.html for more details. + """ + if opt.lr_policy == 'linear': + + def lambda_rule(epoch): + lr_l = 1.0 - max(0, epoch + opt.epoch_count + - opt.n_epochs) / float(opt.n_epochs + 1) + return lr_l + + scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule) + elif opt.lr_policy == 'step': + scheduler = lr_scheduler.StepLR( + optimizer, step_size=opt.lr_decay_epochs, gamma=0.2) + elif opt.lr_policy == 'plateau': + scheduler = lr_scheduler.ReduceLROnPlateau( + optimizer, mode='min', factor=0.2, threshold=0.01, patience=5) + elif opt.lr_policy == 'cosine': + scheduler = lr_scheduler.CosineAnnealingLR( + optimizer, T_max=opt.n_epochs, eta_min=0) + else: + return NotImplementedError( + 'learning rate policy [%s] is not implemented', opt.lr_policy) + return scheduler + + +def define_net_recon(net_recon, use_last_fc=False, init_path=None): + return ReconNetWrapper( + net_recon, use_last_fc=use_last_fc, init_path=init_path) + + +class ReconNetWrapper(nn.Module): + fc_dim = 257 + + def __init__(self, net_recon, use_last_fc=False, init_path=None): + super(ReconNetWrapper, self).__init__() + self.use_last_fc = use_last_fc + if net_recon not in func_dict: + return NotImplementedError('network [%s] is not implemented', + net_recon) + func, last_dim = func_dict[net_recon] + backbone = func(use_last_fc=use_last_fc, num_classes=self.fc_dim) + if init_path and os.path.isfile(init_path): + state_dict = filter_state_dict( + torch.load(init_path, map_location='cpu')) + backbone.load_state_dict(state_dict) + print('loading init net_recon %s from %s' % (net_recon, init_path)) + self.backbone = backbone + if not use_last_fc: + self.final_layers = nn.ModuleList([ + conv1x1(last_dim, 80, bias=True), # id layer + conv1x1(last_dim, 64, bias=True), # exp layer + conv1x1(last_dim, 80, bias=True), # tex layer + conv1x1(last_dim, 3, bias=True), # angle layer + conv1x1(last_dim, 27, bias=True), # gamma layer + conv1x1(last_dim, 2, bias=True), # tx, ty + conv1x1(last_dim, 1, bias=True) # tz + ]) + for m in self.final_layers: + nn.init.constant_(m.weight, 0.) + nn.init.constant_(m.bias, 0.) 
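+        # The seven 1x1 heads above split the 257-dim coefficient vector
+        # (80 id + 64 exp + 80 tex + 3 angle + 27 gamma + 2 tx/ty + 1 tz,
+        # matching fc_dim); forward() concatenates and flattens them.
+        # Illustrative shapes, assuming a resnet50 backbone and a 224x224
+        # crop: x (B, 3, 224, 224) -> backbone (B, 2048, 1, 1) -> (B, 257).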
+ + def forward(self, x): + x = self.backbone(x) + if not self.use_last_fc: + output = [] + for layer in self.final_layers: + output.append(layer(x)) + x = torch.flatten(torch.cat(output, dim=1), 1) + return x + + +# adapted from https://github.com/pytorch/vision/edit/master/torchvision/models/resnet.py +__all__ = [ + 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', + 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', + 'wide_resnet101_2' +] + +model_urls = { + 'resnet18': + 'https://download.pytorch.org/models/resnet18-f37072fd.pth', + 'resnet34': + 'https://download.pytorch.org/models/resnet34-b627a593.pth', + 'resnet50': + 'https://download.pytorch.org/models/resnet50-0676ba61.pth', + 'resnet101': + 'https://download.pytorch.org/models/resnet101-63fe2227.pth', + 'resnet152': + 'https://download.pytorch.org/models/resnet152-394f9c45.pth', + 'resnext50_32x4d': + 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', + 'resnext101_32x8d': + 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', + 'wide_resnet50_2': + 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', + 'wide_resnet101_2': + 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', +} + + +def conv3x3(in_planes: int, + out_planes: int, + stride: int = 1, + groups: int = 1, + dilation: int = 1) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes: int, + out_planes: int, + stride: int = 1, + bias: bool = False) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=bias) + + +class BasicBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None) -> None: + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + 'Dilation > 1 not supported in BasicBlock') + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
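+    # Channel flow of one block, e.g. the first stride-2 block of layer2 in
+    # resnet50 (inplanes=256, planes=128; example values only):
+    #   conv1 1x1: 256 -> 128, conv2 3x3 stride 2: 128 -> 128,
+    #   conv3 1x1: 128 -> 512 (= planes * expansion), while a strided 1x1
+    #   downsample brings the identity path from 256 to 512 channels.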
+ + expansion: int = 4 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None) -> None: + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__( + self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + num_classes: int = 1000, + zero_init_residual: bool = False, + use_last_fc: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: Optional[List[bool]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None) -> None: + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError('replace_stride_with_dilation should be None ' + 'or a 3-element tuple, got {}'.format( + replace_stride_with_dilation)) + self.use_last_fc = use_last_fc + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d( + 3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer( + block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer( + block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer( + block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + if self.use_last_fc: + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, + 0) # type: ignore[arg-type] + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, + 0) # type: ignore[arg-type] + + def _make_layer(self, + block: Type[Union[BasicBlock, Bottleneck]], + planes: int, + blocks: int, + stride: int = 1, + dilate: bool = False) -> nn.Sequential: + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x: Tensor) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + if self.use_last_fc: + x = torch.flatten(x, 1) + x = self.fc(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _resnet(arch: str, block: Type[Union[BasicBlock, + Bottleneck]], layers: List[int], + pretrained: bool, progress: bool, **kwargs: Any) -> ResNet: + model = ResNet(block, layers, **kwargs) + if pretrained: + state_dict = load_state_dict_from_url( + model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + +def resnet18(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, + **kwargs) + + +def resnet34(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet50(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet101(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, + progress, **kwargs) + + +def resnet152(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, + progress, **kwargs) + + +def resnext50_32x4d(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 4 + return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def resnext101_32x8d(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 8 + return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], pretrained, + progress, **kwargs) + + +def wide_resnet50_2(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def wide_resnet101_2(pretrained: bool = False, + progress: bool = True, + **kwargs: Any) -> ResNet: + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], pretrained, + progress, **kwargs) + + +func_dict = {'resnet18': (resnet18, 512), 'resnet50': (resnet50, 2048)} diff --git a/modelscope/models/cv/image_face_fusion/network/model_irse.py b/modelscope/models/cv/image_face_fusion/network/model_irse.py new file mode 100644 index 00000000..4e212bd0 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/network/model_irse.py @@ -0,0 +1,249 @@ +# The implementation is adopted from face.evoLVe, made publicly available under the MIT License +# at https://github.com/ZhaoJ9014/face.evoLVe/blob/master/backbone/model_irse.py +from collections import namedtuple + +import torch +import torch.nn as nn +from torch.nn import (AdaptiveAvgPool2d, BatchNorm1d, BatchNorm2d, Conv2d, + Dropout, Linear, MaxPool2d, Module, PReLU, ReLU, + Sequential, Sigmoid) + + +class Flatten(Module): + + def forward(self, input): + return input.view(input.size(0), -1) + + +def l2_norm(input, axis=1): + norm = torch.norm(input, 2, axis, True) + output = torch.div(input, norm) + + return output + + +class SEModule(Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2d(1) + self.fc1 = Conv2d( + channels, + channels // reduction, + kernel_size=1, + padding=0, + bias=False) + + nn.init.xavier_uniform_(self.fc1.weight.data) + + self.relu = ReLU(inplace=True) + self.fc2 = Conv2d( + channels // reduction, + channels, + kernel_size=1, + padding=0, + bias=False) + + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + + return module_input * x + + +class bottleneck_IR(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class bottleneck_IR_SE(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR_SE, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth), SEModule(depth, 16)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + '''A named tuple describing a ResNet block.''' + + +def get_block(in_channel, depth, num_units, stride=2): + + return [Bottleneck(in_channel, depth, stride) + ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def 
get_blocks(num_layers): + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + + return blocks + + +class Backbone(Module): + + def __init__(self, input_size, num_layers, mode='ir'): + super(Backbone, self).__init__() + assert input_size[0] in [ + 112, 224 + ], 'input_size should be [112, 112] or [224, 224]' + assert num_layers in [50, 100, + 152], 'num_layers should be 50, 100 or 152' + assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = bottleneck_IR + elif mode == 'ir_se': + unit_module = bottleneck_IR_SE + self.input_layer = Sequential( + Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), + PReLU(64)) + if input_size[0] == 112: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(0.4), Flatten(), + Linear(512 * 7 * 7, 512), BatchNorm1d(512, affine=False)) + else: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(0.4), Flatten(), + Linear(512 * 14 * 14, 512), BatchNorm1d(512, affine=False)) + + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, bottleneck.depth, + bottleneck.stride)) + self.body = Sequential(*modules) + + self._initialize_weights() + + def forward(self, x): + x = self.input_layer(x) + x = self.body(x) + conv_out = x.view(x.shape[0], -1) + x = self.output_layer(x) + + return l2_norm(x), conv_out + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + m.bias.data.zero_() + + +def IR_50(input_size): + """Constructs a ir-50 model. + """ + model = Backbone(input_size, 50, 'ir') + + return model + + +def IR_101(input_size): + """Constructs a ir-101 model. + """ + model = Backbone(input_size, 100, 'ir') + + return model + + +def IR_152(input_size): + """Constructs a ir-152 model. + """ + model = Backbone(input_size, 152, 'ir') + + return model + + +def IR_SE_50(input_size): + """Constructs a ir_se-50 model. + """ + model = Backbone(input_size, 50, 'ir_se') + + return model + + +def IR_SE_101(input_size): + """Constructs a ir_se-101 model. + """ + model = Backbone(input_size, 100, 'ir_se') + + return model + + +def IR_SE_152(input_size): + """Constructs a ir_se-152 model. 
+ """ + model = Backbone(input_size, 152, 'ir_se') + + return model diff --git a/modelscope/models/cv/image_face_fusion/network/ops.py b/modelscope/models/cv/image_face_fusion/network/ops.py new file mode 100644 index 00000000..d168e691 --- /dev/null +++ b/modelscope/models/cv/image_face_fusion/network/ops.py @@ -0,0 +1,211 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import Parameter + + +def init_func(m, init_type='xavier', gain=0.02): + classname = m.__class__.__name__ + if classname.find('BatchNorm2d') != -1: + if hasattr(m, 'weight') and m.weight is not None: + nn.init.normal_(m.weight, 1.0, gain) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias, 0.0) + elif hasattr(m, 'weight') and (classname.find('Conv') != -1 + or classname.find('Linear') != -1): + if init_type == 'normal': + nn.init.normal_(m.weight, 0.0, gain) + elif init_type == 'xavier': + nn.init.xavier_normal_(m.weight, gain=gain) + elif init_type == 'xavier_uniform': + nn.init.xavier_uniform_(m.weight, gain=1.0) + elif init_type == 'kaiming': + nn.init.kaiming_normal_(m.weight, a=0, mode='fan_in') + elif init_type == 'orthogonal': + nn.init.orthogonal_(m.weight, gain=gain) + elif init_type == 'none': # uses pytorch's default init method + m.reset_parameters() + else: + raise NotImplementedError( + 'initialization method [%s] is not implemented' % init_type) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias, 0.0) + elif hasattr(m, 'weight_bar') and (classname.find('Conv') != -1): + if init_type == 'normal': + nn.init.normal_(m.weight_bar, 0.0, gain) + elif init_type == 'xavier': + nn.init.xavier_normal_(m.weight_bar, gain=gain) + elif init_type == 'xavier_uniform': + nn.init.xavier_uniform_(m.weight_bar, gain=1.0) + elif init_type == 'kaiming': + nn.init.kaiming_normal_(m.weight_bar, a=0, mode='fan_in') + elif init_type == 'orthogonal': + nn.init.orthogonal_(m.weight_bar, gain=gain) + elif init_type == 'none': # uses pytorch's default init method + m.reset_parameters() + else: + raise NotImplementedError( + 'initialization method [%s] is not implemented' % init_type) + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias, 0.0) + + +def l2normalize(v, eps=1e-12): + return v / (v.norm() + eps) + + +class SpectralNorm(nn.Module): + + def __init__(self, module, name='weight', power_iterations=1): + super(SpectralNorm, self).__init__() + self.module = module + self.name = name + self.power_iterations = power_iterations + self._make_params() + + def _update_u_v(self): + u = getattr(self.module, self.name + '_u') + v = getattr(self.module, self.name + '_v') + w = getattr(self.module, self.name + '_bar') + + height = w.data.shape[0] + for _ in range(self.power_iterations): + v.data = l2normalize( + torch.mv(torch.t(w.view(height, -1).data), u.data)) + u.data = l2normalize(torch.mv(w.view(height, -1).data, v.data)) + + sigma = u.dot(w.view(height, -1).mv(v)) + setattr(self.module, self.name, w / sigma.expand_as(w)) + + def _noupdate_u_v(self): + u = getattr(self.module, self.name + '_u') + v = getattr(self.module, self.name + '_v') + w = getattr(self.module, self.name + '_bar') + + height = w.data.shape[0] + sigma = u.dot(w.view(height, -1).mv(v)) + setattr(self.module, self.name, w / sigma.expand_as(w)) + + def _make_params(self): + w = getattr(self.module, self.name) + + height = w.data.shape[0] + width = w.view(height, -1).data.shape[1] + + u = 
Parameter(w.data.new(height).normal_(0, 1), requires_grad=False) + v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False) + u.data = l2normalize(u.data) + v.data = l2normalize(v.data) + w_bar = Parameter(w.data) + + del self.module._parameters[self.name] + + self.module.register_parameter(self.name + '_u', u) + self.module.register_parameter(self.name + '_v', v) + self.module.register_parameter(self.name + '_bar', w_bar) + + def forward(self, *args): + if self.module.training: + self._update_u_v() + else: + self._noupdate_u_v() + return self.module.forward(*args) + + +def convert_affinematrix_to_homography(A): + H = torch.nn.functional.pad(A, [0, 0, 0, 1], 'constant', value=0.0) + H[..., -1, -1] += 1.0 + return H + + +def normal_transform_pixel(height, width, eps=1e-14): + tr_mat = torch.tensor([[1.0, 0.0, -1.0], [0.0, 1.0, -1.0], [0.0, 0.0, + 1.0]]) # 3x3 + + # prevent divide by zero bugs + width_denom = eps if width == 1 else width - 1.0 + height_denom = eps if height == 1 else height - 1.0 + + tr_mat[0, 0] = tr_mat[0, 0] * 2.0 / width_denom + tr_mat[1, 1] = tr_mat[1, 1] * 2.0 / height_denom + + return tr_mat.unsqueeze(0) # 1x3x3 + + +def _torch_inverse_cast(input): + if not isinstance(input, torch.Tensor): + raise AssertionError( + f'Input must be torch.Tensor. Got: {type(input)}.') + dtype = input.dtype + if dtype not in (torch.float32, torch.float64): + dtype = torch.float32 + return torch.inverse(input.to(dtype)).to(input.dtype) + + +def normalize_homography(dst_pix_trans_src_pix, dsize_src, dsize_dst): + if not isinstance(dst_pix_trans_src_pix, torch.Tensor): + raise TypeError( + f'Input type is not a torch.Tensor. Got {type(dst_pix_trans_src_pix)}' + ) + + if not (len(dst_pix_trans_src_pix.shape) == 3 + or dst_pix_trans_src_pix.shape[-2:] == (3, 3)): + raise ValueError( + f'Input dst_pix_trans_src_pix must be a Bx3x3 tensor. Got {dst_pix_trans_src_pix.shape}' + ) + + # source and destination sizes + src_h, src_w = dsize_src + dst_h, dst_w = dsize_dst + + # compute the transformation pixel/norm for src/dst + src_norm_trans_src_pix: torch.Tensor = normal_transform_pixel( + src_h, src_w).to(dst_pix_trans_src_pix) + + src_pix_trans_src_norm = _torch_inverse_cast(src_norm_trans_src_pix) + dst_norm_trans_dst_pix = normal_transform_pixel( + dst_h, dst_w).to(dst_pix_trans_src_pix) + + # compute chain transformations + dst_norm_trans_src_norm = dst_norm_trans_dst_pix @ ( + dst_pix_trans_src_pix @ src_pix_trans_src_norm) + return dst_norm_trans_src_norm + + +def warp_affine_torch(src, + M, + dsize, + mode='bilinear', + padding_mode='zeros', + align_corners=True): + + if not isinstance(src, torch.Tensor): + raise TypeError( + f'Input src type is not a torch.Tensor. Got {type(src)}') + + if not isinstance(M, torch.Tensor): + raise TypeError(f'Input M type is not a torch.Tensor. Got {type(M)}') + + if not len(src.shape) == 4: + raise ValueError( + f'Input src must be a BxCxHxW tensor. Got {src.shape}') + + if not (len(M.shape) == 3 or M.shape[-2:] == (2, 3)): + raise ValueError(f'Input M must be a Bx2x3 tensor. 
Got {M.shape}') + + B, C, H, W = src.size() + + # we generate a 3x3 transformation matrix from 2x3 affine + M_3x3 = convert_affinematrix_to_homography(M) + dst_norm_trans_src_norm = normalize_homography(M_3x3, (H, W), dsize) + src_norm_trans_dst_norm = _torch_inverse_cast(dst_norm_trans_src_norm) + grid = F.affine_grid( + src_norm_trans_dst_norm[:, :2, :], [B, C, dsize[0], dsize[1]], + align_corners=align_corners) + return F.grid_sample( + src, + grid, + align_corners=align_corners, + mode=mode, + padding_mode=padding_mode) diff --git a/modelscope/models/cv/image_instance_segmentation/__init__.py b/modelscope/models/cv/image_instance_segmentation/__init__.py index 8ccfef4b..60e688eb 100644 --- a/modelscope/models/cv/image_instance_segmentation/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/__init__.py @@ -5,13 +5,18 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .cascade_mask_rcnn_swin import CascadeMaskRCNNSwin + from .maskdino_swin import MaskDINOSwin from .model import CascadeMaskRCNNSwinModel - from .postprocess_utils import get_img_ins_seg_result + from .maskdino_model import MaskDINOSwinModel + from .postprocess_utils import get_img_ins_seg_result, get_maskdino_ins_seg_result else: _import_structure = { 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], + 'maskdino_swin': ['MaskDINOSwin'], 'model': ['CascadeMaskRCNNSwinModel'], - 'postprocess_utils': ['get_img_ins_seg_result'], + 'maskdino_model': ['MaskDINOSwinModel'], + 'postprocess_utils': + ['get_img_ins_seg_result', 'get_maskdino_ins_seg_result'], } import sys diff --git a/modelscope/models/cv/image_instance_segmentation/backbones/__init__.py b/modelscope/models/cv/image_instance_segmentation/backbones/__init__.py index fec1b627..bbeac51e 100644 --- a/modelscope/models/cv/image_instance_segmentation/backbones/__init__.py +++ b/modelscope/models/cv/image_instance_segmentation/backbones/__init__.py @@ -5,10 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .swin_transformer import SwinTransformer + from .swin_transformer import D2SwinTransformer else: _import_structure = { - 'swin_transformer': ['SwinTransformer'], + 'swin_transformer': ['SwinTransformer', 'D2SwinTransformer'], } import sys diff --git a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py index 2007688d..09ab7c20 100644 --- a/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py +++ b/modelscope/models/cv/image_instance_segmentation/backbones/swin_transformer.py @@ -692,3 +692,56 @@ class SwinTransformer(nn.Module): """Convert the model into training mode while keep layers freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() + + +class D2SwinTransformer(SwinTransformer): + + def __init__(self, *args, **kwargs): + self._out_features = kwargs.pop('out_features') + super().__init__(*args, **kwargs) + + self._out_feature_strides = { + 'res2': 4, + 'res3': 8, + 'res4': 16, + 'res5': 32, + } + self._out_feature_channels = { + 'res2': self.num_features[0], + 'res3': self.num_features[1], + 'res4': self.num_features[2], + 'res5': self.num_features[3], + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. 
+ Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert ( + x.dim() == 4 + ), f'SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!' + outputs = {} + outs = super().forward(x) + y = {} + for out, i in zip(outs, self.out_indices): + y['res{}'.format(i + 2)] = out + + for k in y.keys(): + if k in self._out_features: + outputs[k] = y[k] + return outputs + + def output_shape(self): + return { + name: dict( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name]) + for name in self._out_features + } + + @property + def size_divisibility(self): + return 32 diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino/__init__.py b/modelscope/models/cv/image_instance_segmentation/maskdino/__init__.py new file mode 100644 index 00000000..703d04f1 --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .maskdino_encoder import MaskDINOEncoder + from .maskdino_decoder import MaskDINODecoder + +else: + _import_structure = { + 'maskdino_encoder': ['MaskDINOEncoder'], + 'maskdino_decoder': ['MaskDINODecoder'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino/dino_decoder.py b/modelscope/models/cv/image_instance_segmentation/maskdino/dino_decoder.py new file mode 100644 index 00000000..769922c5 --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino/dino_decoder.py @@ -0,0 +1,277 @@ +# The implementation is adopted from DINO, made publicly available under the Apache License, +# Version 2.0 at https://github.com/IDEA-Research/DINO + +from typing import Optional + +import torch +from torch import Tensor, nn +from torch.cuda.amp import autocast + +from .ms_deform_attn import MSDeformAttn +from .utils import (MLP, _get_activation_fn, _get_clones, + gen_sineembed_for_position, inverse_sigmoid) + + +class TransformerDecoder(nn.Module): + + def __init__( + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + modulate_hw_attn=True, + num_feature_levels=1, + deformable_decoder=True, + decoder_query_perturber=None, + dec_layer_number=None, # number of queries each layer in decoder + rm_dec_query_scale=True, + dec_layer_share=False, + dec_layer_dropout_prob=None, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones( + decoder_layer, num_layers, layer_share=dec_layer_share) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, 'support return_intermediate only' + self.query_dim = query_dim + assert query_dim in [ + 2, 4 + ], 'query_dim should be 2/4 but {}'.format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, + 2) + if not deformable_decoder: + self.query_pos_sine_scale = MLP(d_model, d_model, d_model, 2) + else: + self.query_pos_sine_scale = None + + if rm_dec_query_scale: + self.query_scale = None + else: + raise NotImplementedError + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + 
self.modulate_hw_attn = modulate_hw_attn + self.deformable_decoder = deformable_decoder + + if not deformable_decoder and modulate_hw_attn: + self.ref_anchor_head = MLP(d_model, d_model, 2, 2) + else: + self.ref_anchor_head = None + + self.decoder_query_perturber = decoder_query_perturber + self.box_pred_damping = None + + self.dec_layer_number = dec_layer_number + if dec_layer_number is not None: + assert isinstance(dec_layer_number, list) + assert len(dec_layer_number) == num_layers + # assert dec_layer_number[0] == + + self.dec_layer_dropout_prob = dec_layer_dropout_prob + if dec_layer_dropout_prob is not None: + assert isinstance(dec_layer_dropout_prob, list) + assert len(dec_layer_dropout_prob) == num_layers + for i in dec_layer_dropout_prob: + assert 0.0 <= i <= 1.0 + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2 + # for memory + level_start_index: Optional[Tensor] = None, # num_levels + spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[Tensor] = None, + ): + + output = tgt + + intermediate = [] + reference_points = refpoints_unsigmoid.sigmoid() + ref_points = [reference_points] + + for layer_id, layer in enumerate(self.layers): + # preprocess ref points + if self.training and self.decoder_query_perturber is not None and layer_id != 0: + reference_points = self.decoder_query_perturber( + reference_points) + + reference_points_input = ( + reference_points[:, :, None] + * torch.cat([valid_ratios, valid_ratios], -1)[None, :] + ) # nq, bs, nlevel, 4 + query_sine_embed = gen_sineembed_for_position( + reference_points_input[:, :, 0, :]) # nq, bs, 256*2 + + raw_query_pos = self.ref_point_head( + query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale( + output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask) + + # iter update + if self.bbox_embed is not None: + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = outputs_unsig.sigmoid() + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [[itm_out.transpose(0, 1) for itm_out in intermediate], + [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points]] + + +class DeformableTransformerDecoderLayer(nn.Module): + + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation='relu', + n_levels=4, + n_heads=8, + n_points=4, + use_deformable_box_attn=False, + 
key_aware_type=None, + ): + super().__init__() + + # cross attention + if use_deformable_box_attn: + raise NotImplementedError + else: + self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, + n_points) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation) + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_type = key_aware_type + self.key_aware_proj = None + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + @autocast(enabled=False) + def forward( + self, + tgt: Optional[Tensor], # nq, bs, d_model + tgt_query_pos: Optional[ + Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[ + Tensor] = None, # pos for query. Sine(pos) + tgt_key_padding_mask: Optional[Tensor] = None, + tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4 + memory: Optional[Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[Tensor] = None, + memory_level_start_index: Optional[Tensor] = None, # num_levels + memory_spatial_shapes: Optional[ + Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[Tensor] = None, # pos for memory + self_attn_mask: Optional[ + Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[ + Tensor] = None, # mask used for cross-attention + ): + + # self attention + if self.self_attn is not None: + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # cross attention + if self.key_aware_type is not None: + if self.key_aware_type == 'mean': + tgt = tgt + memory.mean(0, keepdim=True) + elif self.key_aware_type == 'proj_mean': + tgt = tgt + self.key_aware_proj(memory).mean(0, keepdim=True) + else: + raise NotImplementedError('Unknown key_aware_type: {}'.format( + self.key_aware_type)) + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1), + tgt_reference_points.transpose(0, 1).contiguous(), + memory.transpose(0, 1), memory_spatial_shapes, + memory_level_start_index, memory_key_padding_mask).transpose(0, 1) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino/maskdino_decoder.py b/modelscope/models/cv/image_instance_segmentation/maskdino/maskdino_decoder.py new file mode 100644 index 00000000..5e6c9ad2 --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino/maskdino_decoder.py @@ -0,0 +1,352 @@ +# The implementation is adopted from Mask DINO, made publicly available under the Apache License, +# Version 2.0 at https://github.com/IDEA-Research/MaskDINO +# Part of implementation is borrowed from Mask2Former, +# https://github.com/facebookresearch/Mask2Former, under MIT license. 
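+
+# Rough data flow (informal summary; shapes assumed, not part of the
+# upstream file): with two_stage=True the flattened encoder memory of shape
+# (bs, sum_l H_l*W_l, hidden_dim) is scored by class_embed, the top
+# num_queries proposals seed the query boxes, and each decoder layer then
+# refines those boxes through the shared bbox_embed MLP while predicting
+# per-query class logits and mask embeddings that are combined with
+# mask_features via einsum('bqc,bchw->bqhw') to produce instance masks.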
+ +import torch +from torch import nn + +from .dino_decoder import DeformableTransformerDecoderLayer, TransformerDecoder +from .utils import (MLP, Conv2d, box_xyxy_to_cxcywh, + gen_encoder_output_proposals, get_bounding_boxes, + inverse_sigmoid) + + +class MaskDINODecoder(nn.Module): + + def __init__( + self, + in_channels, + mask_classification=True, + *, + num_classes: int, + hidden_dim: int, + num_queries: int, + nheads: int, + dim_feedforward: int, + dec_layers: int, + mask_dim: int, + enforce_input_project: bool, + two_stage: bool, + initialize_box_type: bool, + initial_pred: bool, + learn_tgt: bool, + total_num_feature_levels: int = 4, + dropout: float = 0.0, + activation: str = 'relu', + nhead: int = 8, + dec_n_points: int = 4, + return_intermediate_dec: bool = True, + query_dim: int = 4, + dec_layer_share: bool = False, + semantic_ce_loss: bool = False, + ): + """ + NOTE: this interface is experimental. + Args: + in_channels: channels of the input features + mask_classification: whether to add mask classifier or not + num_classes: number of classes + hidden_dim: Transformer feature dimension + num_queries: number of queries + nheads: number of heads + dim_feedforward: feature dimension in feedforward network + dec_layers: number of Transformer decoder layers + mask_dim: mask feature dimension + enforce_input_project: add input project 1x1 conv even if input + channels and hidden dim is identical + dropout: dropout rate + activation: activation function + nhead: num heads in multi-head attention + dec_n_points: number of sampling points in decoder + return_intermediate_dec: return the intermediate results of decoder + query_dim: 4 -> (x, y, w, h) + dec_layer_share: whether to share each decoder layer + semantic_ce_loss: use ce loss for semantic segmentation + """ + super().__init__() + + assert mask_classification, 'Only support mask classification model' + self.mask_classification = mask_classification + self.num_feature_levels = total_num_feature_levels + self.initial_pred = initial_pred + + # define Transformer decoder here + self.learn_tgt = learn_tgt + self.num_heads = nheads + self.num_layers = dec_layers + self.two_stage = two_stage + self.initialize_box_type = initialize_box_type + self.total_num_feature_levels = total_num_feature_levels + + self.num_queries = num_queries + self.semantic_ce_loss = semantic_ce_loss + # learnable query features + if not two_stage or self.learn_tgt: + self.query_feat = nn.Embedding(num_queries, hidden_dim) + if not two_stage and initialize_box_type == 'no': + self.query_embed = nn.Embedding(num_queries, 4) + if two_stage: + self.enc_output = nn.Linear(hidden_dim, hidden_dim) + self.enc_output_norm = nn.LayerNorm(hidden_dim) + + self.input_proj = nn.ModuleList() + for _ in range(self.num_feature_levels): + if in_channels != hidden_dim or enforce_input_project: + self.input_proj.append( + Conv2d(in_channels, hidden_dim, kernel_size=1)) + nn.init.kaiming_uniform_(self.input_proj[-1].weight, a=1) + nn.init.constant_(self.input_proj[-1].bias, 0) + else: + self.input_proj.append(nn.Sequential()) + self.num_classes = num_classes + # output FFNs + assert self.mask_classification, 'why not class embedding?' 
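+        # Heads built below: class_embed maps each query to num_classes
+        # logits (num_classes + 1 when semantic_ce_loss is used), label_enc
+        # is a learnable class-label embedding kept in the graph during
+        # training, mask_embed is a 3-layer MLP producing mask_dim-dim mask
+        # embeddings, and a single zero-initialized _bbox_embed MLP is
+        # shared by all decoder layers for iterative box refinement.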
+ if self.mask_classification: + if self.semantic_ce_loss: + self.class_embed = nn.Linear(hidden_dim, num_classes + 1) + else: + self.class_embed = nn.Linear(hidden_dim, num_classes) + self.label_enc = nn.Embedding(num_classes, hidden_dim) + self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) + + # init decoder + self.decoder_norm = decoder_norm = nn.LayerNorm(hidden_dim) + decoder_layer = DeformableTransformerDecoderLayer( + hidden_dim, dim_feedforward, dropout, activation, + self.num_feature_levels, nhead, dec_n_points) + self.decoder = TransformerDecoder( + decoder_layer, + self.num_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + d_model=hidden_dim, + query_dim=query_dim, + num_feature_levels=self.num_feature_levels, + dec_layer_share=dec_layer_share, + ) + + self.hidden_dim = hidden_dim + self._bbox_embed = _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) + box_embed_layerlist = [_bbox_embed for i in range(self.num_layers) + ] # share box prediction each layer + self.bbox_embed = nn.ModuleList(box_embed_layerlist) + self.decoder.bbox_embed = self.bbox_embed + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def pred_box(self, reference, hs, ref0=None): + """ + Args: + reference: reference box coordinates from each decoder layer + hs: content + ref0: whether there are prediction from the first layer + """ + if ref0 is None: + outputs_coord_list = [] + else: + outputs_coord_list = [ref0] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs)): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid( + layer_ref_sig) + layer_outputs_unsig = layer_outputs_unsig.sigmoid() + outputs_coord_list.append(layer_outputs_unsig) + outputs_coord_list = torch.stack(outputs_coord_list) + return outputs_coord_list + + def forward(self, x, mask_features, masks, targets=None): + """ + Args: + x: input, a list of multi-scale feature + mask_features: is the per-pixel embeddings with resolution 1/4 of the original image, + obtained by fusing backbone encoder encoded features. This is used to produce binary masks. 
+ masks: mask in the original image + targets: used for denoising training + """ + assert len(x) == self.num_feature_levels + size_list = [] + # disable mask, it does not affect performance + enable_mask = 0 + if masks is not None: + for src in x: + if src.size(2) % 32 or src.size(3) % 32: + enable_mask = 1 + if enable_mask == 0: + masks = [ + torch.zeros((src.size(0), src.size(2), src.size(3)), + device=src.device, + dtype=torch.bool) for src in x + ] + src_flatten = [] + mask_flatten = [] + spatial_shapes = [] + for i in range(self.num_feature_levels): + idx = self.num_feature_levels - 1 - i + bs, c, h, w = x[idx].shape + size_list.append(x[i].shape[-2:]) + spatial_shapes.append(x[idx].shape[-2:]) + src_flatten.append(self.input_proj[idx]( + x[idx]).flatten(2).transpose(1, 2)) + mask_flatten.append(masks[i].flatten(1)) + src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw} + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=src_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + predictions_class = [] + predictions_mask = [] + if self.two_stage: + output_memory, output_proposals = gen_encoder_output_proposals( + src_flatten, mask_flatten, spatial_shapes) + output_memory = self.enc_output_norm( + self.enc_output(output_memory)) + enc_outputs_class_unselected = self.class_embed(output_memory) + enc_outputs_coord_unselected = self._bbox_embed( + output_memory + ) + output_proposals # (bs, \sum{hw}, 4) unsigmoid + topk = self.num_queries + topk_proposals = torch.topk( + enc_outputs_class_unselected.max(-1)[0], topk, dim=1)[1] + refpoint_embed_undetach = torch.gather( + enc_outputs_coord_unselected, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) # unsigmoid + refpoint_embed = refpoint_embed_undetach.detach() + + tgt_undetach = torch.gather(output_memory, 1, + topk_proposals.unsqueeze(-1).repeat( + 1, 1, + self.hidden_dim)) # unsigmoid + + outputs_class, outputs_mask = self.forward_prediction_heads( + tgt_undetach.transpose(0, 1), mask_features) + tgt = tgt_undetach.detach() + if self.learn_tgt: + tgt = self.query_feat.weight[None].repeat(bs, 1, 1) + interm_outputs = dict() + interm_outputs['pred_logits'] = outputs_class + interm_outputs['pred_boxes'] = refpoint_embed_undetach.sigmoid() + interm_outputs['pred_masks'] = outputs_mask + + if self.initialize_box_type != 'no': + # convert masks into boxes to better initialize box in the decoder + assert self.initial_pred + flatten_mask = outputs_mask.detach().flatten(0, 1) + h, w = outputs_mask.shape[-2:] + if self.initialize_box_type == 'bitmask': # slower, but more accurate + refpoint_embed = get_bounding_boxes(flatten_mask > 0) + else: + assert NotImplementedError + refpoint_embed = box_xyxy_to_cxcywh( + refpoint_embed) / torch.as_tensor( + [w, h, w, h], + dtype=torch.float, + device=refpoint_embed.device) + refpoint_embed = refpoint_embed.reshape( + outputs_mask.shape[0], outputs_mask.shape[1], 4) + refpoint_embed = inverse_sigmoid(refpoint_embed) + elif not self.two_stage: + tgt = self.query_feat.weight[None].repeat(bs, 1, 1) + refpoint_embed = self.query_embed.weight[None].repeat(bs, 1, 1) + + tgt_mask = None + mask_dict = None + + # direct prediction from the matching and denoising part in the begining + if self.initial_pred: + outputs_class, outputs_mask = self.forward_prediction_heads( + 
tgt.transpose(0, 1), mask_features, self.training) + predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + + hs, references = self.decoder( + tgt=tgt.transpose(0, 1), + memory=src_flatten.transpose(0, 1), + memory_key_padding_mask=mask_flatten, + pos=None, + refpoints_unsigmoid=refpoint_embed.transpose(0, 1), + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=tgt_mask) + for i, output in enumerate(hs): + outputs_class, outputs_mask = self.forward_prediction_heads( + output.transpose(0, 1), mask_features, self.training + or (i == len(hs) - 1)) + predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + + # iteratively box prediction + if self.initial_pred: + out_boxes = self.pred_box(references, hs, refpoint_embed.sigmoid()) + assert len(predictions_class) == self.num_layers + 1 + else: + out_boxes = self.pred_box(references, hs) + if mask_dict is not None: + predictions_mask = torch.stack(predictions_mask) + predictions_class = torch.stack(predictions_class) + predictions_class, out_boxes, predictions_mask = \ + self.dn_post_process(predictions_class, out_boxes, mask_dict, predictions_mask) + predictions_class, predictions_mask = list( + predictions_class), list(predictions_mask) + elif self.training: # this is to insure self.label_enc participate in the model + predictions_class[-1] += 0.0 * self.label_enc.weight.sum() + + out = { + 'pred_logits': + predictions_class[-1], + 'pred_masks': + predictions_mask[-1], + 'pred_boxes': + out_boxes[-1], + 'aux_outputs': + self._set_aux_loss( + predictions_class if self.mask_classification else None, + predictions_mask, out_boxes) + } + if self.two_stage: + out['interm_outputs'] = interm_outputs + return out, mask_dict + + def forward_prediction_heads(self, output, mask_features, pred_mask=True): + decoder_output = self.decoder_norm(output) + decoder_output = decoder_output.transpose(0, 1) + outputs_class = self.class_embed(decoder_output) + outputs_mask = None + if pred_mask: + mask_embed = self.mask_embed(decoder_output) + outputs_mask = torch.einsum('bqc,bchw->bqhw', mask_embed, + mask_features) + + return outputs_class, outputs_mask + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_seg_masks, out_boxes=None): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + if out_boxes is None: + return [{ + 'pred_logits': a, + 'pred_masks': b + } for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])] + else: + return [{ + 'pred_logits': a, + 'pred_masks': b, + 'pred_boxes': c + } for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1], + out_boxes[:-1])] diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino/maskdino_encoder.py b/modelscope/models/cv/image_instance_segmentation/maskdino/maskdino_encoder.py new file mode 100644 index 00000000..0116ec0b --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino/maskdino_encoder.py @@ -0,0 +1,473 @@ +# The implementation is adopted from Mask DINO, made publicly available under the Apache License, +# Version 2.0 at https://github.com/IDEA-Research/MaskDINO +# Part of implementation is borrowed from Mask2Former, +# https://github.com/facebookresearch/Mask2Former, under MIT license. 
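A note on the two prediction heads the decoder above relies on: boxes are refined layer by layer by adding an unsigmoided delta to the inverse-sigmoid of the previous reference points, and masks come from a dot product between per-query embeddings and the per-pixel mask features (the 'bqc,bchw->bqhw' einsum in forward_prediction_heads). A minimal, self-contained sketch of both operations with dummy tensors; the shapes are illustrative only, not taken from the shipped configuration:

import torch

def refine_boxes(reference_sig, delta_unsig, eps=1e-5):
    # unsigmoided delta + inverse_sigmoid(reference), then squash back into (0, 1)
    ref = reference_sig.clamp(min=eps, max=1 - eps)
    return (delta_unsig + torch.log(ref / (1 - ref))).sigmoid()

bs, q, c, h, w = 2, 300, 256, 64, 64
mask_embed = torch.randn(bs, q, c)          # per-query embeddings from the decoder
mask_features = torch.randn(bs, c, h, w)    # 1/4-resolution per-pixel features
outputs_mask = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_features)
print(outputs_mask.shape)                   # torch.Size([2, 300, 64, 64])

boxes = refine_boxes(torch.rand(bs, q, 4), torch.randn(bs, q, 4))
print(boxes.shape)                          # torch.Size([2, 300, 4]), normalized cx, cy, w, h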
+ +from typing import Any, Dict, List + +import numpy as np +import torch +from torch import nn +from torch.cuda.amp import autocast +from torch.nn import functional as F +from torch.nn.init import constant_, kaiming_uniform_, normal_ + +from .ms_deform_attn import MSDeformAttn +from .position_encoding import PositionEmbeddingSine +from .utils import Conv2d, _get_activation_fn, _get_clones + + +# MSDeformAttn Transformer encoder in deformable detr +class MSDeformAttnTransformerEncoderOnly(nn.Module): + + def __init__( + self, + d_model=256, + nhead=8, + num_encoder_layers=6, + dim_feedforward=1024, + dropout=0.1, + activation='relu', + num_feature_levels=4, + enc_n_points=4, + ): + super().__init__() + + self.d_model = d_model + self.nhead = nhead + + encoder_layer = MSDeformAttnTransformerEncoderLayer( + d_model, dim_feedforward, dropout, activation, num_feature_levels, + nhead, enc_n_points) + self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, + num_encoder_layers) + + self.level_embed = nn.Parameter( + torch.Tensor(num_feature_levels, d_model)) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def forward(self, srcs, masks, pos_embeds): + + enable_mask = 0 + if masks is not None: + for src in srcs: + if src.size(2) % 32 or src.size(3) % 32: + enable_mask = 1 + if enable_mask == 0: + masks = [ + torch.zeros((x.size(0), x.size(2), x.size(3)), + device=x.device, + dtype=torch.bool) for x in srcs + ] + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (src, mask, + pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + src = src.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + src_flatten = torch.cat(src_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=src_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # encoder + memory = self.encoder(src_flatten, spatial_shapes, level_start_index, + valid_ratios, lvl_pos_embed_flatten, + mask_flatten) + + return memory, spatial_shapes, level_start_index + + +class MSDeformAttnTransformerEncoderLayer(nn.Module): + + def __init__(self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation='relu', + n_levels=4, + n_heads=8, + n_points=4): + super().__init__() + + # self attention + self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = 
nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward(self, + src, + pos, + reference_points, + spatial_shapes, + level_start_index, + padding_mask=None): + # self attention + src2 = self.self_attn( + self.with_pos_embed(src, pos), reference_points, src, + spatial_shapes, level_start_index, padding_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class MSDeformAttnTransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace( + 0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 0] * W_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward(self, + src, + spatial_shapes, + level_start_index, + valid_ratios, + pos=None, + padding_mask=None): + output = src + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios, device=src.device) + for _, layer in enumerate(self.layers): + output = layer(output, pos, reference_points, spatial_shapes, + level_start_index, padding_mask) + + return output + + +class MaskDINOEncoder(nn.Module): + """ + This is the multi-scale encoder in detection models, also named as pixel decoder in segmentation models. + """ + + def __init__( + self, + input_shape: Dict[str, Any], + *, + transformer_dropout: float, + transformer_nheads: int, + transformer_dim_feedforward: int, + transformer_enc_layers: int, + conv_dim: int, + mask_dim: int, + # deformable transformer encoder args + transformer_in_features: List[str], + common_stride: int, + num_feature_levels: int, + total_num_feature_levels: int, + feature_order: str, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + transformer_dropout: dropout probability in transformer + transformer_nheads: number of heads in transformer + transformer_dim_feedforward: dimension of feedforward network + transformer_enc_layers: number of transformer encoder layers + conv_dim: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. + num_feature_levels: feature scales used + total_num_feature_levels: total feautre scales used (include the downsampled features) + feature_order: 'low2high' or 'high2low', i.e., 'low2high' means low-resolution features + are put in the first. 
+ """ + super().__init__() + transformer_input_shape = { + k: v + for k, v in input_shape.items() if k in transformer_in_features + } + # this is the input shape of pixel decoder + input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride']) + self.in_features = [k for k, v in input_shape + ] # starting from "res2" to "res5" + self.feature_strides = [v['stride'] for k, v in input_shape] + self.feature_channels = [v['channels'] for k, v in input_shape] + self.feature_order = feature_order + + if feature_order == 'low2high': + transformer_input_shape = sorted( + transformer_input_shape.items(), key=lambda x: -x[1]['stride']) + else: + transformer_input_shape = sorted( + transformer_input_shape.items(), key=lambda x: x[1]['stride']) + self.transformer_in_features = [k for k, v in transformer_input_shape + ] # starting from "res2" to "res5" + transformer_in_channels = [ + v['channels'] for k, v in transformer_input_shape + ] + self.transformer_feature_strides = [ + v['stride'] for k, v in transformer_input_shape + ] # to decide extra FPN layers + + self.maskdino_num_feature_levels = num_feature_levels # always use 3 scales + self.total_num_feature_levels = total_num_feature_levels + self.common_stride = common_stride + + self.transformer_num_feature_levels = len(self.transformer_in_features) + self.low_resolution_index = transformer_in_channels.index( + max(transformer_in_channels)) + self.high_resolution_index = 0 if self.feature_order == 'low2high' else -1 + if self.transformer_num_feature_levels > 1: + input_proj_list = [] + for in_channels in transformer_in_channels[::-1]: + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + )) + # input projectino for downsample + in_channels = max(transformer_in_channels) + for _ in range( + self.total_num_feature_levels + - self.transformer_num_feature_levels): # exclude the res2 + input_proj_list.append( + nn.Sequential( + nn.Conv2d( + in_channels, + conv_dim, + kernel_size=3, + stride=2, + padding=1), + nn.GroupNorm(32, conv_dim), + )) + in_channels = conv_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList([ + nn.Sequential( + nn.Conv2d( + transformer_in_channels[-1], conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + ) + ]) + + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + self.transformer = MSDeformAttnTransformerEncoderOnly( + d_model=conv_dim, + dropout=transformer_dropout, + nhead=transformer_nheads, + dim_feedforward=transformer_dim_feedforward, + num_encoder_layers=transformer_enc_layers, + num_feature_levels=self.total_num_feature_levels, + ) + N_steps = conv_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + self.mask_dim = mask_dim + # use 1x1 conv instead + self.mask_features = Conv2d( + conv_dim, + mask_dim, + kernel_size=1, + stride=1, + padding=0, + ) + self.c2_xavier_fill(self.mask_features) + # extra fpn levels + stride = min(self.transformer_feature_strides) + self.num_fpn_levels = max( + int(np.log2(stride) - np.log2(self.common_stride)), 1) + + lateral_convs = [] + output_convs = [] + + use_bias = False + for idx, in_channels in enumerate( + self.feature_channels[:self.num_fpn_levels]): + lateral_norm = nn.GroupNorm(32, conv_dim) + output_norm = nn.GroupNorm(32, conv_dim) + + lateral_conv = Conv2d( + in_channels, + conv_dim, + kernel_size=1, + bias=use_bias, + norm=lateral_norm) + output_conv = 
Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + self.c2_xavier_fill(lateral_conv) + self.c2_xavier_fill(output_conv) + self.add_module('adapter_{}'.format(idx + 1), lateral_conv) + self.add_module('layer_{}'.format(idx + 1), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. + self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + + def c2_xavier_fill(self, module: nn.Module) -> None: + kaiming_uniform_(module.weight, a=1) + if module.bias is not None: + constant_(module.bias, 0) + + @autocast(enabled=False) + def forward_features(self, features, masks): + """ + Args: + features: multi-scale features from the backbone + masks: image mask + Returns: + enhanced multi-scale features and mask feature (1/4 resolution) for the decoder to produce binary mask + """ + # backbone features + srcs = [] + pos = [] + # additional downsampled features + srcsl = [] + posl = [] + if self.total_num_feature_levels > self.transformer_num_feature_levels: + smallest_feat = features[self.transformer_in_features[ + self.low_resolution_index]].float() + _len_srcs = self.transformer_num_feature_levels + for i in range(_len_srcs, self.total_num_feature_levels): + if i == _len_srcs: + src = self.input_proj[i](smallest_feat) + else: + src = self.input_proj[i](srcsl[-1]) + srcsl.append(src) + posl.append(self.pe_layer(src)) + srcsl = srcsl[::-1] + # Reverse feature maps + for idx, f in enumerate(self.transformer_in_features[::-1]): + x = features[f].float( + ) # deformable detr does not support half precision + srcs.append(self.input_proj[idx](x)) + pos.append(self.pe_layer(x)) + srcs.extend( + srcsl) if self.feature_order == 'low2high' else srcsl.extend(srcs) + pos.extend(posl) if self.feature_order == 'low2high' else posl.extend( + pos) + if self.feature_order != 'low2high': + srcs = srcsl + pos = posl + y, spatial_shapes, level_start_index = self.transformer( + srcs, masks, pos) + bs = y.shape[0] + + split_size_or_sections = [None] * self.total_num_feature_levels + for i in range(self.total_num_feature_levels): + if i < self.total_num_feature_levels - 1: + split_size_or_sections[i] = level_start_index[ + i + 1] - level_start_index[i] + else: + split_size_or_sections[i] = y.shape[1] - level_start_index[i] + y = torch.split(y, split_size_or_sections, dim=1) + + out = [] + multi_scale_features = [] + num_cur_levels = 0 + for i, z in enumerate(y): + out.append( + z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], + spatial_shapes[i][1])) + + # append `out` with extra FPN levels + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): + x = features[f].float() + lateral_conv = self.lateral_convs[idx] + output_conv = self.output_convs[idx] + cur_fpn = lateral_conv(x) + # Following FPN implementation, we use nearest upsampling here + y = cur_fpn + F.interpolate( + out[self.high_resolution_index], + size=cur_fpn.shape[-2:], + mode='bilinear', + align_corners=False) + y = output_conv(y) + out.append(y) + for o in out: + if num_cur_levels < self.total_num_feature_levels: + multi_scale_features.append(o) + num_cur_levels += 1 + return self.mask_features(out[-1]), out[0], multi_scale_features diff --git 
a/modelscope/models/cv/image_instance_segmentation/maskdino/ms_deform_attn.py b/modelscope/models/cv/image_instance_segmentation/maskdino/ms_deform_attn.py new file mode 100644 index 00000000..4bcdb998 --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino/ms_deform_attn.py @@ -0,0 +1,160 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +# Copyright (c) Alibaba, Inc. and its affiliates. + +from __future__ import absolute_import, division, print_function +import math +import warnings + +import torch +import torch.nn.functional as F +from mmcv.ops.multi_scale_deform_attn import ( + MultiScaleDeformableAttnFunction, multi_scale_deformable_attn_pytorch) +from torch import nn +from torch.nn.init import constant_, xavier_uniform_ + + +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +class MSDeformAttn(nn.Module): + + def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): + """Multi-Scale Deformable Attention Module + + Args: + d_model: hidden dimension + n_levels: number of feature levels + n_heads: number of attention heads + n_points: number of sampling points per attention head per feature level + """ + super().__init__() + if d_model % n_heads != 0: + raise ValueError( + 'd_model must be divisible by n_heads, but got {} and {}'. + format(d_model, n_heads)) + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation + if not _is_power_of_2(_d_per_head): + warnings.warn( + "You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = 128 + + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(d_model, + n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, + n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, d_model) + self.output_proj = nn.Linear(d_model, d_model) + + self._reset_parameters() + + def _reset_parameters(self): + constant_(self.sampling_offsets.weight.data, 0.) + thetas = torch.arange( + self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init + / grid_init.abs().max(-1, keepdim=True)[0]).view( + self.n_heads, 1, 1, 2).repeat(1, self.n_levels, + self.n_points, 1) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.) + constant_(self.attention_weights.bias.data, 0.) 
+ xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.) + + def forward(self, + query, + reference_points, + input_flatten, + input_spatial_shapes, + input_level_start_index, + input_padding_mask=None): + """ + Args: + query: (N, Length_{query}, C) + reference_points: (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area or (N, Length_{query}, n_levels, 4), + add additional (w, h) to form reference boxes + input_flatten: (N, H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}, C) + input_spatial_shapes: (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + input_level_start_index: (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, + H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + input_padding_mask: (N, H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}), True for padding elements, + False for non-padding elements + + Returns: + output: (N, Length_{query}, C) + """ + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] + * input_spatial_shapes[:, 1]).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = value.masked_fill(input_padding_mask[..., None], float(0)) + value = value.view(N, Len_in, self.n_heads, + self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(query).view( + N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) + attention_weights = self.attention_weights(query).view( + N, Len_q, self.n_heads, self.n_levels * self.n_points) + attention_weights = F.softmax(attention_weights, + -1).view(N, Len_q, self.n_heads, + self.n_levels, self.n_points) + # N, Len_q, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], + -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + sampling_offsets + / offset_normalizer[None, None, None, :, None, :]) + elif reference_points.shape[-1] == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points + * reference_points[:, :, None, :, None, 2:] * 0.5) + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.' + .format(reference_points.shape[-1])) + try: + output = MultiScaleDeformableAttnFunction.apply( + value, input_spatial_shapes, input_level_start_index, + sampling_locations, attention_weights, self.im2col_step) + except Exception: + # CPU + output = multi_scale_deformable_attn_pytorch( + value, input_spatial_shapes, sampling_locations, + attention_weights) + output = self.output_proj(output) + return output diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino/position_encoding.py b/modelscope/models/cv/image_instance_segmentation/maskdino/position_encoding.py new file mode 100644 index 00000000..923f68ec --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino/position_encoding.py @@ -0,0 +1,71 @@ +# The implementation is adopted from Mask DINO, made publicly available under the Apache License, +# Version 2.0 at https://github.com/IDEA-Research/MaskDINO +# Part of implementation is borrowed from Mask2Former, +# https://github.com/facebookresearch/Mask2Former, under MIT license. 
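The shape contract spelled out in the MSDeformAttn.forward docstring is easiest to check with dummy inputs. A rough usage sketch under assumed sizes (two feature levels, 300 queries); on CPU the try/except above falls back to mmcv's pure-PyTorch kernel, so no CUDA extension is needed for this smoke test:

import torch
# MSDeformAttn is the module defined in ms_deform_attn.py above

d_model, n_levels, n_heads, n_points = 256, 2, 8, 4
attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)

shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long)   # (n_levels, 2)
level_start_index = torch.cat((shapes.new_zeros(1), shapes.prod(1).cumsum(0)[:-1]))
len_in = int(shapes.prod(1).sum())                                 # 32*32 + 16*16 = 1280

query = torch.randn(1, 300, d_model)                 # (N, Len_q, C)
input_flatten = torch.randn(1, len_in, d_model)      # (N, sum(H_l * W_l), C)
reference_points = torch.rand(1, 300, n_levels, 2)   # normalized (x, y) per level

out = attn(query, reference_points, input_flatten, shapes, level_start_index)
print(out.shape)  # torch.Size([1, 300, 256])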
+ +import math + +import torch +from torch import nn + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__(self, + num_pos_feats=64, + temperature=10000, + normalize=False, + scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError('normalize should be True if scale is passed') + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask=None): + if mask is None: + mask = torch.zeros((x.size(0), x.size(2), x.size(3)), + device=x.device, + dtype=torch.bool) + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange( + self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self, _repr_indent=4): + head = 'Positional encoding ' + self.__class__.__name__ + body = [ + 'num_pos_feats: {}'.format(self.num_pos_feats), + 'temperature: {}'.format(self.temperature), + 'normalize: {}'.format(self.normalize), + 'scale: {}'.format(self.scale), + ] + # _repr_indent = 4 + lines = [head] + [' ' * _repr_indent + line for line in body] + return '\n'.join(lines) diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino/utils.py b/modelscope/models/cv/image_instance_segmentation/maskdino/utils.py new file mode 100644 index 00000000..bc674d0b --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino/utils.py @@ -0,0 +1,184 @@ +# Part of the implementation is borrowed and modified from Mask DINO, publicly available at +# https://github.com/IDEA-Research/MaskDINO + +import copy +import math + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + + +class Conv2d(torch.nn.Conv2d): + + def __init__(self, *args, **kwargs): + norm = kwargs.pop('norm', None) + activation = kwargs.pop('activation', None) + super().__init__(*args, **kwargs) + + self.norm = norm + self.activation = activation + + def forward(self, x): + x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def 
get_bounding_boxes(mask_tensor): + boxes = torch.zeros( + mask_tensor.shape[0], + 4, + dtype=torch.float32, + device=mask_tensor.device) + x_any = torch.any(mask_tensor, dim=1) + y_any = torch.any(mask_tensor, dim=2) + for idx in range(mask_tensor.shape[0]): + x = torch.where(x_any[idx, :])[0] + y = torch.where(y_any[idx, :])[0] + if len(x) > 0 and len(y) > 0: + boxes[idx, :] = torch.as_tensor([x[0], y[0], x[-1] + 1, y[-1] + 1], + dtype=torch.float32, + device=mask_tensor.device) + return boxes + + +def box_xyxy_to_cxcywh(x): + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +def gen_encoder_output_proposals(memory: Tensor, memory_padding_mask: Tensor, + spatial_shapes: Tensor): + """ + Args: + memory: bs, h_0*w_0+...+h_{L-1}*w_{L-1}, d_model + memory_padding_mask: bs, h_0*w_0+...+h_{L-1}*w_{L-1} + spatial_shapes: nlevel, 2 + Returns: + output_memory: bs, h_0*w_0+...+h_{L-1}*w_{L-1}, d_model + output_proposals: bs, h_0*w_0+...+h_{L-1}*w_{L-1}, 4 + """ + N_, S_, C_ = memory.shape + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].view( + N_, H_, W_, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace( + 0, H_ - 1, H_, dtype=torch.float32, device=memory.device), + torch.linspace( + 0, W_ - 1, W_, dtype=torch.float32, device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_W.unsqueeze(-1), + valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) + proposals.append(proposal) + _cur += (H_ * W_) + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = (output_proposals > 0.01) & ( + output_proposals < 0.99) + output_proposals_valid = output_proposals_valid.all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill( + memory_padding_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, + float('inf')) + + output_memory = memory + output_memory = output_memory.masked_fill( + memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, + float(0)) + return output_memory, output_proposals + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000**(2 * (dim_t // 2) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), + dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), + dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = 
w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), + dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), + dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError('Unknown pos_tensor shape(-1):{}'.format( + pos_tensor.size(-1))) + return pos + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == 'relu': + return F.relu + if activation == 'gelu': + return F.gelu + if activation == 'glu': + return F.glu + if activation == 'prelu': + return nn.PReLU() + if activation == 'selu': + return F.selu + raise RuntimeError(F'activation should be relu/gelu, not {activation}.') + + +def _get_clones(module, N, layer_share=False): + if layer_share: + return nn.ModuleList([module for i in range(N)]) + else: + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino_model.py b/modelscope/models/cv/image_instance_segmentation/maskdino_model.py new file mode 100644 index 00000000..5cdf48c1 --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino_model.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.image_instance_segmentation import MaskDINOSwin +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.image_segmentation, module_name=Models.maskdino_swin) +class MaskDINOSwinModel(TorchModel): + + def __init__(self, model_dir=None, *args, **kwargs): + """ + Args: + model_dir (str): model directory. + """ + super(MaskDINOSwinModel, self).__init__( + model_dir=model_dir, *args, **kwargs) + + if 'backbone' not in kwargs: + config_path = os.path.join(model_dir, ModelFile.CONFIGURATION) + cfg = Config.from_file(config_path) + model_cfg = cfg.model + kwargs.update(model_cfg) + + self.model = MaskDINOSwin(model_dir=model_dir, **kwargs) + + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + self.model.to(self.device) + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + output = self.model(**input) + return output + + def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py b/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py new file mode 100644 index 00000000..5b60eb40 --- /dev/null +++ b/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py @@ -0,0 +1,282 @@ +# Part of the implementation is borrowed and modified from Mask DINO, publicly available at +# https://github.com/IDEA-Research/MaskDINO +# Part of implementation is borrowed and modified from Mask2Former, publicly available at +# https://github.com/facebookresearch/Mask2Former. 
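The helpers above are exactly what the decoder's 'bitmask' box-initialization path composes: predicted masks are turned into pixel xyxy boxes, normalized to cxcywh in [0, 1], and mapped through inverse_sigmoid so they can serve as unsigmoided reference points. A toy walk-through (the import path follows the new module layout added in this patch):

import torch
from modelscope.models.cv.image_instance_segmentation.maskdino.utils import (
    box_xyxy_to_cxcywh, get_bounding_boxes, inverse_sigmoid)

h, w = 32, 48
masks = torch.zeros(2, h, w, dtype=torch.bool)
masks[0, 4:10, 8:20] = True       # blob covering rows 4-9, cols 8-19
masks[1, 16:30, 2:40] = True

boxes_xyxy = get_bounding_boxes(masks)                          # (2, 4), pixel xyxy
boxes_cxcywh = box_xyxy_to_cxcywh(boxes_xyxy) / torch.as_tensor(
    [w, h, w, h], dtype=torch.float)                            # normalized to [0, 1]
refpoints_unsig = inverse_sigmoid(boxes_cxcywh)                 # logits, as the decoder expects

print(boxes_xyxy[0])    # tensor([ 8.,  4., 20., 10.])
print(boxes_cxcywh[0])  # tensor([0.2917, 0.2188, 0.2500, 0.1875])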
+ +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.image_instance_segmentation.backbones import \ + D2SwinTransformer +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from .maskdino.maskdino_decoder import MaskDINODecoder +from .maskdino.maskdino_encoder import MaskDINOEncoder + +logger = get_logger() + + +class MaskDINOSwin(nn.Module): + + def __init__(self, backbone, encoder, decoder, pretrained=None, **kwargs): + """ + Mask DINO: Towards A Unified Transformer-based Framework for Object + Detection and Segmentation. See https://arxiv.org/abs/2206.02777 + Args: + backbone (dict): backbone config. + encoder (dict): encoder config. + decoder (dict): decoder config. + pretrained (bool): whether to use pretrained model + """ + super(MaskDINOSwin, self).__init__() + self.register_buffer( + 'pixel_mean', + torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False) + self.register_buffer( + 'pixel_std', + torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False) + self.size_divisibility = 32 + + self.backbone = D2SwinTransformer(**backbone) + input_shape = { + k: v + for k, v in self.backbone.output_shape().items() + if k in encoder['transformer_in_features'] + } + encoder = MaskDINOEncoder(input_shape=input_shape, **encoder) + decoder = MaskDINODecoder(**decoder) + self.sem_seg_head = MaskDINOHead( + pixel_decoder=encoder, transformer_predictor=decoder) + self.num_classes = decoder.num_classes + self.num_queries = decoder.num_queries + self.test_topk_per_image = 100 + + self.classes = kwargs.pop('classes', None) + + if pretrained: + assert 'model_dir' in kwargs, 'pretrained model dir is missing.' + model_path = os.path.join(kwargs['model_dir'], + ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {model_path}') + weight = torch.load(model_path)['model'] + tgt_weight = self.state_dict() + for name in list(weight.keys()): + if name in tgt_weight: + load_size = weight[name].size() + tgt_size = tgt_weight[name].size() + mis_match = False + if len(load_size) != len(tgt_size): + mis_match = True + else: + for n1, n2 in zip(load_size, tgt_size): + if n1 != n2: + mis_match = True + break + if mis_match: + logger.info(f'size mismatch for {name}, skip loading.') + del weight[name] + else: + logger.info( + f'{name} doesn\'t exist in current model, skip loading.' + ) + + self.load_state_dict(weight, strict=False) + logger.info('load model done') + + @property + def device(self): + return self.pixel_mean.device + + def forward(self, batched_inputs, **kwargs): + + images = [x['image'].to(self.device) for x in batched_inputs] + images = [(255. 
* x - self.pixel_mean) / self.pixel_std + for x in images] + images = ImageList.from_tensors(images, self.size_divisibility) + + features = self.backbone(images.tensor) + + if self.training: + raise NotImplementedError + else: + outputs, _ = self.sem_seg_head(features) + mask_cls_results = outputs['pred_logits'] + mask_pred_results = outputs['pred_masks'] + mask_box_results = outputs['pred_boxes'] + # upsample masks + mask_pred_results = F.interpolate( + mask_pred_results, + size=(images.tensor.shape[-2], images.tensor.shape[-1]), + mode='bilinear', + align_corners=False, + ) + + del outputs + + processed_results = [] + for mask_cls_result, mask_pred_result, mask_box_result, input_per_image, image_size in zip( + mask_cls_results, mask_pred_results, mask_box_results, + batched_inputs, images.image_sizes + ): # image_size is augmented size, not divisible to 32 + height = input_per_image.get('height', + image_size[0]) # real size + width = input_per_image.get('width', image_size[1]) + processed_results.append({}) + new_size = mask_pred_result.shape[ + -2:] # padded size (divisible to 32) + + # post process + mask_pred_result = mask_pred_result[:, :image_size[0], : + image_size[1]].expand( + 1, -1, -1, -1) + mask_pred_result = F.interpolate( + mask_pred_result, + size=(height, width), + mode='bilinear', + align_corners=False)[0] + + mask_cls_result = mask_cls_result.to(mask_pred_result) + mask_box_result = mask_box_result.to(mask_pred_result) + height = new_size[0] / image_size[0] * height + width = new_size[1] / image_size[1] * width + mask_box_result = self.box_postprocess(mask_box_result, height, + width) + + instance_r = self.instance_inference(mask_cls_result, + mask_pred_result, + mask_box_result) + processed_results[-1]['instances'] = instance_r + + return dict(eval_result=processed_results) + + def instance_inference(self, mask_cls, mask_pred, mask_box_result): + # mask_pred is already processed to have the same shape as original input + image_size = mask_pred.shape[-2:] + scores = mask_cls.sigmoid() # [100, 80] + labels = torch.arange( + self.num_classes, + device=self.device).unsqueeze(0).repeat(self.num_queries, + 1).flatten(0, 1) + scores_per_image, topk_indices = scores.flatten(0, 1).topk( + self.test_topk_per_image, sorted=False) # select 100 + labels_per_image = labels[topk_indices] + topk_indices = topk_indices // self.num_classes + mask_pred = mask_pred[topk_indices] + # if this is panoptic segmentation, we only keep the "thing" classes + + result = {'image_size': image_size} + # mask (before sigmoid) + result['pred_masks'] = (mask_pred > 0).float() + # half mask box half pred box + mask_box_result = mask_box_result[topk_indices] + result['pred_boxes'] = mask_box_result + + # calculate average mask prob + mask_scores_per_image = (mask_pred.sigmoid().flatten(1) + * result['pred_masks'].flatten(1)).sum(1) / ( + result['pred_masks'].flatten(1).sum(1) + + 1e-6) + result['scores'] = scores_per_image * mask_scores_per_image + result['pred_classes'] = labels_per_image + return result + + def box_postprocess(self, out_bbox, img_h, img_w): + # postprocess box height and width + x_c, y_c, w, h = out_bbox.unbind(-1) + boxes = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), + (y_c + 0.5 * h)] + boxes = torch.stack(boxes, dim=-1) + scale_fct = torch.tensor([img_w, img_h, img_w, img_h]) + scale_fct = scale_fct.to(out_bbox) + boxes = boxes * scale_fct + return boxes + + +class MaskDINOHead(nn.Module): + + def __init__( + self, + pixel_decoder: nn.Module, + transformer_predictor: 
nn.Module, + ): + super().__init__() + self.pixel_decoder = pixel_decoder + self.predictor = transformer_predictor + + def forward(self, features, mask=None, targets=None): + return self.layers(features, mask, targets=targets) + + def layers(self, features, mask=None, targets=None): + mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features( + features, mask) + predictions = self.predictor( + multi_scale_features, mask_features, mask, targets=targets) + return predictions + + +class ImageList(object): + + def __init__(self, tensor, image_sizes): + self.tensor = tensor + self.image_sizes = image_sizes + + def __len__(self): + return len(self.image_sizes) + + def __getitem__(self, idx): + size = self.image_sizes[idx] + return self.tensor[idx, ..., :size[0], :size[1]] + + @torch.jit.unused + def to(self, *args, **kwargs): + cast_tensor = self.tensor.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + @property + def device(self): + return self.tensor.device + + @staticmethod + def from_tensors(tensors, size_divisibility=0, pad_value=0.0): + assert len(tensors) > 0 + assert isinstance(tensors, (tuple, list)) + for t in tensors: + assert isinstance(t, torch.Tensor), type(t) + assert t.shape[:-2] == tensors[0].shape[:-2], t.shape + + image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors] + image_sizes_tensor = [torch.as_tensor(x) for x in image_sizes] + max_size = torch.stack(image_sizes_tensor).max(0).values + + if size_divisibility > 1: + stride = size_divisibility + # the last two dims are H,W, both subject to divisibility requirement + max_size = (max_size + (stride - 1)) // stride * stride + + # handle weirdness of scripting and tracing ... + if torch.jit.is_scripting(): + max_size = max_size.to(dtype=torch.long).tolist() + else: + if torch.jit.is_tracing(): + image_sizes = image_sizes_tensor + + if len(tensors) == 1: + image_size = image_sizes[0] + padding_size = [ + 0, max_size[-1] - image_size[1], 0, + max_size[-2] - image_size[0] + ] + batched_imgs = F.pad( + tensors[0], padding_size, value=pad_value).unsqueeze_(0) + else: + # max_size can be a tensor in tracing mode, therefore convert to list + batch_shape = [len(tensors)] + list( + tensors[0].shape[:-2]) + list(max_size) + batched_imgs = tensors[0].new_full(batch_shape, pad_value) + for img, pad_img in zip(tensors, batched_imgs): + pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img) + + return ImageList(batched_imgs.contiguous(), image_sizes) diff --git a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py index 6058cd73..fdbb2fb0 100644 --- a/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py +++ b/modelscope/models/cv/image_instance_segmentation/postprocess_utils.py @@ -201,3 +201,42 @@ def show_result( img[idx[0], idx[1], :] += alpha * random_color cv2.imwrite(out_file, img) + + +def get_maskdino_ins_seg_result(maskdino_seg_result, + class_names, + score_thr=0.3): + scores = maskdino_seg_result['scores'].detach().cpu().numpy() + pred_masks = maskdino_seg_result['pred_masks'].detach().cpu().numpy() + pred_boxes = maskdino_seg_result['pred_boxes'].detach().cpu().numpy() + pred_classes = maskdino_seg_result['pred_classes'].detach().cpu().numpy() + + thresholded_idxs = np.array(scores) >= score_thr + scores = scores[thresholded_idxs] + pred_classes = pred_classes[thresholded_idxs] + pred_masks = pred_masks[thresholded_idxs] + pred_boxes = 
pred_boxes[thresholded_idxs] + + results_dict = { + OutputKeys.BOXES: [], + OutputKeys.MASKS: [], + OutputKeys.LABELS: [], + OutputKeys.SCORES: [] + } + for score, cls, mask, box in zip(scores, pred_classes, pred_masks, + pred_boxes): + score = np.float64(score) + label = class_names[int(cls)] + mask = np.array(mask, dtype=np.float64) + box = [ + np.int64(box[0]), + np.int64(box[1]), + np.int64(box[2]), + np.int64(box[3]) + ] + results_dict[OutputKeys.SCORES].append(score) + results_dict[OutputKeys.LABELS].append(label) + results_dict[OutputKeys.MASKS].append(mask) + results_dict[OutputKeys.BOXES].append(box) + + return results_dict diff --git a/modelscope/models/cv/image_matching/__init__.py b/modelscope/models/cv/image_matching/__init__.py new file mode 100644 index 00000000..919c249c --- /dev/null +++ b/modelscope/models/cv/image_matching/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .quadtree_attention_model import QuadTreeAttentionForImageMatching + +else: + _import_structure = { + 'quadtree_attention_model': ['QuadTreeAttentionForImageMatching'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_matching/config/__init__.py b/modelscope/models/cv/image_matching/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_matching/config/default.py b/modelscope/models/cv/image_matching/config/default.py new file mode 100644 index 00000000..129d1559 --- /dev/null +++ b/modelscope/models/cv/image_matching/config/default.py @@ -0,0 +1,173 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +from yacs.config import CfgNode as CN + +_CN = CN() + +_CN.LOFTR = CN() +_CN.LOFTR.BACKBONE_TYPE = 'ResNetFPN' +_CN.LOFTR.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)] +_CN.LOFTR.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd +_CN.LOFTR.FINE_CONCAT_COARSE_FEAT = True + +# 1. LoFTR-backbone (local feature CNN) config +_CN.LOFTR.RESNETFPN = CN() +_CN.LOFTR.RESNETFPN.INITIAL_DIM = 128 +_CN.LOFTR.RESNETFPN.BLOCK_DIMS = [128, 196, 256] # s1, s2, s3 + +# 2. LoFTR-coarse module config +_CN.LOFTR.COARSE = CN() +_CN.LOFTR.COARSE.D_MODEL = 256 +_CN.LOFTR.COARSE.D_FFN = 256 +_CN.LOFTR.COARSE.NHEAD = 8 +_CN.LOFTR.COARSE.LAYER_NAMES = ['self', 'cross'] * 4 +_CN.LOFTR.COARSE.ATTENTION = 'linear' # options: ['linear', 'full'] +_CN.LOFTR.COARSE.TEMP_BUG_FIX = True +_CN.LOFTR.COARSE.BLOCK_TYPE = 'quadtree' +_CN.LOFTR.COARSE.ATTN_TYPE = 'B' +_CN.LOFTR.COARSE.TOPKS = [16, 8, 8] + +# 3. Coarse-Matching config +_CN.LOFTR.MATCH_COARSE = CN() +_CN.LOFTR.MATCH_COARSE.THR = 0.2 +_CN.LOFTR.MATCH_COARSE.BORDER_RM = 2 +_CN.LOFTR.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn'] +_CN.LOFTR.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1 +_CN.LOFTR.MATCH_COARSE.SKH_ITERS = 3 +_CN.LOFTR.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0 +_CN.LOFTR.MATCH_COARSE.SKH_PREFILTER = False +_CN.LOFTR.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.3 # training tricks: save GPU memory +_CN.LOFTR.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200 # training tricks: avoid DDP deadlock +_CN.LOFTR.MATCH_COARSE.SPARSE_SPVS = False + +# 4. 
LoFTR-fine module config +_CN.LOFTR.FINE = CN() +_CN.LOFTR.FINE.D_MODEL = 128 +_CN.LOFTR.FINE.D_FFN = 128 +_CN.LOFTR.FINE.NHEAD = 8 +_CN.LOFTR.FINE.LAYER_NAMES = ['self', 'cross'] * 1 +_CN.LOFTR.FINE.ATTENTION = 'linear' +_CN.LOFTR.FINE.BLOCK_TYPE = 'loftr' + +# 5. LoFTR Losses +# -- # coarse-level +_CN.LOFTR.LOSS = CN() +_CN.LOFTR.LOSS.COARSE_TYPE = 'focal' # ['focal', 'cross_entropy'] +_CN.LOFTR.LOSS.COARSE_WEIGHT = 1.0 +# -- - -- # focal loss (coarse) +_CN.LOFTR.LOSS.FOCAL_ALPHA = 0.25 +_CN.LOFTR.LOSS.FOCAL_GAMMA = 2.0 +_CN.LOFTR.LOSS.POS_WEIGHT = 1.0 +_CN.LOFTR.LOSS.NEG_WEIGHT = 1.0 + +# -- # fine-level +_CN.LOFTR.LOSS.FINE_TYPE = 'l2_with_std' # ['l2_with_std', 'l2'] +_CN.LOFTR.LOSS.FINE_WEIGHT = 1.0 +_CN.LOFTR.LOSS.FINE_CORRECT_THR = 1.0 + +_CN.DATASET = CN() +# 1. data config +# training and validating +_CN.DATASET.TRAINVAL_DATA_SOURCE = None # options: ['ScanNet', 'MegaDepth'] +_CN.DATASET.TRAIN_DATA_ROOT = None +_CN.DATASET.TRAIN_POSE_ROOT = None # (optional directory for poses) +_CN.DATASET.TRAIN_NPZ_ROOT = None +_CN.DATASET.TRAIN_LIST_PATH = None +_CN.DATASET.TRAIN_INTRINSIC_PATH = None +_CN.DATASET.VAL_DATA_ROOT = None +_CN.DATASET.VAL_POSE_ROOT = None # (optional directory for poses) +_CN.DATASET.VAL_NPZ_ROOT = None +_CN.DATASET.VAL_LIST_PATH = None # None if val data from all scenes are bundled into a single npz file +_CN.DATASET.VAL_INTRINSIC_PATH = None +# testing +_CN.DATASET.TEST_DATA_SOURCE = None +_CN.DATASET.TEST_DATA_ROOT = None +_CN.DATASET.TEST_POSE_ROOT = None # (optional directory for poses) +_CN.DATASET.TEST_NPZ_ROOT = None +_CN.DATASET.TEST_LIST_PATH = None # None if test data from all scenes are bundled into a single npz file +_CN.DATASET.TEST_INTRINSIC_PATH = None + +# 2. dataset config +# general options +_CN.DATASET.MIN_OVERLAP_SCORE_TRAIN = 0.4 # discard data with overlap_score < min_overlap_score +_CN.DATASET.MIN_OVERLAP_SCORE_TEST = 0.0 +_CN.DATASET.AUGMENTATION_TYPE = None # options: [None, 'dark', 'mobile'] + +# MegaDepth options +_CN.DATASET.MGDPT_IMG_RESIZE = 640 # resize the longer side, zero-pad bottom-right to square. +_CN.DATASET.MGDPT_IMG_PAD = True # pad img to square with size = MGDPT_IMG_RESIZE +_CN.DATASET.MGDPT_DEPTH_PAD = True # pad depthmap to square with size = 2000 +_CN.DATASET.MGDPT_DF = 8 + +_CN.TRAINER = CN() +_CN.TRAINER.WORLD_SIZE = 1 +_CN.TRAINER.CANONICAL_BS = 64 +_CN.TRAINER.CANONICAL_LR = 8e-3 +_CN.TRAINER.SCALING = None # this will be calculated automatically +_CN.TRAINER.FIND_LR = False # use learning rate finder from pytorch-lightning + +# optimizer +_CN.TRAINER.OPTIMIZER = 'adamw' # [adam, adamw] +_CN.TRAINER.TRUE_LR = None # this will be calculated automatically at runtime +_CN.TRAINER.ADAM_DECAY = 0. 
# ADAM: for adam +_CN.TRAINER.ADAMW_DECAY = 0.1 + +# step-based warm-up +_CN.TRAINER.WARMUP_TYPE = 'linear' # [linear, constant] +_CN.TRAINER.WARMUP_RATIO = 0.1 +_CN.TRAINER.WARMUP_STEP = 1875 + +# learning rate scheduler +_CN.TRAINER.SCHEDULER = 'MultiStepLR' # [MultiStepLR, CosineAnnealing, ExponentialLR] +_CN.TRAINER.SCHEDULER_INTERVAL = 'epoch' # [epoch, step] +_CN.TRAINER.MSLR_MILESTONES = [8, 12, 16, 20, 24] # MSLR: MultiStepLR +_CN.TRAINER.MSLR_GAMMA = 0.5 +_CN.TRAINER.COSA_TMAX = 30 # COSA: CosineAnnealing +_CN.TRAINER.ELR_GAMMA = 0.999992 # ELR: ExponentialLR, this value for 'step' interval + +# plotting related +_CN.TRAINER.ENABLE_PLOTTING = True +_CN.TRAINER.N_VAL_PAIRS_TO_PLOT = 32 # number of val/test paris for plotting +_CN.TRAINER.PLOT_MODE = 'evaluation' # ['evaluation', 'confidence'] +_CN.TRAINER.PLOT_MATCHES_ALPHA = 'dynamic' + +# geometric metrics and pose solver +_CN.TRAINER.EPI_ERR_THR = 5e-4 # recommendation: 5e-4 for ScanNet, 1e-4 for MegaDepth (from SuperGlue) +_CN.TRAINER.POSE_GEO_MODEL = 'E' # ['E', 'F', 'H'] +_CN.TRAINER.POSE_ESTIMATION_METHOD = 'RANSAC' # [RANSAC, DEGENSAC, MAGSAC] +_CN.TRAINER.RANSAC_PIXEL_THR = 0.5 +_CN.TRAINER.RANSAC_CONF = 0.99999 +_CN.TRAINER.RANSAC_MAX_ITERS = 10000 +_CN.TRAINER.USE_MAGSACPP = False + +# data sampler for train_dataloader +_CN.TRAINER.DATA_SAMPLER = 'scene_balance' # options: ['scene_balance', 'random', 'normal'] +# 'scene_balance' config +_CN.TRAINER.N_SAMPLES_PER_SUBSET = 200 +_CN.TRAINER.SB_SUBSET_SAMPLE_REPLACEMENT = True # whether sample each scene with replacement or not +_CN.TRAINER.SB_SUBSET_SHUFFLE = True # after sampling from scenes, whether shuffle within the epoch or not +_CN.TRAINER.SB_REPEAT = 1 # repeat N times for training the sampled data +# 'random' config +_CN.TRAINER.RDM_REPLACEMENT = True +_CN.TRAINER.RDM_NUM_SAMPLES = None + +# gradient clipping +_CN.TRAINER.GRADIENT_CLIPPING = 0.5 + +# reproducibility +# This seed affects the data sampling. With the same seed, the data sampling is promised +# to be the same. When resume training from a checkpoint, it's better to use a different +# seed, otherwise the sampled data will be exactly the same as before resuming, which will +# cause less unique data items sampled during the entire training. +# Use of different seed values might affect the final training result, since not all data items +# are used during training on ScanNet. (60M pairs of images sampled during traing from 230M pairs in total.) 
+_CN.TRAINER.SEED = 66 + + +def get_cfg_defaults(): + """Get a yacs CfgNode object with default values for my_project.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + return _CN.clone() diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/__init__.py b/modelscope/models/cv/image_matching/loftr_quadtree/__init__.py new file mode 100644 index 00000000..1841a5c6 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/__init__.py @@ -0,0 +1,5 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +from .loftr import LoFTR diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/backbone/__init__.py b/modelscope/models/cv/image_matching/loftr_quadtree/backbone/__init__.py new file mode 100644 index 00000000..48343c7a --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/backbone/__init__.py @@ -0,0 +1,16 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +from .resnet_fpn import ResNetFPN_8_2, ResNetFPN_16_4 + + +def build_backbone(config): + if config['backbone_type'] == 'ResNetFPN': + if config['resolution'] == (8, 2): + return ResNetFPN_8_2(config['resnetfpn']) + elif config['resolution'] == (16, 4): + return ResNetFPN_16_4(config['resnetfpn']) + else: + raise ValueError( + f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.") diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/backbone/resnet_fpn.py b/modelscope/models/cv/image_matching/loftr_quadtree/backbone/resnet_fpn.py new file mode 100644 index 00000000..b635559a --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/backbone/resnet_fpn.py @@ -0,0 +1,223 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +import torch.nn as nn +import torch.nn.functional as F + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution without padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + padding=0, + bias=False) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + + def __init__(self, in_planes, planes, stride=1): + super().__init__() + self.conv1 = conv3x3(in_planes, planes, stride) + self.conv2 = conv3x3(planes, planes) + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + + if stride == 1: + self.downsample = None + else: + self.downsample = nn.Sequential( + conv1x1(in_planes, planes, stride=stride), + nn.BatchNorm2d(planes)) + + def forward(self, x): + y = x + y = self.relu(self.bn1(self.conv1(y))) + y = self.bn2(self.conv2(y)) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class ResNetFPN_8_2(nn.Module): + """ + ResNet+FPN, output resolution are 1/8 and 1/2. + Each block has 2 layers. 
+ """ + + def __init__(self, config): + super().__init__() + # Config + block = BasicBlock + initial_dim = config['initial_dim'] + block_dims = config['block_dims'] + + # Class Variable + self.in_planes = initial_dim + + # Networks + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(initial_dim) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 + self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 + self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 + + # 3. FPN upsample + self.layer3_outconv = conv1x1(block_dims[2], block_dims[2]) + self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) + self.layer2_outconv2 = nn.Sequential( + conv3x3(block_dims[2], block_dims[2]), + nn.BatchNorm2d(block_dims[2]), + nn.LeakyReLU(), + conv3x3(block_dims[2], block_dims[1]), + ) + self.layer1_outconv = conv1x1(block_dims[0], block_dims[1]) + self.layer1_outconv2 = nn.Sequential( + conv3x3(block_dims[1], block_dims[1]), + nn.BatchNorm2d(block_dims[1]), + nn.LeakyReLU(), + conv3x3(block_dims[1], block_dims[0]), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, dim, stride=1): + layer1 = block(self.in_planes, dim, stride=stride) + layer2 = block(dim, dim, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + # ResNet Backbone + x0 = self.relu(self.bn1(self.conv1(x))) + x1 = self.layer1(x0) # 1/2 + x2 = self.layer2(x1) # 1/4 + x3 = self.layer3(x2) # 1/8 + + # FPN + x3_out = self.layer3_outconv(x3) + + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x2_out = self.layer2_outconv(x2) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) + + x2_out_2x = F.interpolate( + x2_out, scale_factor=2., mode='bilinear', align_corners=True) + x1_out = self.layer1_outconv(x1) + x1_out = self.layer1_outconv2(x1_out + x2_out_2x) + + return [x3_out, x1_out] + + +class ResNetFPN_16_4(nn.Module): + """ + ResNet+FPN, output resolution are 1/16 and 1/4. + Each block has 2 layers. + """ + + def __init__(self, config): + super().__init__() + # Config + block = BasicBlock + initial_dim = config['initial_dim'] + block_dims = config['block_dims'] + + # Class Variable + self.in_planes = initial_dim + + # Networks + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(initial_dim) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 + self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 + self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 + self.layer4 = self._make_layer(block, block_dims[3], stride=2) # 1/16 + + # 3. 
FPN upsample + self.layer4_outconv = conv1x1(block_dims[3], block_dims[3]) + self.layer3_outconv = conv1x1(block_dims[2], block_dims[3]) + self.layer3_outconv2 = nn.Sequential( + conv3x3(block_dims[3], block_dims[3]), + nn.BatchNorm2d(block_dims[3]), + nn.LeakyReLU(), + conv3x3(block_dims[3], block_dims[2]), + ) + + self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) + self.layer2_outconv2 = nn.Sequential( + conv3x3(block_dims[2], block_dims[2]), + nn.BatchNorm2d(block_dims[2]), + nn.LeakyReLU(), + conv3x3(block_dims[2], block_dims[1]), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, dim, stride=1): + layer1 = block(self.in_planes, dim, stride=stride) + layer2 = block(dim, dim, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + # ResNet Backbone + x0 = self.relu(self.bn1(self.conv1(x))) + x1 = self.layer1(x0) # 1/2 + x2 = self.layer2(x1) # 1/4 + x3 = self.layer3(x2) # 1/8 + x4 = self.layer4(x3) # 1/16 + + # FPN + x4_out = self.layer4_outconv(x4) + + x4_out_2x = F.interpolate( + x4_out, scale_factor=2., mode='bilinear', align_corners=True) + x3_out = self.layer3_outconv(x3) + x3_out = self.layer3_outconv2(x3_out + x4_out_2x) + + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x2_out = self.layer2_outconv(x2) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) + + return [x4_out, x2_out] diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/loftr.py b/modelscope/models/cv/image_matching/loftr_quadtree/loftr.py new file mode 100644 index 00000000..16dd6521 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/loftr.py @@ -0,0 +1,98 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +import torch +import torch.nn as nn +from einops.einops import rearrange + +from .backbone import build_backbone +from .loftr_module import FinePreprocess, LocalFeatureTransformer +from .utils.coarse_matching import CoarseMatching +from .utils.fine_matching import FineMatching +from .utils.position_encoding import PositionEncodingSine + + +class LoFTR(nn.Module): + + def __init__(self, config): + super().__init__() + # Misc + self.config = config + + # Modules + self.backbone = build_backbone(config) + self.pos_encoding = PositionEncodingSine( + config['coarse']['d_model'], + temp_bug_fix=config['coarse']['temp_bug_fix']) + self.loftr_coarse = LocalFeatureTransformer(config['coarse']) + self.coarse_matching = CoarseMatching(config['match_coarse']) + self.fine_preprocess = FinePreprocess(config) + self.loftr_fine = LocalFeatureTransformer(config['fine']) + self.fine_matching = FineMatching() + + def forward(self, data): + """ + Update: + data (dict): { + 'image0': (torch.Tensor): (N, 1, H, W) + 'image1': (torch.Tensor): (N, 1, H, W) + 'mask0'(optional) : (torch.Tensor): (N, H, W) '0' indicates a padded position + 'mask1'(optional) : (torch.Tensor): (N, H, W) + } + """ + # 1. 
Local Feature CNN + data.update({ + 'bs': data['image0'].size(0), + 'hw0_i': data['image0'].shape[2:], + 'hw1_i': data['image1'].shape[2:] + }) + + if data['hw0_i'] == data['hw1_i']: # faster & better BN convergence + feats_c, feats_f = self.backbone( + torch.cat([data['image0'], data['image1']], dim=0)) + (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split( + data['bs']), feats_f.split(data['bs']) + else: # handle different input shapes + (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone( + data['image0']), self.backbone(data['image1']) + + data.update({ + 'hw0_c': feat_c0.shape[2:], + 'hw1_c': feat_c1.shape[2:], + 'hw0_f': feat_f0.shape[2:], + 'hw1_f': feat_f1.shape[2:] + }) + + # 2. coarse-level loftr module + # add featmap with positional encoding, then flatten it to sequence [N, HW, C] + + feat_c0 = self.pos_encoding(feat_c0) + feat_c1 = self.pos_encoding(feat_c1) + + mask_c0 = mask_c1 = None # mask is useful in training + if 'mask0' in data: + mask_c0, mask_c1 = data['mask0'].flatten( + -2), data['mask1'].flatten(-2) + feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, + mask_c1) + + # 3. match coarse-level + self.coarse_matching( + feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1) + + # 4. fine-level refinement + feat_f0_unfold, feat_f1_unfold = self.fine_preprocess( + feat_f0, feat_f1, feat_c0, feat_c1, data) + if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted + feat_f0_unfold, feat_f1_unfold = self.loftr_fine( + feat_f0_unfold, feat_f1_unfold) + + # 5. match fine-level + self.fine_matching(feat_f0_unfold, feat_f1_unfold, data) + + def load_state_dict(self, state_dict, *args, **kwargs): + for k in list(state_dict.keys()): + if k.startswith('matcher.'): + state_dict[k.replace('matcher.', '', 1)] = state_dict.pop(k) + return super().load_state_dict(state_dict, *args, **kwargs) diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/__init__.py b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/__init__.py new file mode 100644 index 00000000..9d766a46 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/__init__.py @@ -0,0 +1,6 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +from .fine_preprocess import FinePreprocess +from .transformer import LocalFeatureTransformer diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/fine_preprocess.py b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/fine_preprocess.py new file mode 100644 index 00000000..1b1c159d --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/fine_preprocess.py @@ -0,0 +1,77 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.einops import rearrange, repeat + + +class FinePreprocess(nn.Module): + + def __init__(self, config): + super().__init__() + + self.config = config + self.cat_c_feat = config['fine_concat_coarse_feat'] + self.W = self.config['fine_window_size'] + + d_model_c = self.config['coarse']['d_model'] + d_model_f = self.config['fine']['d_model'] + self.d_model_f = d_model_f + if self.cat_c_feat: + self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True) + self.merge_feat = nn.Linear(2 * d_model_f, d_model_f, bias=True) + + 
self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.kaiming_normal_(p, mode='fan_out', nonlinearity='relu') + + def forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data): + W = self.W + stride = data['hw0_f'][0] // data['hw0_c'][0] + + data.update({'W': W}) + if data['b_ids'].shape[0] == 0: + feat0 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) + feat1 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) + return feat0, feat1 + + # 1. unfold(crop) all local windows + feat_f0_unfold = F.unfold( + feat_f0, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f0_unfold = rearrange( + feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + feat_f1_unfold = F.unfold( + feat_f1, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f1_unfold = rearrange( + feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + + # 2. select only the predicted matches + feat_f0_unfold = feat_f0_unfold[data['b_ids'], + data['i_ids']] # [n, ww, cf] + feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']] + + # option: use coarse-level loftr feature as context: concat and linear + if self.cat_c_feat: + feat_c_win = self.down_proj( + torch.cat([ + feat_c0[data['b_ids'], data['i_ids']], + feat_c1[data['b_ids'], data['j_ids']] + ], 0)) # [2n, c] + feat_cf_win = self.merge_feat( + torch.cat( + [ + torch.cat([feat_f0_unfold, feat_f1_unfold], + 0), # [2n, ww, cf] + repeat(feat_c_win, 'n c -> n ww c', ww=W**2), # noqa + ], + -1)) # noqa + feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0) + + return feat_f0_unfold, feat_f1_unfold diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/linear_attention.py b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/linear_attention.py new file mode 100644 index 00000000..76d58764 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/linear_attention.py @@ -0,0 +1,89 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR +""" +Linear Transformer proposed in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention" +Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py +""" + +import torch +from torch.nn import Dropout, Module + + +def elu_feature_map(x): + return torch.nn.functional.elu(x) + 1 + + +class LinearAttention(Module): + + def __init__(self, eps=1e-6): + super().__init__() + self.feature_map = elu_feature_map + self.eps = eps + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): + """ Multi-Head linear attention proposed in "Transformers are RNNs" + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + Q = self.feature_map(queries) + K = self.feature_map(keys) + + # set padded position to zero + if q_mask is not None: + Q = Q * q_mask[:, :, None, None] + if kv_mask is not None: + K = K * kv_mask[:, :, None, None] + values = values * kv_mask[:, :, None, None] + + v_length = values.size(1) + values = values / v_length # prevent fp16 overflow + KV = torch.einsum('nshd,nshv->nhdv', K, values) # (S,D)' @ S,V + Z = 1 / (torch.einsum('nlhd,nhd->nlh', Q, K.sum(dim=1)) + self.eps) + queried_values = torch.einsum('nlhd,nhdv,nlh->nlhv', Q, KV, + Z) * v_length + + return 
queried_values.contiguous() + + +class FullAttention(Module): + + def __init__(self, use_dropout=False, attention_dropout=0.1): + super().__init__() + self.use_dropout = use_dropout + self.dropout = Dropout(attention_dropout) + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): + """ Multi-head scaled dot-product attention, a.k.a full attention. + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + + # Compute the unnormalized attention and apply the masks + QK = torch.einsum('nlhd,nshd->nlsh', queries, keys) + if kv_mask is not None: + QK.masked_fill_( + ~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), + float('-inf')) + + # Compute the attention and the weighted average + softmax_temp = 1. / queries.size(3)**.5 # sqrt(D) + A = torch.softmax(softmax_temp * QK, dim=2) + if self.use_dropout: + A = self.dropout(A) + + queried_values = torch.einsum('nlsh,nshd->nlhd', A, values) + + return queried_values.contiguous() diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/quadtree_attention.py b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/quadtree_attention.py new file mode 100644 index 00000000..a0a40615 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/quadtree_attention.py @@ -0,0 +1,98 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import trunc_normal_ + +from modelscope.ops.quadtree_attention import QTAttA, QTAttB + + +class QuadtreeAttention(nn.Module): + + def __init__( + self, + dim, + num_heads, + topks, + value_branch=False, + act=nn.GELU(), + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + scale=1, + attn_type='B', + ): + super().__init__() + assert dim % num_heads == 0, f'dim {dim} should be divided by num_heads {num_heads}.' 
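+        # QTAttA/QTAttB compute attention over a pyramid of `scale` levels (built with
+        # avg_pool2d in forward()); at each finer level only the top-k regions selected
+        # at the coarser level are attended, following https://arxiv.org/abs/2201.02767.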
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q_proj = nn.Conv2d( + dim, dim, kernel_size=1, stride=1, bias=qkv_bias) + self.k_proj = nn.Conv2d( + dim, dim, kernel_size=1, stride=1, bias=qkv_bias) + self.v_proj = nn.Conv2d( + dim, dim, kernel_size=1, stride=1, bias=qkv_bias) + if attn_type == 'A': + self.py_att = QTAttA( + num_heads, dim // num_heads, scale=scale, topks=topks) + else: + self.py_att = QTAttB( + num_heads, dim // num_heads, scale=scale, topks=topks) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.scale = scale + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + trunc_normal_(m.weight, std=0.02) + m.init = True + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, target, H, W, msg=None): + + B, N, C = x.shape + x = x.permute(0, 2, 1).reshape(B, C, H, W) + target = target.permute(0, 2, 1).reshape(B, C, H, W) + keys = [] + values = [] + queries = [] + + q = self.q_proj(x) + k = self.k_proj(target) + v = self.v_proj(target) + for i in range(self.scale): + keys.append(k) + values.append(v) + queries.append(q) + + if i != self.scale - 1: + k = F.avg_pool2d(k, kernel_size=2, stride=2) + q = F.avg_pool2d(q, kernel_size=2, stride=2) + v = F.avg_pool2d(v, kernel_size=2, stride=2) + + msg = self.py_att(queries, keys, values).view(B, -1, C) + + x = self.proj(msg) + x = self.proj_drop(x) + + return x diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/transformer.py b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/transformer.py new file mode 100644 index 00000000..69a6f7da --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/loftr_module/transformer.py @@ -0,0 +1,287 @@ +# Part of the implementation is borrowed and modified from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +import copy +import math + +import torch +import torch.nn as nn +from einops.einops import rearrange +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from .linear_attention import FullAttention, LinearAttention +from .quadtree_attention import QuadtreeAttention + + +class DWConv(nn.Module): + + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + + return x + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.relu = nn.ReLU(inplace=True) + self.apply(self._init_weights) + + def 
_init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.relu(x) + + x = self.dwconv(x, H, W) + + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + + return x + + +class LoFTREncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, attention='linear'): + super(LoFTREncoderLayer, self).__init__() + + self.dim = d_model // nhead + self.nhead = nhead + + # multi-head attention + self.q_proj = nn.Linear(d_model, d_model, bias=False) + self.k_proj = nn.Linear(d_model, d_model, bias=False) + self.v_proj = nn.Linear(d_model, d_model, bias=False) + self.attention = LinearAttention( + ) if attention == 'linear' else FullAttention() + self.merge = nn.Linear(d_model, d_model, bias=False) + + # feed-forward network + self.mlp = nn.Sequential( + nn.Linear(d_model * 2, d_model * 2, bias=False), + nn.ReLU(True), + nn.Linear(d_model * 2, d_model, bias=False), + ) + + # norm and dropout + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + + def forward(self, x, source, x_mask=None, source_mask=None): + """ + Args: + x (torch.Tensor): [N, L, C] + source (torch.Tensor): [N, S, C] + x_mask (torch.Tensor): [N, L] (optional) + source_mask (torch.Tensor): [N, S] (optional) + """ + bs = x.size(0) + query, key, value = x, source, source + + # multi-head attention + query = self.q_proj(query).view(bs, -1, self.nhead, + self.dim) # [N, L, (H, D)] + key = self.k_proj(key).view(bs, -1, self.nhead, + self.dim) # [N, S, (H, D)] + value = self.v_proj(value).view(bs, -1, self.nhead, self.dim) + message = self.attention( + query, key, value, q_mask=x_mask, + kv_mask=source_mask) # [N, L, (H, D)] + message = self.merge(message.view(bs, -1, + self.nhead * self.dim)) # [N, L, C] + message = self.norm1(message) + + # feed-forward network + message = self.mlp(torch.cat([x, message], dim=2)) + message = self.norm2(message) + + return x + message + + +class QuadtreeBlock(nn.Module): + + def __init__(self, + dim, + num_heads, + topks, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + scale=1, + attn_type='B'): + + super().__init__() + + self.norm1 = norm_layer(dim) + + self.attn = QuadtreeAttention( + dim, + num_heads=num_heads, + topks=topks, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + scale=scale, + attn_type=attn_type) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + + self.norm2 = norm_layer(dim) + + mlp_hidden_dim = int(dim * mlp_ratio) + + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.apply(self._init_weights) + + def _init_weights(self, m): + if hasattr(m, 'init'): + return + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, target, H, W): + + x = x + self.drop_path( + self.attn(self.norm1(x), self.norm1(target), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class LocalFeatureTransformer(nn.Module): + """A Local Feature Transformer (LoFTR) module.""" + + def __init__(self, config): + super(LocalFeatureTransformer, self).__init__() + self.block_type = config['block_type'] + self.config = config + self.d_model = config['d_model'] + self.nhead = config['nhead'] + self.layer_names = config['layer_names'] + + if config['block_type'] == 'loftr': + encoder_layer = LoFTREncoderLayer(config['d_model'], + config['nhead'], + config['attention']) + self.layers = nn.ModuleList([ + copy.deepcopy(encoder_layer) + for _ in range(len(self.layer_names)) + ]) + self._reset_parameters() + elif config['block_type'] == 'quadtree': + encoder_layer = QuadtreeBlock( + config['d_model'], + config['nhead'], + attn_type=config['attn_type'], + topks=config['topks'], + scale=3) + self.layers = nn.ModuleList([ + copy.deepcopy(encoder_layer) + for _ in range(len(self.layer_names)) + ]) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, feat0, feat1, mask0=None, mask1=None): + """ + Args: + feat0 (torch.Tensor): [N, L, C] + feat1 (torch.Tensor): [N, S, C] + mask0 (torch.Tensor): [N, L] (optional) + mask1 (torch.Tensor): [N, S] (optional) + """ + + if len(feat0.shape) == 4: + B, C, H, W = feat0.shape + feat0 = rearrange(feat0, 'b c h w -> b (h w) c') + feat1 = rearrange(feat1, 'b c h w -> b (h w) c') + + if self.block_type == 'loftr': + for layer, name in zip(self.layers, self.layer_names): + if name == 'self': + feat0 = layer(feat0, feat0, mask0, mask0) + feat1 = layer(feat1, feat1, mask1, mask1) + elif name == 'cross': + feat0 = layer(feat0, feat1, mask0, mask1) + feat1 = layer(feat1, feat0, mask1, mask0) + else: + raise KeyError + else: + for layer, name in zip(self.layers, self.layer_names): + if name == 'self': + feat0 = layer(feat0, feat0, H, W) + feat1 = layer(feat1, feat1, H, W) + elif name == 'cross': + if self.config['block_type'] == 'quadtree': + feat0, feat1 = layer(feat0, feat1, H, + W), layer(feat1, feat0, H, W) + else: + feat0 = layer(feat0, feat1, H, W) + feat1 = layer(feat1, feat0, H, W) + else: + raise KeyError + + return feat0, feat1 diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/utils/__init__.py b/modelscope/models/cv/image_matching/loftr_quadtree/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/utils/coarse_matching.py b/modelscope/models/cv/image_matching/loftr_quadtree/utils/coarse_matching.py new file mode 100644 index 00000000..d31686b2 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/utils/coarse_matching.py @@ -0,0 +1,268 @@ +# This implementation is adopted from LoFTR, +# made 
public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.einops import rearrange + +INF = 1e9 + + +def mask_border(m, b: int, v): + """ Mask borders with value + Args: + m (torch.Tensor): [N, H0, W0, H1, W1] + b (int) + v (m.dtype) + """ + if b <= 0: + return + + m[:, :b] = v + m[:, :, :b] = v + m[:, :, :, :b] = v + m[:, :, :, :, :b] = v + m[:, -b:] = v + m[:, :, -b:] = v + m[:, :, :, -b:] = v + m[:, :, :, :, -b:] = v + + +def mask_border_with_padding(m, bd, v, p_m0, p_m1): + if bd <= 0: + return + + m[:, :bd] = v + m[:, :, :bd] = v + m[:, :, :, :bd] = v + m[:, :, :, :, :bd] = v + + h0s, w0s = p_m0.sum(1).max(-1)[0].int(), p_m0.sum(-1).max(-1)[0].int() + h1s, w1s = p_m1.sum(1).max(-1)[0].int(), p_m1.sum(-1).max(-1)[0].int() + for b_idx, (h0, w0, h1, w1) in enumerate(zip(h0s, w0s, h1s, w1s)): + m[b_idx, h0 - bd:] = v + m[b_idx, :, w0 - bd:] = v + m[b_idx, :, :, h1 - bd:] = v + m[b_idx, :, :, :, w1 - bd:] = v + + +def compute_max_candidates(p_m0, p_m1): + """Compute the max candidates of all pairs within a batch + + Args: + p_m0, p_m1 (torch.Tensor): padded masks + """ + h0s, w0s = p_m0.sum(1).max(-1)[0], p_m0.sum(-1).max(-1)[0] + h1s, w1s = p_m1.sum(1).max(-1)[0], p_m1.sum(-1).max(-1)[0] + max_cand = torch.sum( + torch.min(torch.stack([h0s * w0s, h1s * w1s], -1), -1)[0]) + return max_cand + + +class CoarseMatching(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + # general config + self.thr = config['thr'] + self.border_rm = config['border_rm'] + # -- # for trainig fine-level LoFTR + self.train_coarse_percent = config['train_coarse_percent'] + self.train_pad_num_gt_min = config['train_pad_num_gt_min'] + + # we provide 2 options for differentiable matching + self.match_type = config['match_type'] + if self.match_type == 'dual_softmax': + self.temperature = config['dsmax_temperature'] + elif self.match_type == 'sinkhorn': + try: + from .superglue import log_optimal_transport + except ImportError: + raise ImportError('download superglue.py first!') + self.log_optimal_transport = log_optimal_transport + self.bin_score = nn.Parameter( + torch.tensor(config['skh_init_bin_score'], requires_grad=True)) + self.skh_iters = config['skh_iters'] + self.skh_prefilter = config['skh_prefilter'] + else: + raise NotImplementedError() + + def forward(self, feat_c0, feat_c1, data, mask_c0=None, mask_c1=None): + """ + Args: + feat0 (torch.Tensor): [N, L, C] + feat1 (torch.Tensor): [N, S, C] + data (dict) + mask_c0 (torch.Tensor): [N, L] (optional) + mask_c1 (torch.Tensor): [N, S] (optional) + Update: + data (dict): { + 'b_ids' (torch.Tensor): [M'], + 'i_ids' (torch.Tensor): [M'], + 'j_ids' (torch.Tensor): [M'], + 'gt_mask' (torch.Tensor): [M'], + 'mkpts0_c' (torch.Tensor): [M, 2], + 'mkpts1_c' (torch.Tensor): [M, 2], + 'mconf' (torch.Tensor): [M]} + NOTE: M' != M during training. 
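+
+        For match_type == 'dual_softmax', the confidence matrix is computed as
+        (a sketch of the step implemented below; features are first scaled by 1/sqrt(C)):
+            sim  = einsum('nlc,nsc->nls', feat_c0, feat_c1) / temperature
+            conf = softmax(sim, dim=1) * softmax(sim, dim=2)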
+ """ + _, L, S, _ = feat_c0.size(0), feat_c0.size(1), feat_c1.size( + 1), feat_c0.size(2) + + # normalize + feat_c0, feat_c1 = map(lambda feat: feat / feat.shape[-1]**.5, + [feat_c0, feat_c1]) + + if self.match_type == 'dual_softmax': + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, + feat_c1) / self.temperature + if mask_c0 is not None: + sim_matrix.masked_fill_( + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) + conf_matrix = F.softmax(sim_matrix, 1) * F.softmax(sim_matrix, 2) + + elif self.match_type == 'sinkhorn': + # sinkhorn, dustbin included + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, feat_c1) + if mask_c0 is not None: + sim_matrix[:, :L, :S].masked_fill_( + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) + + # build uniform prior & use sinkhorn + log_assign_matrix = self.log_optimal_transport( + sim_matrix, self.bin_score, self.skh_iters) + assign_matrix = log_assign_matrix.exp() + conf_matrix = assign_matrix[:, :-1, :-1] + + # filter prediction with dustbin score (only in evaluation mode) + if not self.training and self.skh_prefilter: + filter0 = (assign_matrix.max(dim=2)[1] == S)[:, :-1] # [N, L] + filter1 = (assign_matrix.max(dim=1)[1] == L)[:, :-1] # [N, S] + conf_matrix[filter0[..., None].repeat(1, 1, S)] = 0 + conf_matrix[filter1[:, None].repeat(1, L, 1)] = 0 + + if self.config['sparse_spvs']: + data.update({'conf_matrix_with_bin': assign_matrix.clone()}) + + data.update({'conf_matrix': conf_matrix}) + + # predict coarse matches from conf_matrix + data.update(**self.get_coarse_match(conf_matrix, data)) + + @torch.no_grad() + def get_coarse_match(self, conf_matrix, data): + """ + Args: + conf_matrix (torch.Tensor): [N, L, S] + data (dict): with keys ['hw0_i', 'hw1_i', 'hw0_c', 'hw1_c'] + Returns: + coarse_matches (dict): { + 'b_ids' (torch.Tensor): [M'], + 'i_ids' (torch.Tensor): [M'], + 'j_ids' (torch.Tensor): [M'], + 'gt_mask' (torch.Tensor): [M'], + 'm_bids' (torch.Tensor): [M], + 'mkpts0_c' (torch.Tensor): [M, 2], + 'mkpts1_c' (torch.Tensor): [M, 2], + 'mconf' (torch.Tensor): [M]} + """ + axes_lengths = { + 'h0c': data['hw0_c'][0], + 'w0c': data['hw0_c'][1], + 'h1c': data['hw1_c'][0], + 'w1c': data['hw1_c'][1] + } + _device = conf_matrix.device + # 1. confidence thresholding + mask = conf_matrix > self.thr + mask = rearrange(mask, 'b (h0c w0c) (h1c w1c) -> b h0c w0c h1c w1c', + **axes_lengths) + if 'mask0' not in data: + mask_border(mask, self.border_rm, False) + else: + mask_border_with_padding(mask, self.border_rm, False, + data['mask0'], data['mask1']) + mask = rearrange(mask, 'b h0c w0c h1c w1c -> b (h0c w0c) (h1c w1c)', + **axes_lengths) + + # 2. mutual nearest + mask = mask \ + * (conf_matrix == conf_matrix.max(dim=2, keepdim=True)[0]) \ + * (conf_matrix == conf_matrix.max(dim=1, keepdim=True)[0]) + + # 3. find all valid coarse matches + # this only works when at most one `True` in each row + mask_v, all_j_ids = mask.max(dim=2) + b_ids, i_ids = torch.where(mask_v) + j_ids = all_j_ids[b_ids, i_ids] + mconf = conf_matrix[b_ids, i_ids, j_ids] + + # 4. Random sampling of training samples for fine-level LoFTR + # (optional) pad samples with gt coarse-level matches + if self.training: + # NOTE: + # The sampling is performed across all pairs in a batch without manually balancing + # #samples for fine-level increases w.r.t. 
batch_size + if 'mask0' not in data: + num_candidates_max = mask.size(0) * max( + mask.size(1), mask.size(2)) + else: + num_candidates_max = compute_max_candidates( + data['mask0'], data['mask1']) + num_matches_train = int(num_candidates_max + * self.train_coarse_percent) + num_matches_pred = len(b_ids) + assert self.train_pad_num_gt_min < num_matches_train, 'min-num-gt-pad should be less than num-train-matches' + + # pred_indices is to select from prediction + if num_matches_pred <= num_matches_train - self.train_pad_num_gt_min: + pred_indices = torch.arange(num_matches_pred, device=_device) + else: + pred_indices = torch.randint( + num_matches_pred, + (num_matches_train - self.train_pad_num_gt_min, ), + device=_device) + + # gt_pad_indices is to select from gt padding. e.g. max(3787-4800, 200) + gt_pad_indices = torch.randint( + len(data['spv_b_ids']), + (max(num_matches_train - num_matches_pred, + self.train_pad_num_gt_min), ), + device=_device) + mconf_gt = torch.zeros( + len(data['spv_b_ids']), + device=_device) # set conf of gt paddings to all zero + + b_ids, i_ids, j_ids, mconf = map( + lambda x, y: torch.cat([x[pred_indices], y[gt_pad_indices]], + dim=0), + *zip([b_ids, data['spv_b_ids']], [i_ids, data['spv_i_ids']], + [j_ids, data['spv_j_ids']], [mconf, mconf_gt])) + + # These matches select patches that feed into fine-level network + coarse_matches = {'b_ids': b_ids, 'i_ids': i_ids, 'j_ids': j_ids} + + # 4. Update with matches in original image resolution + scale = data['hw0_i'][0] / data['hw0_c'][0] + scale0 = scale * data['scale0'][b_ids] if 'scale0' in data else scale + scale1 = scale * data['scale1'][b_ids] if 'scale1' in data else scale + mkpts0_c = torch.stack( + [i_ids % data['hw0_c'][1], i_ids // data['hw0_c'][1]], + dim=1) * scale0 + mkpts1_c = torch.stack( + [j_ids % data['hw1_c'][1], j_ids // data['hw1_c'][1]], + dim=1) * scale1 + + # These matches is the current prediction (for visualization) + coarse_matches.update({ + 'gt_mask': mconf == 0, + 'm_bids': b_ids[mconf != 0], # mconf == 0 => gt matches + 'mkpts0_c': mkpts0_c[mconf != 0], + 'mkpts1_c': mkpts1_c[mconf != 0], + 'mconf': mconf[mconf != 0] + }) + + return coarse_matches diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/utils/fine_matching.py b/modelscope/models/cv/image_matching/loftr_quadtree/utils/fine_matching.py new file mode 100644 index 00000000..d0340eb8 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/utils/fine_matching.py @@ -0,0 +1,86 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +import math + +import torch +import torch.nn as nn +from kornia.geometry.subpix import dsnt +from kornia.utils.grid import create_meshgrid + + +class FineMatching(nn.Module): + """FineMatching with s2d paradigm""" + + def __init__(self): + super().__init__() + + def forward(self, feat_f0, feat_f1, data): + """ + Args: + feat0 (torch.Tensor): [M, WW, C] + feat1 (torch.Tensor): [M, WW, C] + data (dict) + Update: + data (dict):{ + 'expec_f' (torch.Tensor): [M, 3], + 'mkpts0_f' (torch.Tensor): [M, 2], + 'mkpts1_f' (torch.Tensor): [M, 2]} + """ + M, WW, C = feat_f0.shape + W = int(math.sqrt(WW)) + scale = data['hw0_i'][0] / data['hw0_f'][0] + self.M, self.W, self.WW, self.C, self.scale = M, W, WW, C, scale + + # corner case: if no coarse matches found + if M == 0: + assert self.training is False, 'M is always >0, when training, see coarse_matching.py' + # logger.warning('No matches 
found in coarse-level.') + data.update({ + 'expec_f': torch.empty(0, 3, device=feat_f0.device), + 'mkpts0_f': data['mkpts0_c'], + 'mkpts1_f': data['mkpts1_c'], + }) + return + + feat_f0_picked = feat_f0_picked = feat_f0[:, WW // 2, :] + sim_matrix = torch.einsum('mc,mrc->mr', feat_f0_picked, feat_f1) + softmax_temp = 1. / C**.5 + heatmap = torch.softmax( + softmax_temp * sim_matrix, dim=1).view(-1, W, W) + + # compute coordinates from heatmap + coords_normalized = dsnt.spatial_expectation2d(heatmap[None], + True)[0] # [M, 2] + grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape( + 1, -1, 2) # [1, WW, 2] + + # compute std over + var = torch.sum( + grid_normalized**2 * heatmap.view(-1, WW, 1), + dim=1) - coords_normalized**2 # [M, 2] + std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), + -1) # [M] clamp needed for numerical stability + + # for fine-level supervision + data.update( + {'expec_f': + torch.cat([coords_normalized, std.unsqueeze(1)], -1)}) + + # compute absolute kpt coords + self.get_fine_match(coords_normalized, data) + + @torch.no_grad() + def get_fine_match(self, coords_normed, data): + W, scale = self.W, self.scale + + # mkpts0_f and mkpts1_f + mkpts0_f = data['mkpts0_c'] + scale1 = scale * data['scale1'][ + data['b_ids']] if 'scale0' in data else scale + mkpts1_f = data['mkpts1_c'] + ( + coords_normed * # noqa + (W // 2) * scale1)[:len(data['mconf'])] # noqa + + data.update({'mkpts0_f': mkpts0_f, 'mkpts1_f': mkpts1_f}) diff --git a/modelscope/models/cv/image_matching/loftr_quadtree/utils/position_encoding.py b/modelscope/models/cv/image_matching/loftr_quadtree/utils/position_encoding.py new file mode 100644 index 00000000..4f3cfe97 --- /dev/null +++ b/modelscope/models/cv/image_matching/loftr_quadtree/utils/position_encoding.py @@ -0,0 +1,52 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +import math + +import torch +from torch import nn + + +class PositionEncodingSine(nn.Module): + """ + This is a sinusoidal position encoding that generalized to 2-dimensional images + """ + + def __init__(self, d_model, max_shape=(256, 256), temp_bug_fix=True): + """ + Args: + max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels + temp_bug_fix (bool): As noted in this [issue](https://github.com/zju3dv/LoFTR/issues/41), + the original implementation of LoFTR includes a bug in the pos-enc impl, which has little impact + on the final performance. For now, we keep both impls for backward compatability. + We will remove the buggy impl after re-training all variants of our released models. 
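+            Note: sin/cos of the x and y positions are interleaved along the channel
+            dimension (channels 0,1 mod 4 encode x; channels 2,3 mod 4 encode y), so
+            forward() simply adds `pe[:, :, :H, :W]` to the input feature map.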
+ """ + super().__init__() + + pe = torch.zeros((d_model, *max_shape)) + y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0) + x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0) + if temp_bug_fix: + div_term = torch.exp( + torch.arange(0, d_model // 2, 2).float() * # noqa + (-math.log(10000.0) / (d_model // 2))) + else: # a buggy implementation (for backward compatability only) + div_term = torch.exp( + torch.arange(0, d_model // 2, 2).float() * # noqa + (-math.log(10000.0) / d_model // 2)) + div_term = div_term[:, None, None] # [C//4, 1, 1] + pe[0::4, :, :] = torch.sin(x_position * div_term) + pe[1::4, :, :] = torch.cos(x_position * div_term) + pe[2::4, :, :] = torch.sin(y_position * div_term) + pe[3::4, :, :] = torch.cos(y_position * div_term) + + self.register_buffer( + 'pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W] + + def forward(self, x): + """ + Args: + x: [N, C, H, W] + """ + return x + self.pe[:, :, :x.size(2), :x.size(3)] diff --git a/modelscope/models/cv/image_matching/quadtree_attention_model.py b/modelscope/models/cv/image_matching/quadtree_attention_model.py new file mode 100644 index 00000000..377c5c4f --- /dev/null +++ b/modelscope/models/cv/image_matching/quadtree_attention_model.py @@ -0,0 +1,78 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os.path as osp +from pathlib import Path + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from .config.default import get_cfg_defaults +from .loftr_quadtree.loftr import LoFTR +from .utils.misc import lower_config + + +@MODELS.register_module( + Tasks.image_matching, module_name=Models.quadtree_attention_image_matching) +class QuadTreeAttentionForImageMatching(TorchModel): + ''' + Image matching with quadtree attention. This model is trained on outdoor images. + For more details, please refer to https://arxiv.org/abs/2201.02767 + ''' + + def __init__(self, model_dir: str, model_type='outdoor', **kwargs): + ''' + Args: + model_dir: model directory + model_type: model type, 'outdoor' or 'indoor'. Only support outdoor model for modelscope. + ''' + assert model_type == 'outdoor', 'Only support outdoor model for modelscope' + # Note: for indoor model, max_image_size should be 640 because scannet training image size is 640, + # and currently, this model is overfited on scannet. For outdoor model, larger image size will be better + + super().__init__(model_dir, **kwargs) + config = get_cfg_defaults() + _config = lower_config(config) + + matcher = LoFTR(config=_config['loftr']) + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + state_dict = torch.load( + str(model_path), map_location='cpu')['state_dict'] + + matcher.load_state_dict(state_dict, strict=True) + self.matcher = matcher + + self.matcher.eval() + self.matcher.to('cuda') + + def forward(self, Inputs): + ''' + Args: + Inputs: a dict with keys 'image0', 'image1' and 'preprocess_info'. + 'image0' and 'image1' are torch tensor with shape [1, 1, H1, W1] + and [1, 1, H2, W2]. 'preprocess_info' contains the information of + resizing, which will be used for postprocessing. 
+ ''' + self.matcher(Inputs) + return { + 'kpts0': Inputs['mkpts0_f'], + 'kpts1': Inputs['mkpts1_f'], + 'conf': Inputs['mconf'], + 'preprocess_info': Inputs['preprocess_info'] + } + + def postprocess(self, Inputs): + matching_result = Inputs + + results = {OutputKeys.MATCHES: matching_result} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/image_matching/utils/__init__.py b/modelscope/models/cv/image_matching/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_matching/utils/misc.py b/modelscope/models/cv/image_matching/utils/misc.py new file mode 100644 index 00000000..6b5acc10 --- /dev/null +++ b/modelscope/models/cv/image_matching/utils/misc.py @@ -0,0 +1,11 @@ +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + +from yacs.config import CfgNode as CN + + +def lower_config(yacs_cfg): + if not isinstance(yacs_cfg, CN): + return yacs_cfg + return {k.lower(): lower_config(v) for k, v in yacs_cfg.items()} diff --git a/modelscope/models/cv/image_mvs_depth_estimation/__init__.py b/modelscope/models/cv/image_mvs_depth_estimation/__init__.py new file mode 100644 index 00000000..9137b4b1 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .casmvs_model import ImageMultiViewDepthEstimation + +else: + _import_structure = { + 'casmvs_model': ['ImageMultiViewDepthEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_mvs_depth_estimation/cas_mvsnet.py b/modelscope/models/cv/image_mvs_depth_estimation/cas_mvsnet.py new file mode 100644 index 00000000..63dbf034 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/cas_mvsnet.py @@ -0,0 +1,221 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .module import (CostRegNet, FeatureNet, RefineNet, depth_regression, + get_depth_range_samples, homo_warping) + +Align_Corners_Range = False + + +class DepthNet(nn.Module): + + def __init__(self): + super(DepthNet, self).__init__() + + def forward(self, + features, + proj_matrices, + depth_values, + num_depth, + cost_regularization, + prob_volume_init=None): + proj_matrices = torch.unbind(proj_matrices, 1) + assert len(features) == len( + proj_matrices + ), 'Different number of images and projection matrices' + assert depth_values.shape[ + 1] == num_depth, 'depth_values.shape[1]:{} num_depth:{}'.format( + depth_values.shapep[1], num_depth) + num_views = len(features) + + # step 1. feature extraction + # in: images; out: 32-channel feature maps + ref_feature, src_features = features[0], features[1:] + ref_proj, src_projs = proj_matrices[0], proj_matrices[1:] + + # step 2. 
differentiable homograph, build cost volume + ref_volume = ref_feature.unsqueeze(2).repeat(1, 1, num_depth, 1, 1) + volume_sum = ref_volume + volume_sq_sum = ref_volume**2 + del ref_volume + for src_fea, src_proj in zip(src_features, src_projs): + # warpped features + src_proj_new = src_proj[:, 0].clone() + src_proj_new[:, :3, :4] = torch.matmul(src_proj[:, 1, :3, :3], + src_proj[:, 0, :3, :4]) + ref_proj_new = ref_proj[:, 0].clone() + ref_proj_new[:, :3, :4] = torch.matmul(ref_proj[:, 1, :3, :3], + ref_proj[:, 0, :3, :4]) + warped_volume = homo_warping(src_fea, src_proj_new, ref_proj_new, + depth_values) + if self.training: + volume_sum = volume_sum + warped_volume + volume_sq_sum = volume_sq_sum + warped_volume**2 + else: + # TODO: this is only a temporary solution to save memory, better way? + volume_sum += warped_volume + volume_sq_sum += warped_volume.pow_( + 2) # the memory of warped_volume has been modified + del warped_volume + # aggregate multiple feature volumes by variance + volume_variance = volume_sq_sum.div_(num_views).sub_( + volume_sum.div_(num_views).pow_(2)) + + # step 3. cost volume regularization + cost_reg = cost_regularization(volume_variance) + prob_volume_pre = cost_reg.squeeze(1) + + if prob_volume_init is not None: + prob_volume_pre += prob_volume_init + + prob_volume = F.softmax(prob_volume_pre, dim=1) + depth = depth_regression(prob_volume, depth_values=depth_values) + + with torch.no_grad(): + # photometric confidence + prob_volume_sum4 = 4 * F.avg_pool3d( + F.pad(prob_volume.unsqueeze(1), pad=(0, 0, 0, 0, 1, 2)), + (4, 1, 1), + stride=1, + padding=0).squeeze(1) + depth_index = depth_regression( + prob_volume, + depth_values=torch.arange( + num_depth, device=prob_volume.device, + dtype=torch.float)).long() + depth_index = depth_index.clamp(min=0, max=num_depth - 1) + photometric_confidence = torch.gather( + prob_volume_sum4, 1, depth_index.unsqueeze(1)).squeeze(1) + + return { + 'depth': depth, + 'photometric_confidence': photometric_confidence + } + + +class CascadeMVSNet(nn.Module): + + def __init__(self, + refine=False, + ndepths=[48, 32, 8], + depth_interals_ratio=[4, 2, 1], + share_cr=False, + grad_method='detach', + arch_mode='fpn', + cr_base_chs=[8, 8, 8]): + super(CascadeMVSNet, self).__init__() + self.refine = refine + self.share_cr = share_cr + self.ndepths = ndepths + self.depth_interals_ratio = depth_interals_ratio + self.grad_method = grad_method + self.arch_mode = arch_mode + self.cr_base_chs = cr_base_chs + self.num_stage = len(ndepths) + + assert len(ndepths) == len(depth_interals_ratio) + + self.stage_infos = { + 'stage1': { + 'scale': 4.0, + }, + 'stage2': { + 'scale': 2.0, + }, + 'stage3': { + 'scale': 1.0, + } + } + + self.feature = FeatureNet( + base_channels=8, + stride=4, + num_stage=self.num_stage, + arch_mode=self.arch_mode) + if self.share_cr: + self.cost_regularization = CostRegNet( + in_channels=self.feature.out_channels, base_channels=8) + else: + self.cost_regularization = nn.ModuleList([ + CostRegNet( + in_channels=self.feature.out_channels[i], + base_channels=self.cr_base_chs[i]) + for i in range(self.num_stage) + ]) + if self.refine: + self.refine_network = RefineNet() + self.DepthNet = DepthNet() + + def forward(self, imgs, proj_matrices, depth_values): + depth_min = float(depth_values[0, 0].cpu().numpy()) + depth_max = float(depth_values[0, -1].cpu().numpy()) + depth_interval = (depth_max - depth_min) / depth_values.size(1) + + # step 1. 
feature extraction + features = [] + for nview_idx in range(imgs.size(1)): # imgs shape (B, N, C, H, W) + img = imgs[:, nview_idx] + features.append(self.feature(img)) + + outputs = {} + depth, cur_depth = None, None + for stage_idx in range(self.num_stage): + # stage feature, proj_mats, scales + features_stage = [ + feat['stage{}'.format(stage_idx + 1)] for feat in features + ] + proj_matrices_stage = proj_matrices['stage{}'.format(stage_idx + + 1)] + stage_scale = self.stage_infos['stage{}'.format(stage_idx + + 1)]['scale'] + + if depth is not None: + if self.grad_method == 'detach': + cur_depth = depth.detach() + else: + cur_depth = depth + cur_depth = F.interpolate( + cur_depth.unsqueeze(1), [img.shape[2], img.shape[3]], + mode='bilinear', + align_corners=Align_Corners_Range).squeeze(1) + else: + cur_depth = depth_values + depth_range_samples = get_depth_range_samples( + cur_depth=cur_depth, + ndepth=self.ndepths[stage_idx], + depth_inteval_pixel=self.depth_interals_ratio[stage_idx] + * depth_interval, + dtype=img[0].dtype, + device=img[0].device, + shape=[img.shape[0], img.shape[2], img.shape[3]], + max_depth=depth_max, + min_depth=depth_min) + + outputs_stage = self.DepthNet( + features_stage, + proj_matrices_stage, + depth_values=F.interpolate( + depth_range_samples.unsqueeze(1), [ + self.ndepths[stage_idx], img.shape[2] + // int(stage_scale), img.shape[3] // int(stage_scale) + ], + mode='trilinear', + align_corners=Align_Corners_Range).squeeze(1), + num_depth=self.ndepths[stage_idx], + cost_regularization=self.cost_regularization + if self.share_cr else self.cost_regularization[stage_idx]) + + depth = outputs_stage['depth'] + + outputs['stage{}'.format(stage_idx + 1)] = outputs_stage + outputs.update(outputs_stage) + + # depth map refinement + if self.refine: + refined_depth = self.refine_network( + torch.cat((imgs[:, 0], depth), 1)) + outputs['refined_depth'] = refined_depth + + return outputs diff --git a/modelscope/models/cv/image_mvs_depth_estimation/casmvs_model.py b/modelscope/models/cv/image_mvs_depth_estimation/casmvs_model.py new file mode 100644 index 00000000..e5215607 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/casmvs_model.py @@ -0,0 +1,164 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
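+# Pipeline: preprocess_make_pair() converts a COLMAP sparse reconstruction into the
+# MVSNet input format, forward() runs CascadeMVSNet per reference view to produce
+# depth and confidence maps, and postprocess() fuses them into a single point cloud.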
+import os +import os.path as osp + +import cv2 +import numpy as np +import torch +from easydict import EasyDict as edict + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .cas_mvsnet import CascadeMVSNet +from .colmap2mvsnet import processing_single_scene +from .depth_filter import pcd_depth_filter +from .general_eval_dataset import MVSDataset, save_pfm +from .utils import (generate_pointcloud, numpy2torch, tensor2numpy, tocuda, + write_cam) + +logger = get_logger() + + +@MODELS.register_module( + Tasks.image_multi_view_depth_estimation, + module_name=Models.image_casmvs_depth_estimation) +class ImageMultiViewDepthEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + self.model = CascadeMVSNet( + refine=False, + ndepths=[48, 32, 8], + depth_interals_ratio=[float(d_i) for d_i in [4, 2, 1]], + share_cr=False, + cr_base_chs=[8, 8, 8], + grad_method='detach') + + # load checkpoint file + ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model {ckpt_path}') + state_dict = torch.load(ckpt_path, map_location=torch.device('cpu')) + self.model.load_state_dict(state_dict['model'], strict=True) + + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' + + self.model.to(self.device) + self.model.eval() + logger.info(f'model init done! Device:{self.device}') + + def preprocess_make_pair(self, inputs): + + data = inputs['input_dir'] + casmvs_inp_dir = inputs['casmvs_inp_dir'] + + args = edict() + args.dense_folder = data + args.save_folder = casmvs_inp_dir + args.max_d = 192 + args.interval_scale = 1.06 + args.theta0 = 5 + args.sigma1 = 1 + args.sigma2 = 10 + args.model_ext = '.bin' + + logger.info('preprocess of making pair data start') + processing_single_scene(args) + logger.info('preprocess of making pair data done') + + def forward(self, inputs): + test_dir = os.path.dirname(inputs['casmvs_inp_dir']) + scene = os.path.basename(inputs['casmvs_inp_dir']) + test_list = [scene] + save_dir = inputs['casmvs_res_dir'] + + logger.info('depth estimation start') + + test_dataset = MVSDataset( + test_dir, + test_list, + 'test', + 5, + 192, + 1.06, + max_h=1200, + max_w=1200, + fix_res=False) + + with torch.no_grad(): + for batch_idx, sample in enumerate(test_dataset): + sample = numpy2torch(sample) + + if self.device == 'cuda': + sample_cuda = tocuda(sample) + + proj_matrices_dict = sample_cuda['proj_matrices'] + proj_matrices_dict_new = {} + for k, v in proj_matrices_dict.items(): + proj_matrices_dict_new[k] = v.unsqueeze(0) + + outputs = self.model(sample_cuda['imgs'].unsqueeze(0), + proj_matrices_dict_new, + sample_cuda['depth_values'].unsqueeze(0)) + + outputs = tensor2numpy(outputs) + del sample_cuda + filenames = [sample['filename']] + cams = sample['proj_matrices']['stage{}'.format(3)].unsqueeze( + 0).numpy() + imgs = sample['imgs'].unsqueeze(0).numpy() + + # save depth maps and confidence maps + for filename, cam, img, depth_est, photometric_confidence in zip( + filenames, cams, imgs, outputs['depth'], + outputs['photometric_confidence']): + + img = img[0] # ref view + cam = cam[0] # ref cam + depth_filename = os.path.join( + save_dir, filename.format('depth_est', '.pfm')) + confidence_filename = os.path.join( + 
save_dir, filename.format('confidence', '.pfm')) + cam_filename = os.path.join( + save_dir, filename.format('cams', '_cam.txt')) + img_filename = os.path.join( + save_dir, filename.format('images', '.jpg')) + ply_filename = os.path.join( + save_dir, filename.format('ply_local', '.ply')) + os.makedirs( + depth_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs( + confidence_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs(cam_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs(img_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs(ply_filename.rsplit('/', 1)[0], exist_ok=True) + # save depth maps + save_pfm(depth_filename, depth_est) + # save confidence maps + save_pfm(confidence_filename, photometric_confidence) + # save cams, img + write_cam(cam_filename, cam) + img = np.clip(np.transpose(img, (1, 2, 0)) * 255, 0, + 255).astype(np.uint8) + img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + cv2.imwrite(img_filename, img_bgr) + + logger.info('depth estimation end') + return inputs + + def postprocess(self, inputs): + test_dir = os.path.dirname(inputs['casmvs_inp_dir']) + scene = os.path.basename(inputs['casmvs_inp_dir']) + logger.info('depth fusion start') + pcd = pcd_depth_filter( + scene, test_dir, inputs['casmvs_res_dir'], thres_view=4) + logger.info('depth fusion end') + return pcd diff --git a/modelscope/models/cv/image_mvs_depth_estimation/colmap2mvsnet.py b/modelscope/models/cv/image_mvs_depth_estimation/colmap2mvsnet.py new file mode 100644 index 00000000..feda4430 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/colmap2mvsnet.py @@ -0,0 +1,472 @@ +# The implementation is borrowed from https://github.com/YoYo000/MVSNet. Model reading is provided by COLMAP. + +from __future__ import print_function +import collections +import multiprocessing as mp +import os +import shutil +import struct +from functools import partial + +import cv2 +import numpy as np + +# ============================ read_model.py ============================# +CameraModel = collections.namedtuple('CameraModel', + ['model_id', 'model_name', 'num_params']) +Camera = collections.namedtuple('Camera', + ['id', 'model', 'width', 'height', 'params']) +BaseImage = collections.namedtuple( + 'Image', ['id', 'qvec', 'tvec', 'camera_id', 'name', 'xys', 'point3D_ids']) +Point3D = collections.namedtuple( + 'Point3D', ['id', 'xyz', 'rgb', 'error', 'image_ids', 'point2D_idxs']) + + +class Image(BaseImage): + + def qvec2rotmat(self): + return qvec2rotmat(self.qvec) + + +CAMERA_MODELS = { + CameraModel(model_id=0, model_name='SIMPLE_PINHOLE', num_params=3), + CameraModel(model_id=1, model_name='PINHOLE', num_params=4), + CameraModel(model_id=2, model_name='SIMPLE_RADIAL', num_params=4), + CameraModel(model_id=3, model_name='RADIAL', num_params=5), + CameraModel(model_id=4, model_name='OPENCV', num_params=8), + CameraModel(model_id=5, model_name='OPENCV_FISHEYE', num_params=8), + CameraModel(model_id=6, model_name='FULL_OPENCV', num_params=12), + CameraModel(model_id=7, model_name='FOV', num_params=5), + CameraModel(model_id=8, model_name='SIMPLE_RADIAL_FISHEYE', num_params=4), + CameraModel(model_id=9, model_name='RADIAL_FISHEYE', num_params=5), + CameraModel(model_id=10, model_name='THIN_PRISM_FISHEYE', num_params=12) +} +CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) + for camera_model in CAMERA_MODELS]) + + +def read_next_bytes(fid, + num_bytes, + format_char_sequence, + endian_character='<'): + """Read and unpack the next bytes from a binary file. 
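+
+    For example, read_next_bytes(fid, 24, 'iiQQ') unpacks one camera record header
+    as (camera_id, model_id, width, height); see read_cameras_binary() below.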
+ :param fid: + :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. + :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. + :param endian_character: Any of {@, =, <, >, !} + :return: Tuple of read and unpacked values. + """ + data = fid.read(num_bytes) + return struct.unpack(endian_character + format_char_sequence, data) + + +def read_cameras_text(path): + cameras = {} + with open(path, 'r', encoding='utf-8') as fid: + while True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + camera_id = int(elems[0]) + model = elems[1] + width = int(elems[2]) + height = int(elems[3]) + params = np.array(tuple(map(float, elems[4:]))) + cameras[camera_id] = Camera( + id=camera_id, + model=model, + width=width, + height=height, + params=params) + return cameras + + +def read_cameras_binary(path_to_model_file): + cameras = {} + with open(path_to_model_file, 'rb') as fid: + num_cameras = read_next_bytes(fid, 8, 'Q')[0] + for camera_line_index in range(num_cameras): + camera_properties = read_next_bytes( + fid, num_bytes=24, format_char_sequence='iiQQ') + camera_id = camera_properties[0] + model_id = camera_properties[1] + model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name + width = camera_properties[2] + height = camera_properties[3] + num_params = CAMERA_MODEL_IDS[model_id].num_params + params = read_next_bytes( + fid, + num_bytes=8 * num_params, + format_char_sequence='d' * num_params) + cameras[camera_id] = Camera( + id=camera_id, + model=model_name, + width=width, + height=height, + params=np.array(params)) + assert len(cameras) == num_cameras + return cameras + + +def read_images_text(path): + images = {} + with open(path, 'r', encoding='utf-8') as fid: + while True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + image_id = int(elems[0]) + qvec = np.array(tuple(map(float, elems[1:5]))) + tvec = np.array(tuple(map(float, elems[5:8]))) + camera_id = int(elems[8]) + image_name = elems[9] + elems = fid.readline().split() + xys = np.column_stack([ + tuple(map(float, elems[0::3])), + tuple(map(float, elems[1::3])) + ]) + point3D_ids = np.array(tuple(map(int, elems[2::3]))) + images[image_id] = Image( + id=image_id, + qvec=qvec, + tvec=tvec, + camera_id=camera_id, + name=image_name, + xys=xys, + point3D_ids=point3D_ids) + return images + + +def read_images_binary(path_to_model_file): + images = {} + with open(path_to_model_file, 'rb') as fid: + num_reg_images = read_next_bytes(fid, 8, 'Q')[0] + for image_index in range(num_reg_images): + binary_image_properties = read_next_bytes( + fid, num_bytes=64, format_char_sequence='idddddddi') + image_id = binary_image_properties[0] + qvec = np.array(binary_image_properties[1:5]) + tvec = np.array(binary_image_properties[5:8]) + camera_id = binary_image_properties[8] + image_name = '' + current_char = read_next_bytes(fid, 1, 'c')[0] + while current_char != b'\x00': # look for the ASCII 0 entry + image_name += current_char.decode('utf-8') + current_char = read_next_bytes(fid, 1, 'c')[0] + num_points2D = read_next_bytes( + fid, num_bytes=8, format_char_sequence='Q')[0] + x_y_id_s = read_next_bytes( + fid, + num_bytes=24 * num_points2D, + format_char_sequence='ddq' * num_points2D) + xys = np.column_stack([ + tuple(map(float, x_y_id_s[0::3])), + tuple(map(float, x_y_id_s[1::3])) + ]) + point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) + 
images[image_id] = Image( + id=image_id, + qvec=qvec, + tvec=tvec, + camera_id=camera_id, + name=image_name, + xys=xys, + point3D_ids=point3D_ids) + return images + + +def read_points3D_text(path): + points3D = {} + with open(path, 'r', encoding='utf-8') as fid: + while True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + point3D_id = int(elems[0]) + xyz = np.array(tuple(map(float, elems[1:4]))) + rgb = np.array(tuple(map(int, elems[4:7]))) + error = float(elems[7]) + image_ids = np.array(tuple(map(int, elems[8::2]))) + point2D_idxs = np.array(tuple(map(int, elems[9::2]))) + points3D[point3D_id] = Point3D( + id=point3D_id, + xyz=xyz, + rgb=rgb, + error=error, + image_ids=image_ids, + point2D_idxs=point2D_idxs) + return points3D + + +def read_points3d_binary(path_to_model_file): + points3D = {} + with open(path_to_model_file, 'rb') as fid: + num_points = read_next_bytes(fid, 8, 'Q')[0] + for point_line_index in range(num_points): + binary_point_line_properties = read_next_bytes( + fid, num_bytes=43, format_char_sequence='QdddBBBd') + point3D_id = binary_point_line_properties[0] + xyz = np.array(binary_point_line_properties[1:4]) + rgb = np.array(binary_point_line_properties[4:7]) + error = np.array(binary_point_line_properties[7]) + track_length = read_next_bytes( + fid, num_bytes=8, format_char_sequence='Q')[0] + track_elems = read_next_bytes( + fid, + num_bytes=8 * track_length, + format_char_sequence='ii' * track_length) + image_ids = np.array(tuple(map(int, track_elems[0::2]))) + point2D_idxs = np.array(tuple(map(int, track_elems[1::2]))) + points3D[point3D_id] = Point3D( + id=point3D_id, + xyz=xyz, + rgb=rgb, + error=error, + image_ids=image_ids, + point2D_idxs=point2D_idxs) + return points3D + + +def read_model(path, ext): + if ext == '.txt': + cameras = read_cameras_text(os.path.join(path, 'cameras' + ext)) + images = read_images_text(os.path.join(path, 'images' + ext)) + points3D = read_points3D_text(os.path.join(path, 'points3D') + ext) + else: + cameras = read_cameras_binary(os.path.join(path, 'cameras' + ext)) + images = read_images_binary(os.path.join(path, 'images' + ext)) + points3D = read_points3d_binary(os.path.join(path, 'points3D') + ext) + return cameras, images, points3D + + +def qvec2rotmat(qvec): + return np.array([ + [ + 1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, + 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], + 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2] + ], # noqa + [ + 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], + 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, + 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1] + ], # noqa + [ + 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], + 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], + 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2 + ] + ]) # noqa + + +def rotmat2qvec(R): + Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat + K = np.array( + [[Rxx - Ryy - Rzz, 0, 0, 0], [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], + [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], + [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 # noqa + eigvals, eigvecs = np.linalg.eigh(K) + qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] + if qvec[0] < 0: + qvec *= -1 + return qvec + + +def calc_score(inputs, images, points3d, extrinsic, args): + i, j = inputs + id_i = images[i + 1].point3D_ids + id_j = images[j + 1].point3D_ids + id_intersect = [it for it in id_i if it in id_j] + cam_center_i = -np.matmul(extrinsic[i + 1][:3, :3].transpose(), + extrinsic[i + 1][:3, 3:4])[:, 0] + cam_center_j = 
-np.matmul(extrinsic[j + 1][:3, :3].transpose(), + extrinsic[j + 1][:3, 3:4])[:, 0] + score = 0 + for pid in id_intersect: + if pid == -1: + continue + p = points3d[pid].xyz + theta = (180 / np.pi) * np.arccos( + np.dot(cam_center_i - p, cam_center_j - p) + / np.linalg.norm(cam_center_i - p) + / np.linalg.norm(cam_center_j - p)) + tmp_value = ( + 2 * # noqa + (args.sigma1 if theta <= args.theta0 else args.sigma2)**2) + score += np.exp(-(theta - args.theta0) * # noqa + (theta - args.theta0) / tmp_value) + return i, j, score + + +def processing_single_scene(args): + + image_dir = os.path.join(args.dense_folder, 'images') + model_dir = os.path.join(args.dense_folder, 'sparse') + cam_dir = os.path.join(args.save_folder, 'cams') + image_converted_dir = os.path.join(args.save_folder, 'images_post') + + if os.path.exists(image_converted_dir): + shutil.rmtree(image_converted_dir) + os.makedirs(image_converted_dir) + if os.path.exists(cam_dir): + shutil.rmtree(cam_dir) + + cameras, images, points3d = read_model(model_dir, args.model_ext) + num_images = len(list(images.items())) + + param_type = { + 'SIMPLE_PINHOLE': ['f', 'cx', 'cy'], + 'PINHOLE': ['fx', 'fy', 'cx', 'cy'], + 'SIMPLE_RADIAL': ['f', 'cx', 'cy', 'k'], + 'SIMPLE_RADIAL_FISHEYE': ['f', 'cx', 'cy', 'k'], + 'RADIAL': ['f', 'cx', 'cy', 'k1', 'k2'], + 'RADIAL_FISHEYE': ['f', 'cx', 'cy', 'k1', 'k2'], + 'OPENCV': ['fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2'], + 'OPENCV_FISHEYE': ['fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'k3', 'k4'], + 'FULL_OPENCV': [ + 'fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2', 'k3', 'k4', 'k5', + 'k6' + ], + 'FOV': ['fx', 'fy', 'cx', 'cy', 'omega'], + 'THIN_PRISM_FISHEYE': [ + 'fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2', 'k3', 'k4', 'sx1', + 'sy1' + ] + } + + # intrinsic + intrinsic = {} + for camera_id, cam in cameras.items(): + params_dict = { + key: value + for key, value in zip(param_type[cam.model], cam.params) + } + if 'f' in param_type[cam.model]: + params_dict['fx'] = params_dict['f'] + params_dict['fy'] = params_dict['f'] + i = np.array([[params_dict['fx'], 0, params_dict['cx']], + [0, params_dict['fy'], params_dict['cy']], [0, 0, 1]]) + intrinsic[camera_id] = i + + new_images = {} + for i, image_id in enumerate(sorted(images.keys())): + new_images[i + 1] = images[image_id] + images = new_images + + # extrinsic + extrinsic = {} + for image_id, image in images.items(): + e = np.zeros((4, 4)) + e[:3, :3] = qvec2rotmat(image.qvec) + e[:3, 3] = image.tvec + e[3, 3] = 1 + extrinsic[image_id] = e + + # depth range and interval + depth_ranges = {} + for i in range(num_images): + zs = [] + for p3d_id in images[i + 1].point3D_ids: + if p3d_id == -1: + continue + transformed = np.matmul(extrinsic[i + 1], [ + points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1], + points3d[p3d_id].xyz[2], 1 + ]) + zs.append(np.asscalar(transformed[2])) + zs_sorted = sorted(zs) + # relaxed depth range + max_ratio = 0.1 + min_ratio = 0.03 + num_max = max(5, int(len(zs) * max_ratio)) + num_min = max(1, int(len(zs) * min_ratio)) + depth_min = 1.0 * sum(zs_sorted[:num_min]) / len(zs_sorted[:num_min]) + depth_max = 1.0 * sum(zs_sorted[-num_max:]) / len(zs_sorted[-num_max:]) + if args.max_d == 0: + image_int = intrinsic[images[i + 1].camera_id] + image_ext = extrinsic[i + 1] + image_r = image_ext[0:3, 0:3] + image_t = image_ext[0:3, 3] + p1 = [image_int[0, 2], image_int[1, 2], 1] + p2 = [image_int[0, 2] + 1, image_int[1, 2], 1] + P1 = np.matmul(np.linalg.inv(image_int), p1) * depth_min + P1 = np.matmul(np.linalg.inv(image_r), (P1 - image_t)) + P2 
= np.matmul(np.linalg.inv(image_int), p2) * depth_min + P2 = np.matmul(np.linalg.inv(image_r), (P2 - image_t)) + depth_num = (1 / depth_min - 1 / depth_max) / ( + 1 / depth_min - 1 / (depth_min + np.linalg.norm(P2 - P1))) + else: + depth_num = args.max_d + depth_interval = (depth_max - depth_min) / (depth_num + - 1) / args.interval_scale + depth_ranges[i + 1] = (depth_min, depth_interval, depth_num, depth_max) + + # view selection + score = np.zeros((len(images), len(images))) + queue = [] + for i in range(len(images)): + for j in range(i + 1, len(images)): + queue.append((i, j)) + + p = mp.Pool(processes=mp.cpu_count()) + func = partial( + calc_score, + images=images, + points3d=points3d, + args=args, + extrinsic=extrinsic) + result = p.map(func, queue) + for i, j, s in result: + score[i, j] = s + score[j, i] = s + view_sel = [] + for i in range(len(images)): + sorted_score = np.argsort(score[i])[::-1] + view_sel.append([(k, score[i, k]) for k in sorted_score[:10]]) + + # write + os.makedirs(cam_dir, exist_ok=True) + + for i in range(num_images): + with open(os.path.join(cam_dir, '%08d_cam.txt' % i), 'w') as f: + f.write('extrinsic\n') + for j in range(4): + for k in range(4): + f.write(str(extrinsic[i + 1][j, k]) + ' ') + f.write('\n') + f.write('\nintrinsic\n') + for j in range(3): + for k in range(3): + f.write( + str(intrinsic[images[i + 1].camera_id][j, k]) + ' ') + f.write('\n') + f.write('\n%f %f %f %f\n' % + (depth_ranges[i + 1][0], depth_ranges[i + 1][1], + depth_ranges[i + 1][2], depth_ranges[i + 1][3])) + with open(os.path.join(args.save_folder, 'pair.txt'), 'w') as f: + f.write('%d\n' % len(images)) + for i, sorted_score in enumerate(view_sel): + f.write('%d\n%d ' % (i, len(sorted_score))) + for image_id, s in sorted_score: + f.write('%d %f ' % (image_id, s)) + f.write('\n') + + # convert to jpg + for i in range(num_images): + img_path = os.path.join(image_dir, images[i + 1].name) + if not img_path.endswith('.jpg'): + img = cv2.imread(img_path) + cv2.imwrite(os.path.join(image_converted_dir, '%08d.jpg' % i), img) + else: + shutil.copyfile( + os.path.join(image_dir, images[i + 1].name), + os.path.join(image_converted_dir, '%08d.jpg' % i)) diff --git a/modelscope/models/cv/image_mvs_depth_estimation/depth_filter.py b/modelscope/models/cv/image_mvs_depth_estimation/depth_filter.py new file mode 100644 index 00000000..16cdedf4 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/depth_filter.py @@ -0,0 +1,249 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch +import os + +import cv2 +import numpy as np +from PIL import Image +from plyfile import PlyData, PlyElement + +from .general_eval_dataset import read_pfm + + +# read intrinsics and extrinsics +def read_camera_parameters(filename): + with open(filename) as f: + lines = f.readlines() + lines = [line.rstrip() for line in lines] + # extrinsics: line [1,5), 4x4 matrix + extrinsics = np.fromstring( + ' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4)) + # intrinsics: line [7-10), 3x3 matrix + intrinsics = np.fromstring( + ' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3)) + # assume the feature is 1/4 of the original image size + # intrinsics[:2, :] /= 4 + return intrinsics, extrinsics + + +# read an image +def read_img(filename): + img = Image.open(filename) + # scale 0~255 to 0~1 + np_img = np.array(img, dtype=np.float32) / 255. 
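The `%08d_cam.txt` files written by processing_single_scene follow a fixed layout — an `extrinsic` block of four rows, a blank line, an `intrinsic` block of three rows, a blank line, then `depth_min depth_interval depth_num depth_max` — and read_camera_parameters relies on exactly that layout through its `[1:5)` / `[7:10)` line slices. A minimal, self-contained parsing sketch under that assumption (identity extrinsics and made-up intrinsics/depth values):

import numpy as np

def parse_cam_txt(text):
    # Same slicing convention as read_camera_parameters: lines [1, 5) hold the
    # 4x4 extrinsic, lines [7, 10) the 3x3 intrinsic, line 11 the depth range.
    lines = [ln.rstrip() for ln in text.splitlines()]
    extrinsics = np.array(' '.join(lines[1:5]).split(), dtype=np.float32).reshape(4, 4)
    intrinsics = np.array(' '.join(lines[7:10]).split(), dtype=np.float32).reshape(3, 3)
    depth_min, depth_interval, depth_num, depth_max = map(float, lines[11].split())
    return intrinsics, extrinsics, (depth_min, depth_interval, depth_num, depth_max)

sample = (
    'extrinsic\n'
    '1 0 0 0\n0 1 0 0\n0 0 1 0\n0 0 0 1\n'
    '\nintrinsic\n'
    '800 0 320\n0 800 240\n0 0 1\n'
    '\n425.0 2.5 192 902.5\n'
)
intrinsics, extrinsics, depth_range = parse_cam_txt(sample)
assert extrinsics.shape == (4, 4) and intrinsics.shape == (3, 3)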
+ return np_img + + +# read a binary mask +def read_mask(filename): + return read_img(filename) > 0.5 + + +# save a binary mask +def save_mask(filename, mask): + assert mask.dtype == np.bool + mask = mask.astype(np.uint8) * 255 + Image.fromarray(mask).save(filename) + + +# read a pair file, [(ref_view1, [src_view1-1, ...]), (ref_view2, [src_view2-1, ...]), ...] +def read_pair_file(filename): + data = [] + with open(filename) as f: + num_viewpoint = int(f.readline()) + # 49 viewpoints + for view_idx in range(num_viewpoint): + ref_view = int(f.readline().rstrip()) + src_views = [int(x) for x in f.readline().rstrip().split()[1::2]] + if len(src_views) > 0: + data.append((ref_view, src_views)) + return data + + +# project the reference point cloud into the source view, then project back +def reproject_with_depth(depth_ref, intrinsics_ref, extrinsics_ref, depth_src, + intrinsics_src, extrinsics_src): + width, height = depth_ref.shape[1], depth_ref.shape[0] + # step1. project reference pixels to the source view + # reference view x, y + x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height)) + x_ref, y_ref = x_ref.reshape([-1]), y_ref.reshape([-1]) + # reference 3D space + xyz_ref = np.matmul( + np.linalg.inv(intrinsics_ref), + np.vstack( + (x_ref, y_ref, np.ones_like(x_ref))) * depth_ref.reshape([-1])) + # source 3D space + xyz_src = np.matmul( + np.matmul(extrinsics_src, np.linalg.inv(extrinsics_ref)), + np.vstack((xyz_ref, np.ones_like(x_ref))))[:3] + # source view x, y + K_xyz_src = np.matmul(intrinsics_src, xyz_src) + xy_src = K_xyz_src[:2] / K_xyz_src[2:3] + + # step2. reproject the source view points with source view depth estimation + # find the depth estimation of the source view + x_src = xy_src[0].reshape([height, width]).astype(np.float32) + y_src = xy_src[1].reshape([height, width]).astype(np.float32) + sampled_depth_src = cv2.remap( + depth_src, x_src, y_src, interpolation=cv2.INTER_LINEAR) + + # source 3D space + # NOTE that we should use sampled source-view depth_here to project back + xyz_src = np.matmul( + np.linalg.inv(intrinsics_src), + np.vstack( + (xy_src, np.ones_like(x_ref))) * sampled_depth_src.reshape([-1])) + # reference 3D space + xyz_reprojected = np.matmul( + np.matmul(extrinsics_ref, np.linalg.inv(extrinsics_src)), + np.vstack((xyz_src, np.ones_like(x_ref))))[:3] + # source view x, y, depth + depth_reprojected = xyz_reprojected[2].reshape([height, + width]).astype(np.float32) + K_xyz_reprojected = np.matmul(intrinsics_ref, xyz_reprojected) + xy_reprojected = K_xyz_reprojected[:2] / K_xyz_reprojected[2:3] + x_reprojected = xy_reprojected[0].reshape([height, + width]).astype(np.float32) + y_reprojected = xy_reprojected[1].reshape([height, + width]).astype(np.float32) + + return depth_reprojected, x_reprojected, y_reprojected, x_src, y_src + + +def check_geometric_consistency(depth_ref, intrinsics_ref, extrinsics_ref, + depth_src, intrinsics_src, extrinsics_src): + width, height = depth_ref.shape[1], depth_ref.shape[0] + x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height)) + depth_reprojected, x2d_reprojected, y2d_reprojected, x2d_src, y2d_src = reproject_with_depth( + depth_ref, intrinsics_ref, extrinsics_ref, depth_src, intrinsics_src, + extrinsics_src) + # check |p_reproj-p_1| < 1 + dist = np.sqrt((x2d_reprojected - x_ref)**2 + (y2d_reprojected - y_ref)**2) + + # check |d_reproj-d_1| / d_1 < 0.01 + depth_diff = np.abs(depth_reprojected - depth_ref) + relative_depth_diff = depth_diff / depth_ref + + mask = np.logical_and(dist < 1, 
relative_depth_diff < 0.01) + depth_reprojected[~mask] = 0 + + return mask, depth_reprojected, x2d_src, y2d_src + + +def filter_depth(pair_folder, scan_folder, out_folder, thres_view): + # the pair file + pair_file = os.path.join(pair_folder, 'pair.txt') + # for the final point cloud + vertexs = [] + vertex_colors = [] + + pair_data = read_pair_file(pair_file) + + # for each reference view and the corresponding source views + for ref_view, src_views in pair_data: + # src_views = src_views[:args.num_view] + # load the camera parameters + ref_intrinsics, ref_extrinsics = read_camera_parameters( + os.path.join(scan_folder, 'cams/{:0>8}_cam.txt'.format(ref_view))) + # load the reference image + ref_img = read_img( + os.path.join(scan_folder, 'images/{:0>8}.jpg'.format(ref_view))) + # load the estimated depth of the reference view + ref_depth_est = read_pfm( + os.path.join(out_folder, + 'depth_est/{:0>8}.pfm'.format(ref_view)))[0] + # load the photometric mask of the reference view + confidence = read_pfm( + os.path.join(out_folder, + 'confidence/{:0>8}.pfm'.format(ref_view)))[0] + photo_mask = confidence > 0.9 + + all_srcview_depth_ests = [] + all_srcview_x = [] + all_srcview_y = [] + all_srcview_geomask = [] + + # compute the geometric mask + geo_mask_sum = 0 + for src_view in src_views: + # camera parameters of the source view + src_intrinsics, src_extrinsics = read_camera_parameters( + os.path.join(scan_folder, + 'cams/{:0>8}_cam.txt'.format(src_view))) + # the estimated depth of the source view + src_depth_est = read_pfm( + os.path.join(out_folder, + 'depth_est/{:0>8}.pfm'.format(src_view)))[0] + + geo_mask, depth_reprojected, x2d_src, y2d_src = check_geometric_consistency( + ref_depth_est, ref_intrinsics, ref_extrinsics, src_depth_est, + src_intrinsics, src_extrinsics) + geo_mask_sum += geo_mask.astype(np.int32) + all_srcview_depth_ests.append(depth_reprojected) + all_srcview_x.append(x2d_src) + all_srcview_y.append(y2d_src) + all_srcview_geomask.append(geo_mask) + + depth_est_averaged = (sum(all_srcview_depth_ests) + ref_depth_est) / ( + geo_mask_sum + 1) + # at least 3 source views matched + geo_mask = geo_mask_sum >= thres_view + final_mask = np.logical_and(photo_mask, geo_mask) + + os.makedirs(os.path.join(out_folder, 'mask'), exist_ok=True) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_photo.png'.format(ref_view)), + photo_mask) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_geo.png'.format(ref_view)), + geo_mask) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_final.png'.format(ref_view)), + final_mask) + + height, width = depth_est_averaged.shape[:2] + x, y = np.meshgrid(np.arange(0, width), np.arange(0, height)) + valid_points = final_mask + x, y, depth = x[valid_points], y[valid_points], depth_est_averaged[ + valid_points] + + color = ref_img[valid_points] + + xyz_ref = np.matmul( + np.linalg.inv(ref_intrinsics), + np.vstack((x, y, np.ones_like(x))) * depth) + xyz_world = np.matmul( + np.linalg.inv(ref_extrinsics), np.vstack( + (xyz_ref, np.ones_like(x))))[:3] + vertexs.append(xyz_world.transpose((1, 0))) + vertex_colors.append((color * 255).astype(np.uint8)) + + vertexs = np.concatenate(vertexs, axis=0) + vertex_colors = np.concatenate(vertex_colors, axis=0) + vertexs = np.array([tuple(v) for v in vertexs], + dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')]) + vertex_colors = np.array([tuple(v) for v in vertex_colors], + dtype=[('red', 'u1'), ('green', 'u1'), + ('blue', 'u1')]) + + vertex_all = np.empty( + len(vertexs), vertexs.dtype.descr + 
vertex_colors.dtype.descr) + for prop in vertexs.dtype.names: + vertex_all[prop] = vertexs[prop] + for prop in vertex_colors.dtype.names: + vertex_all[prop] = vertex_colors[prop] + + el = PlyElement.describe(vertex_all, 'vertex') + # PlyData([el]).write(plyfilename) + pcd = PlyData([el]) + + return pcd + + +def pcd_depth_filter(scene, test_dir, save_dir, thres_view): + old_scene_folder = os.path.join(test_dir, scene) + new_scene_folder = os.path.join(save_dir, scene) + out_folder = os.path.join(save_dir, scene) + pcd = filter_depth(old_scene_folder, new_scene_folder, out_folder, + thres_view) + return pcd diff --git a/modelscope/models/cv/image_mvs_depth_estimation/general_eval_dataset.py b/modelscope/models/cv/image_mvs_depth_estimation/general_eval_dataset.py new file mode 100644 index 00000000..87ab311d --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/general_eval_dataset.py @@ -0,0 +1,284 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import re +import sys + +import cv2 +import numpy as np +from PIL import Image +from torch.utils.data import Dataset + + +def read_pfm(filename): + file = open(filename, 'rb') + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().decode('utf-8').rstrip() + if header == 'PF': + color = True + elif header == 'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('utf-8')) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + file.close() + return data, scale + + +def save_pfm(filename, image, scale=1): + file = open(filename, 'wb') + color = None + + image = np.flipud(image) + + if image.dtype.name != 'float32': + raise Exception('Image dtype must be float32.') + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif len(image.shape) == 2 or len( + image.shape) == 3 and image.shape[2] == 1: # greyscale + color = False + else: + raise Exception( + 'Image must have H x W x 3, H x W x 1 or H x W dimensions.') + + file.write('PF\n'.encode('utf-8') if color else 'Pf\n'.encode('utf-8')) + file.write('{} {}\n'.format(image.shape[1], + image.shape[0]).encode('utf-8')) + + endian = image.dtype.byteorder + + if endian == '<' or endian == '=' and sys.byteorder == 'little': + scale = -scale + + file.write(('%f\n' % scale).encode('utf-8')) + + image.tofile(file) + file.close() + + +S_H, S_W = 0, 0 + + +class MVSDataset(Dataset): + + def __init__(self, + datapath, + listfile, + mode, + nviews, + ndepths=192, + interval_scale=1.06, + **kwargs): + super(MVSDataset, self).__init__() + self.datapath = datapath + self.listfile = listfile + self.mode = mode + self.nviews = nviews + self.ndepths = ndepths + self.interval_scale = interval_scale + self.max_h, self.max_w = kwargs['max_h'], kwargs['max_w'] + self.fix_res = kwargs.get( + 'fix_res', False) # whether to fix the resolution of input image. 
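read_pfm and save_pfm above follow the usual PFM conventions: a 'Pf' header for single-channel float maps, rows stored bottom-up (hence the np.flipud), and a negative scale marking little-endian data. A small in-memory round trip illustrating just those conventions — it does not call the functions above, it only mirrors their header rules:

import io
import numpy as np

def pfm_roundtrip(depth):
    # Write: 'Pf' header, "width height", negative scale for little-endian,
    # then the rows flipped upside-down, exactly as save_pfm does.
    buf = io.BytesIO()
    buf.write(b'Pf\n')
    buf.write('{} {}\n'.format(depth.shape[1], depth.shape[0]).encode('utf-8'))
    buf.write(b'-1.0\n')
    buf.write(np.flipud(depth).astype('<f4').tobytes())

    # Read it back following the read_pfm logic.
    buf.seek(0)
    assert buf.readline().rstrip() == b'Pf'
    width, height = map(int, buf.readline().split())
    scale = float(buf.readline())
    endian = '<' if scale < 0 else '>'
    data = np.frombuffer(buf.read(), dtype=endian + 'f4').reshape(height, width)
    return np.flipud(data)

depth = np.random.rand(4, 5).astype(np.float32)
assert np.allclose(pfm_roundtrip(depth), depth)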
+ self.fix_wh = False + + assert self.mode == 'test' + self.metas = self.build_list() + + def build_list(self): + metas = [] + scans = self.listfile + + interval_scale_dict = {} + # scans + for scan in scans: + # determine the interval scale of each scene. default is 1.06 + if isinstance(self.interval_scale, float): + interval_scale_dict[scan] = self.interval_scale + else: + interval_scale_dict[scan] = self.interval_scale[scan] + + pair_file = '{}/pair.txt'.format(scan) + # read the pair file + with open(os.path.join(self.datapath, pair_file)) as f: + num_viewpoint = int(f.readline()) + # viewpoints + for view_idx in range(num_viewpoint): + ref_view = int(f.readline().rstrip()) + src_views = [ + int(x) for x in f.readline().rstrip().split()[1::2] + ] + # filter by no src view and fill to nviews + if len(src_views) > 0: + if len(src_views) < self.nviews: + src_views += [src_views[0]] * ( + self.nviews - len(src_views)) + metas.append((scan, ref_view, src_views, scan)) + + self.interval_scale = interval_scale_dict + return metas + + def __len__(self): + return len(self.metas) + + def read_cam_file(self, filename, interval_scale): + with open(filename) as f: + lines = f.readlines() + lines = [line.rstrip() for line in lines] + # extrinsics: line [1,5), 4x4 matrix + extrinsics = np.fromstring( + ' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4)) + # intrinsics: line [7-10), 3x3 matrix + intrinsics = np.fromstring( + ' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3)) + intrinsics[:2, :] /= 4.0 + # depth_min & depth_interval: line 11 + depth_min = float(lines[11].split()[0]) + depth_interval = float(lines[11].split()[1]) + + if len(lines[11].split()) >= 3: + num_depth = lines[11].split()[2] + depth_max = depth_min + int(float(num_depth)) * depth_interval + depth_interval = (depth_max - depth_min) / self.ndepths + + depth_interval *= interval_scale + + return intrinsics, extrinsics, depth_min, depth_interval + + def read_img(self, filename): + img = Image.open(filename) + # scale 0~255 to 0~1 + np_img = np.array(img, dtype=np.float32) / 255. 
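build_list above (like read_pair_file in depth_filter.py) consumes the pair.txt layout written by processing_single_scene: a viewpoint count, then for each reference view one line with its id and one line of "N src1 score1 src2 score2 ...", where the `[1::2]` slice keeps the source ids and drops the scores. A tiny standalone illustration with made-up ids and scores:

# Two reference views; the second has a single source view.
pair_txt = '2\n0\n2 1 153.5 2 87.2\n1\n1 0 153.5\n'
lines = iter(pair_txt.splitlines())
num_viewpoint = int(next(lines))
pairs = []
for _ in range(num_viewpoint):
    ref_view = int(next(lines))
    # split() gives [count, id, score, id, score, ...]; [1::2] keeps the ids.
    src_views = [int(x) for x in next(lines).split()[1::2]]
    pairs.append((ref_view, src_views))
assert pairs == [(0, [1, 2]), (1, [0])]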
+ + return np_img + + def read_depth(self, filename): + # read pfm depth file + return np.array(read_pfm(filename)[0], dtype=np.float32) + + def scale_mvs_input(self, img, intrinsics, max_w, max_h, base=32): + h, w = img.shape[:2] + if h > max_h or w > max_w: + scale = 1.0 * max_h / h + if scale * w > max_w: + scale = 1.0 * max_w / w + new_w, new_h = scale * w // base * base, scale * h // base * base + else: + new_w, new_h = 1.0 * w // base * base, 1.0 * h // base * base + + scale_w = 1.0 * new_w / w + scale_h = 1.0 * new_h / h + intrinsics[0, :] *= scale_w + intrinsics[1, :] *= scale_h + + img = cv2.resize(img, (int(new_w), int(new_h))) + + return img, intrinsics + + def __getitem__(self, idx): + global S_H, S_W + meta = self.metas[idx] + scan, ref_view, src_views, scene_name = meta + # use only the reference view and first nviews-1 source views + view_ids = [ref_view] + src_views[:self.nviews - 1] + + imgs = [] + depth_values = None + proj_matrices = [] + + for i, vid in enumerate(view_ids): + img_filename = os.path.join( + self.datapath, '{}/images_post/{:0>8}.jpg'.format(scan, vid)) + if not os.path.exists(img_filename): + img_filename = os.path.join( + self.datapath, '{}/images/{:0>8}.jpg'.format(scan, vid)) + + proj_mat_filename = os.path.join( + self.datapath, '{}/cams/{:0>8}_cam.txt'.format(scan, vid)) + + img = self.read_img(img_filename) + intrinsics, extrinsics, depth_min, depth_interval = self.read_cam_file( + proj_mat_filename, + interval_scale=self.interval_scale[scene_name]) + # scale input + img, intrinsics = self.scale_mvs_input(img, intrinsics, self.max_w, + self.max_h) + + if self.fix_res: + # using the same standard height or width in entire scene. + S_H, S_W = img.shape[:2] + self.fix_res = False + self.fix_wh = True + + if i == 0: + if not self.fix_wh: + # using the same standard height or width in each nviews. 
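scale_mvs_input above both caps the image at (max_h, max_w) and snaps the result down to a multiple of base=32, rescaling the intrinsics rows to match. A minimal numeric sketch of that convention with an assumed input size and camera matrix:

import numpy as np

# 1200x1600 input clipped to max_h=1056, max_w=1920, snapped to base=32.
h, w, max_h, max_w, base = 1200, 1600, 1056, 1920, 32
scale = min(1.0, max_h / h, max_w / w)
new_w = int(scale * w // base * base)
new_h = int(scale * h // base * base)
K = np.array([[1000.0, 0.0, 800.0], [0.0, 1000.0, 600.0], [0.0, 0.0, 1.0]])
K[0, :] *= new_w / w  # fx and cx follow the width scaling
K[1, :] *= new_h / h  # fy and cy follow the height scaling
print(new_h, new_w)   # 1056 1408, both multiples of 32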
+ S_H, S_W = img.shape[:2] + + # resize to standard height or width + c_h, c_w = img.shape[:2] + if (c_h != S_H) or (c_w != S_W): + scale_h = 1.0 * S_H / c_h + scale_w = 1.0 * S_W / c_w + img = cv2.resize(img, (S_W, S_H)) + intrinsics[0, :] *= scale_w + intrinsics[1, :] *= scale_h + + imgs.append(img) + # extrinsics, intrinsics + proj_mat = np.zeros(shape=(2, 4, 4), dtype=np.float32) # + proj_mat[0, :4, :4] = extrinsics + proj_mat[1, :3, :3] = intrinsics + proj_matrices.append(proj_mat) + + if i == 0: # reference view + depth_values = np.arange( + depth_min, + depth_interval * (self.ndepths - 0.5) + depth_min, + depth_interval, + dtype=np.float32) + + # all + imgs = np.stack(imgs).transpose([0, 3, 1, 2]) + proj_matrices = np.stack(proj_matrices) + + stage2_pjmats = proj_matrices.copy() + stage2_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] * 2 + stage3_pjmats = proj_matrices.copy() + stage3_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] * 4 + + proj_matrices_ms = { + 'stage1': proj_matrices, + 'stage2': stage2_pjmats, + 'stage3': stage3_pjmats + } + + return { + 'imgs': imgs, + 'proj_matrices': proj_matrices_ms, + 'depth_values': depth_values, + 'filename': scan + '/{}/' + '{:0>8}'.format(view_ids[0]) + '{}' + } diff --git a/modelscope/models/cv/image_mvs_depth_estimation/module.py b/modelscope/models/cv/image_mvs_depth_estimation/module.py new file mode 100644 index 00000000..2ffda232 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/module.py @@ -0,0 +1,678 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def init_bn(module): + if module.weight is not None: + nn.init.ones_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + return + + +def init_uniform(module, init_method): + if module.weight is not None: + if init_method == 'kaiming': + nn.init.kaiming_uniform_(module.weight) + elif init_method == 'xavier': + nn.init.xavier_uniform_(module.weight) + return + + +class Conv2d(nn.Module): + """Applies a 2D convolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. + + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Conv2d, self).__init__() + + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.kernel_size = kernel_size + self.stride = stride + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Deconv2d(nn.Module): + """Applies a 2D deconvolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. 
+ + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Deconv2d, self).__init__() + self.out_channels = out_channels + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + y = self.conv(x) + if self.stride == 2: + h, w = list(x.size())[2:] + y = y[:, :, :2 * h, :2 * w].contiguous() + if self.bn is not None: + x = self.bn(y) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Conv3d(nn.Module): + """Applies a 3D convolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. + + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Conv3d, self).__init__() + self.out_channels = out_channels + self.kernel_size = kernel_size + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.Conv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm3d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Deconv3d(nn.Module): + """Applies a 3D deconvolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. 
+ + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Deconv3d, self).__init__() + self.out_channels = out_channels + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm3d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + y = self.conv(x) + if self.bn is not None: + x = self.bn(y) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class ConvBnReLU(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBnReLU, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + return F.relu(self.bn(self.conv(x)), inplace=True) + + +class ConvBn(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBn, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + return self.bn(self.conv(x)) + + +def homo_warping(src_fea, src_proj, ref_proj, depth_values): + """ + src_fea: [B, C, H, W] + src_proj: [B, 4, 4] + ref_proj: [B, 4, 4] + depth_values: [B, Ndepth] o [B, Ndepth, H, W] + out: [B, C, Ndepth, H, W] + """ + batch, channels = src_fea.shape[0], src_fea.shape[1] + num_depth = depth_values.shape[1] + height, width = src_fea.shape[2], src_fea.shape[3] + + with torch.no_grad(): + proj = torch.matmul(src_proj, torch.inverse(ref_proj)) + rot = proj[:, :3, :3] # [B,3,3] + trans = proj[:, :3, 3:4] # [B,3,1] + + y, x = torch.meshgrid([ + torch.arange( + 0, height, dtype=torch.float32, device=src_fea.device), + torch.arange(0, width, dtype=torch.float32, device=src_fea.device) + ]) + y, x = y.contiguous(), x.contiguous() + y, x = y.view(height * width), x.view(height * width) + xyz = torch.stack((x, y, torch.ones_like(x))) # [3, H*W] + xyz = torch.unsqueeze(xyz, 0).repeat(batch, 1, 1) # [B, 3, H*W] + rot_xyz = torch.matmul(rot, xyz) # [B, 3, H*W] + rot_depth_xyz = rot_xyz.unsqueeze(2).repeat( + 1, 1, num_depth, 1) * depth_values.view(batch, 1, num_depth, + -1) # [B, 3, Ndepth, H*W] + proj_xyz = rot_depth_xyz + trans.view(batch, 3, 1, + 1) # [B, 3, Ndepth, H*W] + proj_xy = proj_xyz[:, : + 2, :, :] / proj_xyz[:, 2: + 3, :, :] # [B, 2, Ndepth, H*W] + proj_x_normalized = proj_xy[:, 0, :, :] / ((width - 1) / 2) - 1 + proj_y_normalized = proj_xy[:, 1, :, :] / ((height - 1) / 2) - 1 + proj_xy = torch.stack((proj_x_normalized, proj_y_normalized), + dim=3) # [B, Ndepth, H*W, 2] + grid = proj_xy + + warped_src_fea = F.grid_sample( + src_fea, + grid.view(batch, num_depth * height, width, 2), + mode='bilinear', + padding_mode='zeros', + align_corners=True) + warped_src_fea = warped_src_fea.view(batch, channels, num_depth, 
height, + width) + + return warped_src_fea + + +class DeConv2dFuse(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + relu=True, + bn=True, + bn_momentum=0.1): + super(DeConv2dFuse, self).__init__() + + self.deconv = Deconv2d( + in_channels, + out_channels, + kernel_size, + stride=2, + padding=1, + output_padding=1, + bn=True, + relu=relu, + bn_momentum=bn_momentum) + + self.conv = Conv2d( + 2 * out_channels, + out_channels, + kernel_size, + stride=1, + padding=1, + bn=bn, + relu=relu, + bn_momentum=bn_momentum) + + def forward(self, x_pre, x): + x = self.deconv(x) + x = torch.cat((x, x_pre), dim=1) + x = self.conv(x) + return x + + +class FeatureNet(nn.Module): + + def __init__(self, base_channels, num_stage=3, stride=4, arch_mode='unet'): + super(FeatureNet, self).__init__() + assert arch_mode in [ + 'unet', 'fpn' + ], f"mode must be in 'unet' or 'fpn', but get:{arch_mode}" + self.arch_mode = arch_mode + self.stride = stride + self.base_channels = base_channels + self.num_stage = num_stage + + self.conv0 = nn.Sequential( + Conv2d(3, base_channels, 3, 1, padding=1), + Conv2d(base_channels, base_channels, 3, 1, padding=1), + ) + + self.conv1 = nn.Sequential( + Conv2d(base_channels, base_channels * 2, 5, stride=2, padding=2), + Conv2d(base_channels * 2, base_channels * 2, 3, 1, padding=1), + Conv2d(base_channels * 2, base_channels * 2, 3, 1, padding=1), + ) + + self.conv2 = nn.Sequential( + Conv2d( + base_channels * 2, base_channels * 4, 5, stride=2, padding=2), + Conv2d(base_channels * 4, base_channels * 4, 3, 1, padding=1), + Conv2d(base_channels * 4, base_channels * 4, 3, 1, padding=1), + ) + + self.out1 = nn.Conv2d( + base_channels * 4, base_channels * 4, 1, bias=False) + self.out_channels = [4 * base_channels] + + if self.arch_mode == 'unet': + if num_stage == 3: + self.deconv1 = DeConv2dFuse(base_channels * 4, + base_channels * 2, 3) + self.deconv2 = DeConv2dFuse(base_channels * 2, base_channels, + 3) + + self.out2 = nn.Conv2d( + base_channels * 2, base_channels * 2, 1, bias=False) + self.out3 = nn.Conv2d( + base_channels, base_channels, 1, bias=False) + self.out_channels.append(2 * base_channels) + self.out_channels.append(base_channels) + + elif num_stage == 2: + self.deconv1 = DeConv2dFuse(base_channels * 4, + base_channels * 2, 3) + + self.out2 = nn.Conv2d( + base_channels * 2, base_channels * 2, 1, bias=False) + self.out_channels.append(2 * base_channels) + elif self.arch_mode == 'fpn': + final_chs = base_channels * 4 + if num_stage == 3: + self.inner1 = nn.Conv2d( + base_channels * 2, final_chs, 1, bias=True) + self.inner2 = nn.Conv2d( + base_channels * 1, final_chs, 1, bias=True) + + self.out2 = nn.Conv2d( + final_chs, base_channels * 2, 3, padding=1, bias=False) + self.out3 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + self.out_channels.append(base_channels * 2) + self.out_channels.append(base_channels) + + elif num_stage == 2: + self.inner1 = nn.Conv2d( + base_channels * 2, final_chs, 1, bias=True) + + self.out2 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + self.out_channels.append(base_channels) + + def forward(self, x): + conv0 = self.conv0(x) + conv1 = self.conv1(conv0) + conv2 = self.conv2(conv1) + + intra_feat = conv2 + outputs = {} + out = self.out1(intra_feat) + outputs['stage1'] = out + if self.arch_mode == 'unet': + if self.num_stage == 3: + intra_feat = self.deconv1(conv1, intra_feat) + out = self.out2(intra_feat) + outputs['stage2'] = out + + intra_feat = self.deconv2(conv0, intra_feat) + 
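homo_warping above relies on F.grid_sample's coordinate convention: pixel x is normalized to [-1, 1] via x / ((W - 1) / 2) - 1 (and likewise for y), with align_corners=True. A quick sanity sketch with small made-up shapes, showing that an identity grid built this way reproduces the source features:

import torch
import torch.nn.functional as F

B, C, H, W = 1, 2, 4, 6
src_fea = torch.arange(B * C * H * W, dtype=torch.float32).view(B, C, H, W)
y, x = torch.meshgrid(torch.arange(H, dtype=torch.float32),
                      torch.arange(W, dtype=torch.float32))
# grid[..., 0] is x, grid[..., 1] is y, both normalized to [-1, 1].
grid = torch.stack((x / ((W - 1) / 2) - 1, y / ((H - 1) / 2) - 1), dim=-1)
warped = F.grid_sample(src_fea, grid.unsqueeze(0), mode='bilinear',
                       padding_mode='zeros', align_corners=True)
assert torch.allclose(warped, src_fea)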
out = self.out3(intra_feat) + outputs['stage3'] = out + + elif self.num_stage == 2: + intra_feat = self.deconv1(conv1, intra_feat) + out = self.out2(intra_feat) + outputs['stage2'] = out + + elif self.arch_mode == 'fpn': + if self.num_stage == 3: + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner1(conv1) + out = self.out2(intra_feat) + outputs['stage2'] = out + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner2(conv0) + out = self.out3(intra_feat) + outputs['stage3'] = out + + elif self.num_stage == 2: + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner1(conv1) + out = self.out2(intra_feat) + outputs['stage2'] = out + + return outputs + + +class CostRegNet(nn.Module): + + def __init__(self, in_channels, base_channels): + super(CostRegNet, self).__init__() + self.conv0 = Conv3d(in_channels, base_channels, padding=1) + + self.conv1 = Conv3d( + base_channels, base_channels * 2, stride=2, padding=1) + self.conv2 = Conv3d(base_channels * 2, base_channels * 2, padding=1) + + self.conv3 = Conv3d( + base_channels * 2, base_channels * 4, stride=2, padding=1) + self.conv4 = Conv3d(base_channels * 4, base_channels * 4, padding=1) + + self.conv5 = Conv3d( + base_channels * 4, base_channels * 8, stride=2, padding=1) + self.conv6 = Conv3d(base_channels * 8, base_channels * 8, padding=1) + + self.conv7 = Deconv3d( + base_channels * 8, + base_channels * 4, + stride=2, + padding=1, + output_padding=1) + + self.conv9 = Deconv3d( + base_channels * 4, + base_channels * 2, + stride=2, + padding=1, + output_padding=1) + + self.conv11 = Deconv3d( + base_channels * 2, + base_channels * 1, + stride=2, + padding=1, + output_padding=1) + + self.prob = nn.Conv3d( + base_channels, 1, 3, stride=1, padding=1, bias=False) + + def forward(self, x): + conv0 = self.conv0(x) + conv2 = self.conv2(self.conv1(conv0)) + conv4 = self.conv4(self.conv3(conv2)) + x = self.conv6(self.conv5(conv4)) + x = conv4 + self.conv7(x) + x = conv2 + self.conv9(x) + x = conv0 + self.conv11(x) + x = self.prob(x) + return x + + +class RefineNet(nn.Module): + + def __init__(self): + super(RefineNet, self).__init__() + self.conv1 = ConvBnReLU(4, 32) + self.conv2 = ConvBnReLU(32, 32) + self.conv3 = ConvBnReLU(32, 32) + self.res = ConvBnReLU(32, 1) + + def forward(self, img, depth_init): + concat = F.cat((img, depth_init), dim=1) + depth_residual = self.res(self.conv3(self.conv2(self.conv1(concat)))) + depth_refined = depth_init + depth_residual + return depth_refined + + +def depth_regression(p, depth_values): + if depth_values.dim() <= 2: + depth_values = depth_values.view(*depth_values.shape, 1, 1) + depth = torch.sum(p * depth_values, 1) + + return depth + + +def cas_mvsnet_loss(inputs, depth_gt_ms, mask_ms, **kwargs): + depth_loss_weights = kwargs.get('dlossw', None) + + total_loss = torch.tensor( + 0.0, + dtype=torch.float32, + device=mask_ms['stage1'].device, + requires_grad=False) + + for (stage_inputs, stage_key) in [(inputs[k], k) for k in inputs.keys() + if 'stage' in k]: + depth_est = stage_inputs['depth'] + depth_gt = depth_gt_ms[stage_key] + mask = mask_ms[stage_key] + mask = mask > 0.5 + + depth_loss = F.smooth_l1_loss( + depth_est[mask], depth_gt[mask], reduction='mean') + + if depth_loss_weights is not None: + stage_idx = int(stage_key.replace('stage', '')) - 1 + total_loss += depth_loss_weights[stage_idx] * depth_loss + else: + total_loss += 1.0 * depth_loss + + return total_loss, depth_loss + + +def 
get_cur_depth_range_samples(cur_depth, + ndepth, + depth_inteval_pixel, + shape, + max_depth=192.0, + min_depth=0.0): + """ + shape, (B, H, W) + cur_depth: (B, H, W) + return depth_range_values: (B, D, H, W) + """ + cur_depth_min = (cur_depth - ndepth / 2 * depth_inteval_pixel) # (B, H, W) + cur_depth_max = (cur_depth + ndepth / 2 * depth_inteval_pixel) + + assert cur_depth.shape == torch.Size( + shape), 'cur_depth:{}, input shape:{}'.format(cur_depth.shape, shape) + new_interval = (cur_depth_max - cur_depth_min) / (ndepth - 1) # (B, H, W) + + depth_range_samples = cur_depth_min.unsqueeze(1) + ( + torch.arange( + 0, + ndepth, + device=cur_depth.device, + dtype=cur_depth.dtype, + requires_grad=False).reshape(1, -1, 1, 1) + * new_interval.unsqueeze(1)) + + return depth_range_samples + + +def get_depth_range_samples(cur_depth, + ndepth, + depth_inteval_pixel, + device, + dtype, + shape, + max_depth=192.0, + min_depth=0.0): + """ + shape: (B, H, W) + cur_depth: (B, H, W) or (B, D) + return depth_range_samples: (B, D, H, W) + """ + if cur_depth.dim() == 2: + cur_depth_min = cur_depth[:, 0] # (B,) + cur_depth_max = cur_depth[:, -1] + new_interval = (cur_depth_max - cur_depth_min) / (ndepth - 1) # (B, ) + + depth_range_samples = cur_depth_min.unsqueeze(1) + (torch.arange( + 0, ndepth, device=device, dtype=dtype, + requires_grad=False).reshape(1, -1) * new_interval.unsqueeze(1) + ) # noqa # (B, D) + + depth_range_samples = depth_range_samples.unsqueeze(-1).unsqueeze( + -1).repeat(1, 1, shape[1], shape[2]) # (B, D, H, W) + + else: + + depth_range_samples = get_cur_depth_range_samples( + cur_depth, ndepth, depth_inteval_pixel, shape, max_depth, + min_depth) + + return depth_range_samples diff --git a/modelscope/models/cv/image_mvs_depth_estimation/utils.py b/modelscope/models/cv/image_mvs_depth_estimation/utils.py new file mode 100644 index 00000000..aeab02b3 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation/utils.py @@ -0,0 +1,118 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch +import random + +import numpy as np +import torch +import torch.nn.functional as F +import torchvision.utils as vutils + + +# convert a function into recursive style to handle nested dict/list/tuple variables +def make_recursive_func(func): + + def wrapper(vars): + if isinstance(vars, list): + return [wrapper(x) for x in vars] + elif isinstance(vars, tuple): + return tuple([wrapper(x) for x in vars]) + elif isinstance(vars, dict): + return {k: wrapper(v) for k, v in vars.items()} + else: + return func(vars) + + return wrapper + + +@make_recursive_func +def tensor2numpy(vars): + if isinstance(vars, np.ndarray): + return vars + elif isinstance(vars, torch.Tensor): + return vars.detach().cpu().numpy().copy() + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +@make_recursive_func +def numpy2torch(vars): + if isinstance(vars, np.ndarray): + return torch.from_numpy(vars) + elif isinstance(vars, torch.Tensor): + return vars + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for numpy2torch'.format(type(vars))) + + +@make_recursive_func +def tocuda(vars): + if isinstance(vars, torch.Tensor): + return vars.to(torch.device('cuda')) + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +def generate_pointcloud(rgb, depth, ply_file, intr, scale=1.0): + """ + Generate a 
colored point cloud in PLY format from a color and a depth image. + + Input: + rgb_file -- filename of color image + depth_file -- filename of depth image + ply_file -- filename of ply file + + """ + fx, fy, cx, cy = intr[0, 0], intr[1, 1], intr[0, 2], intr[1, 2] + points = [] + for v in range(rgb.shape[0]): + for u in range(rgb.shape[1]): + color = rgb[v, u] # rgb.getpixel((u, v)) + Z = depth[v, u] / scale + if Z == 0: + continue + X = (u - cx) * Z / fx + Y = (v - cy) * Z / fy + points.append('%f %f %f %d %d %d 0\n' % + (X, Y, Z, color[0], color[1], color[2])) + file = open(ply_file, 'w') + file.write('''ply + format ascii 1.0 + element vertex %d + property float x + property float y + property float z + property uchar red + property uchar green + property uchar blue + property uchar alpha + end_header + %s + ''' % (len(points), ''.join(points))) + file.close() + + +def write_cam(file, cam): + f = open(file, 'w') + f.write('extrinsic\n') + for i in range(0, 4): + for j in range(0, 4): + f.write(str(cam[0][i][j]) + ' ') + f.write('\n') + f.write('\n') + + f.write('intrinsic\n') + for i in range(0, 3): + for j in range(0, 3): + f.write(str(cam[1][i][j]) + ' ') + f.write('\n') + + f.write('\n' + str(cam[1][3][0]) + ' ' + str(cam[1][3][1]) + ' ' + + str(cam[1][3][2]) + ' ' + str(cam[1][3][3]) + '\n') + + f.close() diff --git a/modelscope/models/cv/image_panoptic_segmentation/__init__.py b/modelscope/models/cv/image_panoptic_segmentation/__init__.py index 2b2be4b7..1af5b6f8 100644 --- a/modelscope/models/cv/image_panoptic_segmentation/__init__.py +++ b/modelscope/models/cv/image_panoptic_segmentation/__init__.py @@ -5,6 +5,7 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .panseg_model import SwinLPanopticSegmentation + from .r50_panseg_model import R50PanopticSegmentation else: _import_structure = { diff --git a/modelscope/models/cv/image_panoptic_segmentation/r50_panseg_model.py b/modelscope/models/cv/image_panoptic_segmentation/r50_panseg_model.py new file mode 100644 index 00000000..73b6b76c --- /dev/null +++ b/modelscope/models/cv/image_panoptic_segmentation/r50_panseg_model.py @@ -0,0 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from easycv.models.segmentation import Mask2Former + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.image_segmentation, + module_name=Models.r50_panoptic_segmentation) +class R50PanopticSegmentation(EasyCVBaseModel, Mask2Former): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + Mask2Former.__init__(self, *args, **kwargs) diff --git a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py index 26e9e532..c442562b 100644 --- a/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py +++ b/modelscope/models/cv/image_portrait_enhancement/image_portrait_enhancement.py @@ -1,13 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
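generate_pointcloud above back-projects each pixel with the standard pinhole relations X = (u - cx) * Z / fx and Y = (v - cy) * Z / fy. A short consistency check with made-up intrinsics, re-projecting the recovered 3D point to confirm it lands back on the original pixel:

import numpy as np

intr = np.array([[800.0, 0.0, 320.0],
                 [0.0, 800.0, 240.0],
                 [0.0, 0.0, 1.0]])
fx, fy, cx, cy = intr[0, 0], intr[1, 1], intr[0, 2], intr[1, 2]
u, v, Z = 400.0, 300.0, 2.5
X, Y = (u - cx) * Z / fx, (v - cy) * Z / fy   # back-projection at depth Z
uvw = intr @ np.array([X, Y, Z])              # forward projection
assert np.allclose(uvw[:2] / uvw[2], [u, v])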
import math import os.path as osp -from copy import deepcopy from typing import Any, Dict, List, Union import torch import torch.nn.functional as F from torch import autograd, nn -from torch.nn.parallel import DataParallel, DistributedDataParallel from modelscope.metainfo import Models from modelscope.models.base import Tensor, TorchModel diff --git a/modelscope/models/cv/indoor_layout_estimation/__init__.py b/modelscope/models/cv/indoor_layout_estimation/__init__.py new file mode 100644 index 00000000..a0b1f83a --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .panovit import LayoutEstimation + +else: + _import_structure = { + 'panovit': ['LayoutEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/__init__.py b/modelscope/models/cv/indoor_layout_estimation/networks/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/backbone/__init__.py b/modelscope/models/cv/indoor_layout_estimation/networks/backbone/__init__.py new file mode 100644 index 00000000..ed6c66a7 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/backbone/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .resnet_DA import ResnetDA +from .vit_horizon_pry_image import ViTHorizonPryImage diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/backbone/resnet_DA.py b/modelscope/models/cv/indoor_layout_estimation/networks/backbone/resnet_DA.py new file mode 100644 index 00000000..26bfeb58 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/backbone/resnet_DA.py @@ -0,0 +1,112 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
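The __init__.py files above defer heavy imports through ModelScope's LazyImportModule. The class below is not that implementation — only a minimal stand-in built on importlib, showing the idea that attribute access triggers the real submodule import and caches the result:

import importlib
import types


class _LazyModule(types.ModuleType):

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Maps attribute name -> submodule that defines it,
        # e.g. {'panovit': ['LayoutEstimation']} -> {'LayoutEstimation': 'panovit'}.
        self._attr_to_module = {
            attr: sub for sub, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):
        sub = self._attr_to_module.get(attr)
        if sub is None:
            raise AttributeError(attr)
        module = importlib.import_module(f'{self.__name__}.{sub}')
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups are plain attribute hits
        return value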
+ +import torch +import torch.nn as nn +import torchvision.models as models + +from ..utils import StripPooling + + +class SPHead(nn.Module): + + def __init__(self, in_channels, out_channels, norm_layer): + super(SPHead, self).__init__() + inter_channels = in_channels // 2 + self.trans_layer = nn.Sequential( + nn.Conv2d(in_channels, inter_channels, 1, 1, 0, bias=False), + norm_layer(inter_channels), nn.ReLU(True)) + self.strip_pool1 = StripPooling(inter_channels, (20, 12), norm_layer, + up_kwargs) + self.strip_pool2 = StripPooling(inter_channels, (20, 12), norm_layer, + up_kwargs) + self.score_layer = nn.Sequential( + nn.Conv2d( + inter_channels, inter_channels // 2, 3, 1, 1, bias=False), + norm_layer(inter_channels // 2), nn.ReLU(True), + nn.Dropout2d(0.1, False), + nn.Conv2d(inter_channels // 2, out_channels, 1)) + + def forward(self, x): + x = self.trans_layer(x) + x = self.strip_pool1(x) + x = self.strip_pool2(x) + x = self.score_layer(x) + return x + + +class ResnetDA(nn.Module): + + def __init__(self, + backbone='resnet50', + coco='', + input_extra=0, + input_height=512): + super(ResnetDA, self).__init__() + self.encoder = getattr(models, backbone)(pretrained=True) + del self.encoder.fc, self.encoder.avgpool + if coco: + coco_pretrain = getattr(models.segmentation, + coco)(pretrained=True).backbone + self.encoder.load_state_dict(coco_pretrain.state_dict()) + self.out_channels = [256, 512, 1024, 2048] + self.feat_heights = [input_height // 4 // (2**i) for i in range(4)] + if int(backbone[6:]) < 50: + self.out_channels = [_ // 4 for _ in self.out_channels] + + # Patch for extra input channel + if input_extra > 0: + ori_conv1 = self.encoder.conv1 + new_conv1 = nn.Conv2d( + 3 + input_extra, + ori_conv1.out_channels, + kernel_size=ori_conv1.kernel_size, + stride=ori_conv1.stride, + padding=ori_conv1.padding, + bias=ori_conv1.bias) + with torch.no_grad(): + for i in range(0, 3 + input_extra, 3): + n = new_conv1.weight[:, i:i + 3].shape[1] + new_conv1.weight[:, i:i + n] = ori_conv1.weight[:, :n] + self.encoder.conv1 = new_conv1 + + # Prepare for pre/pose down height filtering + self.pre_down = None + self.post_down = None + # SPhead + self.strip_pool1 = StripPooling( + self.out_channels[0], [128, 256], (20, 12), + norm_layer=nn.BatchNorm2d) + self.strip_pool2 = StripPooling( + self.out_channels[1], [64, 128], (20, 12), + norm_layer=nn.BatchNorm2d) + self.strip_pool3 = StripPooling( + self.out_channels[2], [32, 64], (20, 12), + norm_layer=nn.BatchNorm2d) + self.strip_pool4 = StripPooling( + self.out_channels[3], [16, 32], (10, 12), + norm_layer=nn.BatchNorm2d) + + def forward(self, x): + features = [] + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + x = self.encoder.relu(x) + x = self.encoder.maxpool(x) + + if self.pre_down is not None: + x = self.pre_down(x) + x = self.encoder.layer1(x) + x = self.strip_pool1(x) + if self.post_down is not None: + x = self.post_down(x) + features.append(x) # 1/4 + x = self.encoder.layer2(x) + x = self.strip_pool2(x) + features.append(x) # 1/8 + x = self.encoder.layer3(x) + x = self.strip_pool3(x) + features.append(x) # 1/16 + x = self.encoder.layer4(x) + x = self.strip_pool4(x) + features.append(x) # 1/32 + return features diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/backbone/vit_horizon_pry_image.py b/modelscope/models/cv/indoor_layout_estimation/networks/backbone/vit_horizon_pry_image.py new file mode 100644 index 00000000..7ce22fee --- /dev/null +++ 
b/modelscope/models/cv/indoor_layout_estimation/networks/backbone/vit_horizon_pry_image.py @@ -0,0 +1,181 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import numpy as np +import timm +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=16, + stride_size=16, + padding=[0, 1], + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True): + super().__init__() + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride_size, + padding=[0, 1]) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + B, C, H, W = x.shape + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x + + +class ViTHorizonPryImage(nn.Module): + + def __init__(self, backbone, fourier, embedding): + super(ViTHorizonPryImage, self).__init__() + embed_dim = 192 + F_lens = [256, 128, 64, 32, 512] + position_encode = np.sum(np.array(F_lens)) + self.embedding = embedding + if fourier is False: + in_chans = 3 + else: + in_chans = 9 + self.pre_image = PatchEmbed([512, 1024], [32, 32], [32, 32], + in_chans=in_chans, + embed_dim=embed_dim) + self.pre_net = nn.ModuleList([ + PatchEmbed([128, 256], [128, 3], [128, 1], + padding=[0, 1], + in_chans=64, + embed_dim=embed_dim), + PatchEmbed([64, 128], [64, 3], [64, 1], + padding=[0, 1], + in_chans=128, + embed_dim=embed_dim), + PatchEmbed([32, 64], [32, 3], [32, 1], + padding=[0, 1], + in_chans=256, + embed_dim=embed_dim), + PatchEmbed([16, 32], [16, 3], [16, 1], + padding=[0, 1], + in_chans=512, + embed_dim=embed_dim) + ]) + + self.encoder = timm.create_model(backbone, pretrained=False) + del self.encoder.patch_embed, self.encoder.head + + self.dist_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, position_encode + 1, embed_dim)) + + def EfficientConvCompressH(in_c, out_c, down_h): + + net1 = nn.Sequential( + nn.Conv2d(in_c, out_c, 3, padding=1, bias=False), + nn.BatchNorm2d(out_c), nn.ReLU(inplace=True)) + + net2 = nn.Sequential( + nn.Conv2d(out_c, out_c, 3, padding=1, bias=False), + nn.BatchNorm2d(out_c), + nn.ReLU(inplace=True), + nn.Conv2d(out_c, out_c, (down_h, 1), groups=out_c, bias=False), + ) + return net1, net2 + + self.ECH1, self.ECH2 = EfficientConvCompressH(embed_dim, 2 * embed_dim, + 4) + self.scales = [1, 2, 4, 8] + # self.L = nn.Linear(454,1024) + if self.embedding == 'sin': + import math + max_len, d_model = position_encode, embed_dim + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = \ + torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + self.register_buffer('pos', pe.T[None].contiguous()) + + elif self.embedding == 'recurrent': + import math + d_model = embed_dim + index = torch.randint(0, F_lens[0], [1]) + for i, max_len in enumerate(F_lens): + pe = torch.zeros(max_len, d_model) + position = torch.arange( + 0, max_len, dtype=torch.float).unsqueeze(1) + if i < len(F_lens) - 1: + index = torch.div(index, 2, 
rounding_mode='floor')**i + position = (index + position) % max_len + position = position + np.sum(np.array(F_lens[:i])) + div_term = \ + torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + if i == 0: + pe_re = pe + else: + pe_re = torch.cat((pe_re, pe), dim=0) + self.register_buffer('pos', pe_re.T[None].contiguous()) + + def forward(self, img, x): + for i, feat in enumerate(x): + pre = self.pre_net[i](feat) + if i == 0: + inputs = pre + else: + inputs = torch.cat((inputs, pre), 1) + pre = self.pre_image(img) + inputs = torch.cat((inputs, pre), 1) + if self.embedding == 'learnable': + inputs = torch.cat( + (self.dist_token.expand(inputs.shape[0], -1, -1), inputs), + dim=1) + inputs = inputs + self.pos_embed + if self.embedding == 'sin': + inputs = inputs + self.pos.permute(0, 2, 1) + if self.embedding == 'recurrent': + inputs = inputs + self.pos.permute(0, 2, 1) + + x = self.encoder.pos_drop(inputs) + for i in range(12): + x = self.encoder.blocks[i](x) + x = x.permute(0, 2, 1) + a1 = x[:, :, :256].reshape(x.shape[0], x.shape[1], 1, 256) + a1 = F.interpolate( + a1, scale_factor=(1, 4), mode='bilinear', align_corners=False) + a2 = x[:, :, 256:384].reshape(x.shape[0], x.shape[1], 1, 128) + a2 = F.interpolate( + a2, scale_factor=(1, 8), mode='bilinear', align_corners=False) + a3 = x[:, :, 384:448].reshape(x.shape[0], x.shape[1], 1, 64) + a3 = F.interpolate( + a3, scale_factor=(1, 16), mode='bilinear', align_corners=False) + a4 = x[:, :, 448:480].reshape(x.shape[0], x.shape[1], 1, 32) + a4 = F.interpolate( + a4, scale_factor=(1, 32), mode='bilinear', align_corners=False) + a = torch.cat((a1, a2, a3, a4), dim=2) + a = self.ECH1(a) + a = self.ECH2(a).flatten(2) + + feat = {} + feat['1D'] = a + return feat diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/misc/__init__.py b/modelscope/models/cv/indoor_layout_estimation/networks/misc/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/misc/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/misc/fourier.py b/modelscope/models/cv/indoor_layout_estimation/networks/misc/fourier.py new file mode 100644 index 00000000..92d6a7b1 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/misc/fourier.py @@ -0,0 +1,99 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
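# Editorial sketch, not part of the patch: the `embedding == 'sin'` branch above builds a
# fixed sinusoidal positional encoding over the concatenated horizon tokens. A minimal
# standalone version of that construction, assuming d_model = embed_dim = 192 and
# max_len = sum(F_lens) = 256 + 128 + 64 + 32 + 512 = 992 as in the module above:
import math

import torch


def sinusoidal_positions(max_len=992, d_model=192):
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(position * div_term)  # even channels: sine
    pe[:, 1::2] = torch.cos(position * div_term)  # odd channels: cosine
    return pe.T[None]  # (1, d_model, max_len), the shape registered as `pos` above


print(sinusoidal_positions().shape)  # torch.Size([1, 192, 992])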
+import numpy as np +from PIL import Image +from scipy.fft import fft2, ifft2 + +AL = 1 +pas = 50 +highpas = 250 +index = 550 + + +def gene_mask(f, angle, horizon): + alpha = AL + an = angle / 360 * (2 * np.pi) + W, H = f.shape[:2] + X = np.arange(0, W) - 255 + Y = np.arange(0, H) - 512 + x, y = np.meshgrid(Y, X) + dis = np.sqrt(x**2 + y**2) + dis[dis < pas] = 0 + dis[dis >= highpas] = 0 + dis[dis != 0] = 1 + + maskband = np.zeros(f.shape) + maskband[:, index:index + 200] = 1 + + angles = np.abs(np.arctan(y / (x + 0.00001))) + angles[angles > an] = 1 + angles[angles < an] = 0 + if not horizon: + mask = angles + else: + mask = 1 - angles + return alpha * mask * maskband + + +def normal(rgb_recons, edge=False): + rgb_recons = rgb_recons.astype(int) + rgb_recons = (rgb_recons - np.min(rgb_recons)) / ( + np.max(rgb_recons) - np.min(rgb_recons)) * 255 + return rgb_recons + + +def fourier_gray(img): + img = Image.fromarray(img.astype(np.uint8)).convert('L') + img = np.array(img) + im = img + x = im * 1 + y = fft2(x, axes=(0, 1)) + shift2center = np.fft.fftshift(y) + + mask = np.zeros(y.shape) + mask_angle = gene_mask(mask, 25, horizon=False) + mask = mask_angle + crop1 = shift2center * mask + iresult = np.fft.ifftshift(crop1) + recons = np.abs(ifft2(iresult, axes=(0, 1))) + rgb_recons = recons + + mask = np.zeros(y.shape) + mask_angle = gene_mask(mask, 20, horizon=True) + mask = mask_angle + crop1 = shift2center * mask + iresult = np.fft.ifftshift(crop1) + recons = np.abs(ifft2(iresult, axes=(0, 1))) + rgb_reconsH = recons + + rgb_recons = normal(rgb_recons, True) + + rgb_reconsH = normal(rgb_reconsH, True) + + rgb_reconsA = rgb_reconsH * 0 + + x = np.concatenate((rgb_recons[:, :, None], rgb_reconsH[:, :, None], + rgb_reconsA[:, :, None]), 2) + + return x + + +def fourier(img): + rgb_recons = np.zeros(img.shape) + index = 520 + for k in range(3): + im = img[:, :, k:k + 1] + x = im * 1 + y = fft2(x) + shift2center = np.fft.fftshift(y) + + mask = np.zeros(y.shape) + mask[:, index:index + 200, :] = 1 + + crop1 = shift2center * mask + iresult = np.fft.ifftshift(crop1) + recons = np.abs(ifft2(iresult)) + rgb_recons[:, :, k:k + 1] = recons + + rgb_recons = (rgb_recons - np.min(rgb_recons)) / ( + np.max(rgb_recons) - np.min(rgb_recons)) + return rgb_recons diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/misc/panostretch.py b/modelscope/models/cv/indoor_layout_estimation/networks/misc/panostretch.py new file mode 100644 index 00000000..a3b3b0b3 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/misc/panostretch.py @@ -0,0 +1,114 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
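# Editorial sketch, not part of the patch: fourier() above keeps a vertical band of the
# centred spectrum (columns index .. index + 200) for each colour channel and returns the
# min-max normalised magnitude of the inverse transform. The same band-pass step on a
# single random 512 x 1024 channel, with the band location copied from the code above:
import numpy as np
from scipy.fft import fft2, ifft2

img = np.random.rand(512, 1024)
spectrum = np.fft.fftshift(fft2(img))
band = np.zeros_like(spectrum)
band[:, 520:720] = 1                                   # vertical band through the centre
recons = np.abs(ifft2(np.fft.ifftshift(spectrum * band)))
recons = (recons - recons.min()) / (recons.max() - recons.min())
print(recons.shape, recons.min(), recons.max())        # (512, 1024) 0.0 1.0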
+ +import functools + +import numpy as np +from scipy.ndimage import map_coordinates + + +def uv_meshgrid(w, h): + uv = np.stack(np.meshgrid(range(w), range(h)), axis=-1) + uv = uv.astype(np.float64) + uv[..., 0] = ((uv[..., 0] + 0.5) / w - 0.5) * 2 * np.pi + uv[..., 1] = ((uv[..., 1] + 0.5) / h - 0.5) * np.pi + return uv + + +@functools.lru_cache() +def _uv_tri(w, h): + uv = uv_meshgrid(w, h) + sin_u = np.sin(uv[..., 0]) + cos_u = np.cos(uv[..., 0]) + tan_v = np.tan(uv[..., 1]) + return sin_u, cos_u, tan_v + + +def uv_tri(w, h): + sin_u, cos_u, tan_v = _uv_tri(w, h) + return sin_u.copy(), cos_u.copy(), tan_v.copy() + + +def coorx2u(x, w=1024): + return ((x + 0.5) / w - 0.5) * 2 * np.pi + + +def coory2v(y, h=512): + return ((y + 0.5) / h - 0.5) * np.pi + + +def u2coorx(u, w=1024): + return (u / (2 * np.pi) + 0.5) * w - 0.5 + + +def v2coory(v, h=512): + return (v / np.pi + 0.5) * h - 0.5 + + +def uv2xy(u, v, z=-50): + c = z / np.tan(v) + x = c * np.cos(u) + y = c * np.sin(u) + return x, y + + +def pano_connect_points(p1, p2, z=-50, w=1024, h=512): + if p1[0] == p2[0]: + return np.array([p1, p2], np.float32) + + u1 = coorx2u(p1[0], w) + v1 = coory2v(p1[1], h) + u2 = coorx2u(p2[0], w) + v2 = coory2v(p2[1], h) + + x1, y1 = uv2xy(u1, v1, z) + x2, y2 = uv2xy(u2, v2, z) + + if abs(p1[0] - p2[0]) < w / 2: + pstart = np.ceil(min(p1[0], p2[0])) + pend = np.floor(max(p1[0], p2[0])) + else: + pstart = np.ceil(max(p1[0], p2[0])) + pend = np.floor(min(p1[0], p2[0]) + w) + coorxs = (np.arange(pstart, pend + 1) % w).astype(np.float64) + vx = x2 - x1 + vy = y2 - y1 + us = coorx2u(coorxs, w) + ps = (np.tan(us) * x1 - y1) / (vy - np.tan(us) * vx) + cs = np.sqrt((x1 + ps * vx)**2 + (y1 + ps * vy)**2) + vs = np.arctan2(z, cs) + coorys = v2coory(vs, h) + + return np.stack([coorxs, coorys], axis=-1) + + +def visualize_pano_stretch(stretched_img, stretched_cor, title): + thikness = 2 + color = (0, 255, 0) + for i in range(4): + xys = pano_connect_points( + stretched_cor[i * 2], stretched_cor[(i * 2 + 2) % 8], z=-50) + xys = xys.astype(int) + blue_split = np.where((xys[1:, 0] - xys[:-1, 0]) < 0)[0] + if len(blue_split) == 0: + cv2.polylines(stretched_img, [xys], False, color, 2) + else: + t = blue_split[0] + 1 + cv2.polylines(stretched_img, [xys[:t]], False, color, thikness) + cv2.polylines(stretched_img, [xys[t:]], False, color, thikness) + + for i in range(4): + xys = pano_connect_points( + stretched_cor[i * 2 + 1], stretched_cor[(i * 2 + 3) % 8], z=50) + xys = xys.astype(int) + blue_split = np.where((xys[1:, 0] - xys[:-1, 0]) < 0)[0] + if len(blue_split) == 0: + cv2.polylines(stretched_img, [xys], False, color, 2) + else: + t = blue_split[0] + 1 + cv2.polylines(stretched_img, [xys[:t]], False, color, thikness) + cv2.polylines(stretched_img, [xys[t:]], False, color, thikness) + + cv2.putText(stretched_img, title, (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, + (0, 0, 0), 2, cv2.LINE_AA) + + return stretched_img.astype(np.uint8) diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/misc/post_proc.py b/modelscope/models/cv/indoor_layout_estimation/networks/misc/post_proc.py new file mode 100644 index 00000000..823d5af5 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/misc/post_proc.py @@ -0,0 +1,479 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
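# Editorial sketch, not part of the patch: coorx2u / u2coorx above map pixel columns of a
# 1024-wide equirectangular image to longitudes in (-pi, pi) and back; coory2v / v2coory
# do the same for rows and latitude. A self-contained round-trip check restating the
# column formulas (note that visualize_pano_stretch above also draws with cv2, which is
# not among the imports shown for this module and would need `import cv2` at the top):
import numpy as np


def coorx2u(x, w=1024):
    return ((x + 0.5) / w - 0.5) * 2 * np.pi


def u2coorx(u, w=1024):
    return (u / (2 * np.pi) + 0.5) * w - 0.5


xs = np.arange(0, 1024, 128, dtype=np.float64)
assert np.allclose(u2coorx(coorx2u(xs)), xs)        # the two mappings are inverses
print(coorx2u(np.array([0.0, 511.5, 1023.0])))      # ~[-pi, 0, +pi], offset by half a pixel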
+import numpy as np +from scipy.ndimage import map_coordinates +from scipy.spatial.distance import pdist, squareform +from sklearn.decomposition import PCA + +PI = float(np.pi) + + +def coorx2u(x, w=1024): + return ((x + 0.5) / w - 0.5) * 2 * np.pi + + +def coory2v(y, h=512): + return ((y + 0.5) / h - 0.5) * np.pi + + +def u2coorx(u, w=1024): + return (u / (2 * np.pi) + 0.5) * w - 0.5 + + +def v2coory(v, h=512): + return (v / np.pi + 0.5) * h - 0.5 + + +def uv2xy(u, v, z=-50): + c = z / np.tan(v) + x = c * np.cos(u) + y = c * np.sin(u) + return x, y + + +def pano_connect_points(p1, p2, z=-50, w=1024, h=512): + if p1[0] == p2[0]: + return np.array([p1, p2], np.float32) + + u1 = coorx2u(p1[0], w) + v1 = coory2v(p1[1], h) + u2 = coorx2u(p2[0], w) + v2 = coory2v(p2[1], h) + + x1, y1 = uv2xy(u1, v1, z) + x2, y2 = uv2xy(u2, v2, z) + + if abs(p1[0] - p2[0]) < w / 2: + pstart = np.ceil(min(p1[0], p2[0])) + pend = np.floor(max(p1[0], p2[0])) + else: + pstart = np.ceil(max(p1[0], p2[0])) + pend = np.floor(min(p1[0], p2[0]) + w) + coorxs = (np.arange(pstart, pend + 1) % w).astype(np.float64) + vx = x2 - x1 + vy = y2 - y1 + us = coorx2u(coorxs, w) + ps = (np.tan(us) * x1 - y1) / (vy - np.tan(us) * vx) + cs = np.sqrt((x1 + ps * vx)**2 + (y1 + ps * vy)**2) + vs = np.arctan2(z, cs) + coorys = v2coory(vs, h) + + return np.stack([coorxs, coorys], axis=-1) + + +def sort_xy_filter_unique(xs, ys, y_small_first=True): + xs, ys = np.array(xs), np.array(ys) + idx_sort = np.argsort(xs + ys / ys.max() * (int(y_small_first) * 2 - 1)) + xs, ys = xs[idx_sort], ys[idx_sort] + _, idx_unique = np.unique(xs, return_index=True) + xs, ys = xs[idx_unique], ys[idx_unique] + assert np.all(np.diff(xs) > 0) + return xs, ys + + +def cor_2_1d(cor, H, W): + bon_ceil_x, bon_ceil_y = [], [] + bon_floor_x, bon_floor_y = [], [] + n_cor = len(cor) + for i in range(n_cor // 2): + xys = pano_connect_points( + cor[i * 2], cor[(i * 2 + 2) % n_cor], z=-50, w=W, h=H) + bon_ceil_x.extend(xys[:, 0]) + bon_ceil_y.extend(xys[:, 1]) + for i in range(n_cor // 2): + xys = pano_connect_points( + cor[i * 2 + 1], cor[(i * 2 + 3) % n_cor], z=50, w=W, h=H) + bon_floor_x.extend(xys[:, 0]) + bon_floor_y.extend(xys[:, 1]) + bon_ceil_x, bon_ceil_y = sort_xy_filter_unique( + bon_ceil_x, bon_ceil_y, y_small_first=True) + bon_floor_x, bon_floor_y = sort_xy_filter_unique( + bon_floor_x, bon_floor_y, y_small_first=False) + bon = np.zeros((2, W)) + bon[0] = np.interp(np.arange(W), bon_ceil_x, bon_ceil_y, period=W) + bon[1] = np.interp(np.arange(W), bon_floor_x, bon_floor_y, period=W) + bon = ((bon + 0.5) / H - 0.5) * np.pi + return bon + + +def fuv2img(fuv, coorW=1024, floorW=1024, floorH=512): + floor_plane_x, floor_plane_y = np.meshgrid(range(floorW), range(floorH)) + floor_plane_x, floor_plane_y = -(floor_plane_y + - floorH / 2), floor_plane_x - floorW / 2 + floor_plane_coridx = \ + (np.arctan2(floor_plane_y, floor_plane_x) / (2 * PI) + 0.5) * coorW - 0.5 + floor_plane = map_coordinates( + fuv, floor_plane_coridx.reshape(1, -1), order=1, mode='wrap') + floor_plane = floor_plane.reshape(floorH, floorW) + return floor_plane + + +def np_coorx2u(coorx, coorW=1024): + return ((coorx + 0.5) / coorW - 0.5) * 2 * PI + + +def np_coory2v(coory, coorH=512): + return -((coory + 0.5) / coorH - 0.5) * PI + + +def np_coor2xy(coor, z=50, coorW=1024, coorH=512, floorW=1024, floorH=512): + coor = np.array(coor) + u = np_coorx2u(coor[:, 0], coorW) + v = np_coory2v(coor[:, 1], coorH) + c = z / np.tan(v) + x = c * np.sin(u) + floorW / 2 - 0.5 + y = -c * np.cos(u) + floorH / 2 - 
0.5 + return np.hstack([x[:, None], y[:, None]]) + + +def np_x_u_solve_y(x, u, floorW=1024, floorH=512): + c = (x - floorW / 2 + 0.5) / np.sin(u) + return -c * np.cos(u) + floorH / 2 - 0.5 + + +def np_y_u_solve_x(y, u, floorW=1024, floorH=512): + c = -(y - floorH / 2 + 0.5) / np.cos(u) + return c * np.sin(u) + floorW / 2 - 0.5 + + +def np_xy2coor(xy, z=50, coorW=1024, coorH=512, floorW=1024, floorH=512): + x = xy[:, 0] - floorW / 2 + 0.5 + y = xy[:, 1] - floorH / 2 + 0.5 + + u = np.arctan2(x, -y) + v = np.arctan(z / np.sqrt(x**2 + y**2)) + + coorx = (u / (2 * PI) + 0.5) * coorW - 0.5 + coory = (-v / PI + 0.5) * coorH - 0.5 + + return np.hstack([coorx[:, None], coory[:, None]]) + + +def mean_percentile(vec, p1=25, p2=75): + vmin = np.percentile(vec, p1) + vmax = np.percentile(vec, p2) + return vec[(vmin <= vec) & (vec <= vmax)].mean() + + +def vote(vec, tol): + vec = np.sort(vec) + n = np.arange(len(vec))[::-1] + n = n[:, None] - n[None, :] + 1.0 + la = squareform(pdist(vec[:, None], 'minkowski', p=1) + 1e-9) + + invalid = (n < len(vec) * 0.4) | (la > tol) + if (~invalid).sum() == 0 or len(vec) < tol: + best_fit = np.median(vec) + p_score = 0 + else: + la[invalid] = 1e5 + n[invalid] = -1 + score = n + max_idx = score.argmax() + max_row = max_idx // len(vec) + max_col = max_idx % len(vec) + assert max_col > max_row + best_fit = np.median(vec) + p_score = (max_col - max_row + 1) / len(vec) + + l1_score = np.abs(vec - best_fit).mean() + + return best_fit, p_score, l1_score + + +def get_z1(coory0, coory1, z0=50, coorH=512): + v0 = np_coory2v(coory0, coorH) + v1 = np_coory2v(coory1, coorH) + c0 = z0 / np.tan(v0) + z1 = c0 * np.tan(v1) + return z1 + + +def np_refine_by_fix_z(coory0, coory1, z0=50, coorH=512): + v0 = np_coory2v(coory0, coorH) + v1 = np_coory2v(coory1, coorH) + + c0 = z0 / np.tan(v0) + z1 = c0 * np.tan(v1) + z1_mean = mean_percentile(z1) + v1_refine = np.arctan2(z1_mean, c0) + coory1_refine = (-v1_refine / PI + 0.5) * coorH - 0.5 + + return coory1_refine, z1_mean + + +def infer_coory(coory0, h, z0=50, coorH=512): + v0 = np_coory2v(coory0, coorH) + c0 = z0 / np.tan(v0) + z1 = z0 + h + v1 = np.arctan2(z1, c0) + return (-v1 / PI + 0.5) * coorH - 0.5 + + +def get_gpid(coorx, coorW): + gpid = np.zeros(coorW) + gpid[np.round(coorx).astype(int)] = 1 + gpid = np.cumsum(gpid).astype(int) + gpid[gpid == gpid[-1]] = 0 + return gpid + + +def get_gpid_idx(gpid, j): + idx = np.where(gpid == j)[0] + if idx[0] == 0 and idx[-1] != len(idx) - 1: + _shift = -np.where(idx != np.arange(len(idx)))[0][0] + idx = np.roll(idx, _shift) + return idx + + +def gpid_two_split(xy, tpid_a, tpid_b): + m = np.arange(len(xy)) + 1 + cum_a = np.cumsum(xy[:, tpid_a]) + cum_b = np.cumsum(xy[::-1, tpid_b]) + l1_a = cum_a / m - cum_a / (m * m) + l1_b = cum_b / m - cum_b / (m * m) + l1_b = l1_b[::-1] + + score = l1_a[:-1] + l1_b[1:] + best_split = score.argmax() + 1 + + va = xy[:best_split, tpid_a].mean() + vb = xy[best_split:, tpid_b].mean() + + return va, vb + + +def _get_rot_rad(px, py): + if px < 0: + px, py = -px, -py + rad = np.arctan2(py, px) * 180 / np.pi + if rad > 45: + return 90 - rad + if rad < -45: + return -90 - rad + return -rad + + +def get_rot_rad(init_coorx, + coory, + z=50, + coorW=1024, + coorH=512, + floorW=1024, + floorH=512, + tol=5): + gpid = get_gpid(init_coorx, coorW) + coor = np.hstack([np.arange(coorW)[:, None], coory[:, None]]) + xy = np_coor2xy(coor, z, coorW, coorH, floorW, floorH) + + rot_rad_suggestions = [] + for j in range(len(init_coorx)): + pca = PCA(n_components=1) + pca.fit(xy[gpid == 
j]) + rot_rad_suggestions.append(_get_rot_rad(*pca.components_[0])) + rot_rad_suggestions = np.sort(rot_rad_suggestions + [1e9]) + + rot_rad = np.mean(rot_rad_suggestions[:-1]) + best_rot_rad_sz = -1 + last_j = 0 + for j in range(1, len(rot_rad_suggestions)): + if rot_rad_suggestions[j] - rot_rad_suggestions[j - 1] > tol: + last_j = j + elif j - last_j > best_rot_rad_sz: + rot_rad = rot_rad_suggestions[last_j:j + 1].mean() + best_rot_rad_sz = j - last_j + + dx = int(round(rot_rad * 1024 / 360)) + return dx, rot_rad + + +def gen_ww_cuboid(xy, gpid): + assert len(np.unique(gpid)) == 4 + xy_cor = [ + { + 'type': 1, + 'val': np.median(xy[gpid == 0, 1]) + }, + { + 'type': 0, + 'val': np.median(xy[gpid == 1, 0]) + }, + { + 'type': 1, + 'val': np.median(xy[gpid == 2, 1]) + }, + { + 'type': 0, + 'val': np.median(xy[gpid == 3, 0]) + }, + ] + return xy_cor + + +def gen_ww_general(init_coorx, xy, gpid, tol): + xy_cor = [] + assert len(init_coorx) == len(np.unique(gpid)) + + for j in range(len(init_coorx)): + now_x = xy[gpid == j, 0] + now_y = xy[gpid == j, 1] + new_x, x_score, x_l1 = vote(now_x, tol) + new_y, y_score, y_l1 = vote(now_y, tol) + u0 = np_coorx2u(init_coorx[(j - 1 + len(init_coorx)) + % len(init_coorx)]) + u1 = np_coorx2u(init_coorx[j]) + if (x_score, -x_l1) > (y_score, -y_l1): + xy_cor.append({ + 'type': 0, + 'val': new_x, + 'score': x_score, + 'action': 'ori', + 'gpid': j, + 'u0': u0, + 'u1': u1, + 'tbd': True + }) + else: + xy_cor.append({ + 'type': 1, + 'val': new_y, + 'score': y_score, + 'action': 'ori', + 'gpid': j, + 'u0': u0, + 'u1': u1, + 'tbd': True + }) + + while True: + tbd = -1 + for i in range(len(xy_cor)): + if xy_cor[i]['tbd'] and ( + tbd == -1 or xy_cor[i]['score'] > xy_cor[tbd]['score']): + tbd = i + if tbd == -1: + break + + xy_cor[tbd]['tbd'] = False + p_idx = (tbd - 1 + len(xy_cor)) % len(xy_cor) + n_idx = (tbd + 1) % len(xy_cor) + + num_tbd_neighbor = xy_cor[p_idx]['tbd'] + xy_cor[n_idx]['tbd'] + + if num_tbd_neighbor == 2: + continue + + if num_tbd_neighbor == 1: + if (not xy_cor[p_idx]['tbd'] and xy_cor[p_idx]['type'] == xy_cor[tbd]['type']) or\ + (not xy_cor[n_idx]['tbd'] and xy_cor[n_idx]['type'] == xy_cor[tbd]['type']): + if xy_cor[tbd]['score'] >= -1: + xy_cor[tbd]['tbd'] = True + xy_cor[tbd]['score'] -= 100 + else: + if not xy_cor[p_idx]['tbd']: + insert_at = tbd + if xy_cor[p_idx]['type'] == 0: + new_val = np_x_u_solve_y(xy_cor[p_idx]['val'], + xy_cor[p_idx]['u1']) + new_type = 1 + else: + new_val = np_y_u_solve_x(xy_cor[p_idx]['val'], + xy_cor[p_idx]['u1']) + new_type = 0 + else: + insert_at = n_idx + if xy_cor[n_idx]['type'] == 0: + new_val = np_x_u_solve_y(xy_cor[n_idx]['val'], + xy_cor[n_idx]['u0']) + new_type = 1 + else: + new_val = np_y_u_solve_x(xy_cor[n_idx]['val'], + xy_cor[n_idx]['u0']) + new_type = 0 + new_add = { + 'type': new_type, + 'val': new_val, + 'score': 0, + 'action': 'forced infer', + 'gpid': -1, + 'u0': -1, + 'u1': -1, + 'tbd': False + } + xy_cor.insert(insert_at, new_add) + continue + + if xy_cor[p_idx]['type'] == xy_cor[n_idx]['type']: + if xy_cor[tbd]['type'] == xy_cor[p_idx]['type']: + xy_cor[tbd]['type'] = (xy_cor[tbd]['type'] + 1) % 2 + xy_cor[tbd]['action'] = 'forced change' + xy_cor[tbd]['val'] = xy[gpid == xy_cor[tbd]['gpid'], + xy_cor[tbd]['type']].mean() + else: + tp0 = xy_cor[n_idx]['type'] + tp1 = xy_cor[p_idx]['type'] + if xy_cor[p_idx]['type'] == 0: + val0 = np_x_u_solve_y(xy_cor[p_idx]['val'], + xy_cor[p_idx]['u1']) + val1 = np_y_u_solve_x(xy_cor[n_idx]['val'], + xy_cor[n_idx]['u0']) + else: + val0 = 
np_y_u_solve_x(xy_cor[p_idx]['val'], + xy_cor[p_idx]['u1']) + val1 = np_x_u_solve_y(xy_cor[n_idx]['val'], + xy_cor[n_idx]['u0']) + new_add = [ + { + 'type': tp0, + 'val': val0, + 'score': 0, + 'action': 'forced infer', + 'gpid': -1, + 'u0': -1, + 'u1': -1, + 'tbd': False + }, + { + 'type': tp1, + 'val': val1, + 'score': 0, + 'action': 'forced infer', + 'gpid': -1, + 'u0': -1, + 'u1': -1, + 'tbd': False + }, + ] + xy_cor = xy_cor[:tbd] + new_add + xy_cor[tbd + 1:] + + return xy_cor + + +def gen_ww(init_coorx, + coory, + z=50, + coorW=1024, + coorH=512, + floorW=1024, + floorH=512, + tol=3, + force_cuboid=True): + gpid = get_gpid(init_coorx, coorW) + coor = np.hstack([np.arange(coorW)[:, None], coory[:, None]]) + xy = np_coor2xy(coor, z, coorW, coorH, floorW, floorH) + + if force_cuboid: + xy_cor = gen_ww_cuboid(xy, gpid) + else: + xy_cor = gen_ww_general(init_coorx, xy, gpid, tol) + + cor = [] + for j in range(len(xy_cor)): + next_j = (j + 1) % len(xy_cor) + if xy_cor[j]['type'] == 1: + cor.append((xy_cor[next_j]['val'], xy_cor[j]['val'])) + else: + cor.append((xy_cor[j]['val'], xy_cor[next_j]['val'])) + cor = np_xy2coor(np.array(cor), z, coorW, coorH, floorW, floorH) + cor = np.roll(cor, -2 * cor[::2, 0].argmin(), axis=0) + + return cor, xy_cor diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/modality/__init__.py b/modelscope/models/cv/indoor_layout_estimation/networks/modality/__init__.py new file mode 100644 index 00000000..4c2ff23e --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/modality/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .layout import LayoutEstimator diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/modality/layout.py b/modelscope/models/cv/indoor_layout_estimation/networks/modality/layout.py new file mode 100644 index 00000000..12ff3a34 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/modality/layout.py @@ -0,0 +1,147 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math + +import numpy as np +import torch +import torch.nn as nn +from scipy.ndimage.filters import maximum_filter +from shapely.geometry import Polygon + +from ..misc import post_proc + + +class LayoutEstimator(nn.Module): + + def __init__(self, + emb_dim, + bon_weight=1., + cor_weight=1., + bon_loss='l1', + cor_loss='bce', + bon_scale=1., + init_weight=0.1, + dropout=0., + oneconv=True, + last_ks=1, + last_bias=True, + sigmod_normlize=False, + sample_weight=False, + H=512, + W=1024, + post_force_cuboid=False): + super(LayoutEstimator, self).__init__() + self.bon_loss = bon_loss + self.cor_loss = cor_loss + self.bon_scale = bon_scale + self.bon_weight = bon_weight + self.cor_weight = cor_weight + self.H = H + self.W = W + self.post_force_cuboid = post_force_cuboid + if oneconv: + self.pred_bon = nn.Conv1d( + emb_dim, 2, last_ks, padding=last_ks // 2, bias=last_bias) + self.pred_cor = nn.Conv1d( + emb_dim, 1, last_ks, padding=last_ks // 2, bias=last_bias) + if last_bias: + nn.init.constant_(self.pred_bon.bias[0], -0.478) + nn.init.constant_(self.pred_bon.bias[1], 0.425) + nn.init.constant_(self.pred_cor.bias, -1.) 
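# Editorial sketch, not part of the patch: in the one-conv branch above, the boundary and
# corner heads are plain Conv1d layers over the 1D horizon embedding, producing two
# boundary logits (ceiling / floor) and one corner logit per image column. Shape check
# with an assumed emb_dim of 256 and the default W = 1024:
import torch
import torch.nn as nn

emb_dim, W = 256, 1024
x_emb = torch.randn(2, emb_dim, W)
pred_bon = nn.Conv1d(emb_dim, 2, 1)(x_emb)   # per-column ceiling/floor boundary
pred_cor = nn.Conv1d(emb_dim, 1, 1)(x_emb)   # per-column corner existence logit
print(pred_bon.shape, pred_cor.shape)        # torch.Size([2, 2, 1024]) torch.Size([2, 1, 1024])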
+ else: + self.pred_bon = nn.Sequential( + nn.Conv1d(emb_dim, emb_dim, 3, padding=1, bias=False), + nn.BatchNorm1d(emb_dim), + nn.ReLU(inplace=True), + nn.Conv1d(emb_dim, 2, 1), + ) + self.pred_cor = nn.Sequential( + nn.Conv1d(emb_dim, emb_dim, 3, padding=1, bias=False), + nn.BatchNorm1d(emb_dim), + nn.ReLU(inplace=True), + nn.Conv1d(emb_dim, 1, 1), + ) + nn.init.constant_(self.pred_bon[-1].bias[0], -0.478) + nn.init.constant_(self.pred_bon[-1].bias[1], 0.425) + nn.init.constant_(self.pred_cor[-1].bias, -1.) + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout(dropout) + + self.sigmod_normlize = sigmod_normlize + + self.sample_weight = sample_weight + + def forward(self, x_emb): + x_emb = x_emb['1D'] + if self.dropout is not None: + x_emb = self.dropout(x_emb) + pred_bon = self.pred_bon(x_emb) + pred_cor = self.pred_cor(x_emb) + if self.sigmod_normlize: + bon = torch.sigmoid(pred_bon) + up = bon[:, 0:1, :] * -0.5 * math.pi + down = bon[:, 1:, :] * 0.5 * math.pi + pred_bon = torch.cat([up, down], dim=1) + return {'pred_bon': pred_bon, 'pred_cor': pred_cor} + + def infer(self, x_emb): + results = self(x_emb) + pred_bon, pred_cor = results['pred_bon'], results['pred_cor'] + pred_bon = pred_bon.clone() / self.bon_scale + H, W = self.H, self.W + + y_bon_ = (pred_bon[0].cpu().numpy() / np.pi + 0.5) * H - 0.5 + y_cor_ = pred_cor[0, 0].sigmoid().cpu().numpy() + z0 = 50 + _, z1 = post_proc.np_refine_by_fix_z(*y_bon_, z0) + + def find_N_peaks(signal, r, min_v, N): + max_v = maximum_filter(signal, size=r, mode='wrap') + pk_loc = np.where(max_v == signal)[0] + pk_loc = pk_loc[signal[pk_loc] > min_v] + if N is not None: + order = np.argsort(-signal[pk_loc]) + pk_loc = pk_loc[order[:N]] + pk_loc = pk_loc[np.argsort(pk_loc)] + return pk_loc, signal[pk_loc] + + min_v = 0 if self.post_force_cuboid else 0.05 + r = int(round(W * 0.05 / 2)) + N = 4 if self.post_force_cuboid else None + xs_ = find_N_peaks(y_cor_, r=r, min_v=min_v, N=N)[0] + + cor, xy_cor = post_proc.gen_ww( + xs_, + y_bon_[0], + z0, + tol=abs(0.16 * z1 / 1.6), + force_cuboid=self.post_force_cuboid) + if not self.post_force_cuboid: + xy2d = np.zeros((len(xy_cor), 2), np.float32) + for i in range(len(xy_cor)): + xy2d[i, xy_cor[i]['type']] = xy_cor[i]['val'] + xy2d[i, xy_cor[i - 1]['type']] = xy_cor[i - 1]['val'] + if len(xy2d) < 3 or not Polygon(xy2d).is_valid: + import sys + print( + 'Fail to generate valid general layout!! ' + 'Generate cuboid as fallback.', + file=sys.stderr) + xs_ = find_N_peaks(y_cor_, r=r, min_v=0, N=4)[0] + cor, xy_cor = post_proc.gen_ww( + xs_, + y_bon_[0], + z0, + tol=abs(0.16 * z1 / 1.6), + force_cuboid=True) + + cor = np.hstack( + [cor, post_proc.infer_coory(cor[:, 1], z1 - z0, z0)[:, None]]) + cor_id = np.zeros((len(cor) * 2, 2), np.float32) + for j in range(len(cor)): + cor_id[j * 2] = cor[j, 0], cor[j, 1] + cor_id[j * 2 + 1] = cor[j, 0], cor[j, 2] + y_pred_bon = post_proc.cor_2_1d(cor_id, 512, 1024) + y_pred_bon = ((y_pred_bon / np.pi + 0.5) * 512).round().astype(int) + return {'cor_id': cor_id, 'y_bon_': y_pred_bon, 'y_cor_': y_cor_} diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/panovit.py b/modelscope/models/cv/indoor_layout_estimation/networks/panovit.py new file mode 100644 index 00000000..23ba211d --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/panovit.py @@ -0,0 +1,96 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from . 
import backbone, modality +from .utils import visualize_a_data + + +class PanoVIT(nn.Module): + + def __init__(self, + emb_dim=256, + input_hw=None, + input_norm='imagenet', + pretrain='', + backbone_config={'module': 'Resnet'}, + transformer_config={'module': 'ViT'}, + modalities_config={}): + super(PanoVIT, self).__init__() + self.input_hw = input_hw + if input_norm == 'imagenet': + self.register_buffer( + 'x_mean', + torch.FloatTensor( + np.array([0.485, 0.456, 0.406])[None, :, None, None])) + self.register_buffer( + 'x_std', + torch.FloatTensor( + np.array([0.229, 0.224, 0.225])[None, :, None, None])) + else: + raise NotImplementedError + + Encoder = getattr(backbone, backbone_config['module']) + Encoder_kwargs = backbone_config.get('kwargs', {}) + self.encoder = Encoder(**Encoder_kwargs) + + Transformer = getattr(backbone, transformer_config['module']) + Transformer_kwargs = transformer_config.get('kwargs', {}) + self.transformer = Transformer(**Transformer_kwargs) + self.transformer_config = transformer_config['module'] + self.transformer_Fourier = transformer_config['kwargs']['fourier'] + + self.modalities = nn.ModuleList([ + getattr(modality, key)(emb_dim, **config) + for key, config in modalities_config.items() + ]) + + def extract_feat(self, x): + img = x[:, 0:3, :, :] + if self.input_hw: + img = F.interpolate( + img, size=self.input_hw, mode='bilinear', align_corners=False) + + img = (img - self.x_mean) / self.x_std + if self.transformer_Fourier == 'fourier_res': + img = torch.cat((img, x[:, 3:, :, :]), dim=1) + res_f = self.encoder(img) + elif self.transformer_Fourier == 'fourier_trans': + res_f = self.encoder(img) + img = torch.cat((img, x[:, 3:, :, :]), dim=1) + else: + res_f = self.encoder(img) + + if self.transformer_config == 'ViTHorizonPryImage': + + feat = self.transformer(img, res_f) + else: + feat = self.transformer(x) + return feat + + def call_modality(self, method, *feed_args, **feed_kwargs): + output_dict = {} + for m in self.modalities: + curr_dict = getattr(m, method)(*feed_args, **feed_kwargs) + assert len(output_dict.keys() & curr_dict.keys() + ) == 0, 'Key collision for different modalities' + output_dict.update(curr_dict) + return output_dict + + def forward(self, x): + feat = self.extract_feat(x) + results = self.call_modality('forward', feat) + return torch.cat((results['pred_bon'], results['pred_cor']), dim=1) + + def infer(self, x): + feat = self.extract_feat(x) + result = self.call_modality('infer', feat) + result['image'] = x + return result + + def postprocess(self, image, y_bon, y_cor): + vis_layout = visualize_a_data(image, y_bon, y_cor) + return (vis_layout[:, :, (2, 1, 0)]) diff --git a/modelscope/models/cv/indoor_layout_estimation/networks/utils.py b/modelscope/models/cv/indoor_layout_estimation/networks/utils.py new file mode 100644 index 00000000..57753732 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/networks/utils.py @@ -0,0 +1,110 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
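# Editorial sketch, not part of the patch: infer() above converts predicted boundary
# angles back to pixel rows with (v / pi + 0.5) * H - 0.5, the inverse of the
# ((row + 0.5) / H - 0.5) * pi mapping that cor_2_1d applies when it turns corner
# coordinates into per-column boundary angles. A quick round-trip check with the
# default H = 512:
import numpy as np

H = 512
rows = np.arange(0, H, 64, dtype=np.float64)
angles = ((rows + 0.5) / H - 0.5) * np.pi    # pixel row -> boundary angle in (-pi/2, pi/2)
back = (angles / np.pi + 0.5) * H - 0.5      # boundary angle -> pixel row
assert np.allclose(back, rows)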
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class StripPooling(nn.Module): + + def __init__(self, in_channels, input_size, pool_size, norm_layer): + super(StripPooling, self).__init__() + input_size = np.array(input_size) + output_size = np.array([pool_size[0], pool_size[0]]) + stride = np.floor((input_size / output_size)).astype(int) + kernel_size = (input_size - (output_size - 1) * stride).astype(int) + self.pool1 = nn.AvgPool2d( + kernel_size=(kernel_size[0], kernel_size[1]), + stride=(stride[0], stride[1]), + ceil_mode=False) + + output_size = np.array([pool_size[1], pool_size[1]]) + stride = np.floor((input_size / output_size)).astype(int) + kernel_size = (input_size - (output_size - 1) * stride).astype(int) + self.pool2 = nn.AvgPool2d( + kernel_size=(kernel_size[0], kernel_size[1]), + stride=(stride[0], stride[1]), + ceil_mode=False) + + output_size = np.array([1, input_size[1]]) + stride = np.floor((input_size / output_size)).astype(int) + kernel_size = (input_size - (output_size - 1) * stride).astype(int) + self.pool3 = nn.AvgPool2d( + kernel_size=(kernel_size[0], kernel_size[1]), + stride=(stride[0], stride[1]), + ceil_mode=False) + + output_size = np.array([input_size[0], 1]) + stride = np.floor((input_size / output_size)).astype(int) + kernel_size = (input_size - (output_size - 1) * stride).astype(int) + self.pool4 = nn.AvgPool2d( + kernel_size=(kernel_size[0], kernel_size[1]), + stride=(stride[0], stride[1]), + ceil_mode=False) + + inter_channels = in_channels // 4 + + self.conv1_1 = nn.Sequential( + nn.Conv2d(in_channels, inter_channels, 1, bias=False), + norm_layer(inter_channels), nn.ReLU(True)) + self.conv1_2 = nn.Sequential( + nn.Conv2d(in_channels, inter_channels, 1, bias=False), + norm_layer(inter_channels), nn.ReLU(True)) + self.conv2_0 = nn.Sequential( + nn.Conv2d(inter_channels, inter_channels, 3, 1, 1, bias=False), + norm_layer(inter_channels)) + self.conv2_1 = nn.Sequential( + nn.Conv2d(inter_channels, inter_channels, 3, 1, 1, bias=False), + norm_layer(inter_channels)) + self.conv2_2 = nn.Sequential( + nn.Conv2d(inter_channels, inter_channels, 3, 1, 1, bias=False), + norm_layer(inter_channels)) + self.conv2_3 = nn.Sequential( + nn.Conv2d( + inter_channels, inter_channels, (1, 3), 1, (0, 1), bias=False), + norm_layer(inter_channels)) + self.conv2_4 = nn.Sequential( + nn.Conv2d( + inter_channels, inter_channels, (3, 1), 1, (1, 0), bias=False), + norm_layer(inter_channels)) + self.conv2_5 = nn.Sequential( + nn.Conv2d(inter_channels, inter_channels, 3, 1, 1, bias=False), + norm_layer(inter_channels), nn.ReLU(True)) + self.conv2_6 = nn.Sequential( + nn.Conv2d(inter_channels, inter_channels, 3, 1, 1, bias=False), + norm_layer(inter_channels), nn.ReLU(True)) + self.conv3 = nn.Sequential( + nn.Conv2d(inter_channels * 2, in_channels, 1, bias=False), + norm_layer(in_channels)) + + def forward(self, x): + _, _, h, w = x.size() + x1 = self.conv1_1(x) + x2 = self.conv1_2(x) + x2_1 = self.conv2_0(x1) + x2_2 = F.interpolate(self.conv2_1(self.pool1(x1)), (h, w)) + x2_3 = F.interpolate(self.conv2_2(self.pool2(x1)), (h, w)) + x2_4 = F.interpolate(self.conv2_3(self.pool3(x2)), (h, w)) + x2_5 = F.interpolate(self.conv2_4(self.pool4(x2)), (h, w)) + x1 = self.conv2_5(F.relu_(x2_1 + x2_2 + x2_3)) + x2 = self.conv2_6(F.relu_(x2_5 + x2_4)) + out = self.conv3(torch.cat([x1, x2], dim=1)) + return F.relu_(x + out) + + +def visualize_a_data(x, y_bon, y_cor): + x = (x.cpu().numpy().transpose([1, 2, 0]) * 255).astype(np.uint8) + y_bon = 
y_bon.round().astype(int) + gt_cor = np.zeros((30, 1024, 3), np.uint8) + gt_cor[:] = y_cor[None, :, None] * 255 + img_pad = np.zeros((3, 1024, 3), np.uint8) + 255 + + img_bon = (x.copy()).astype(np.uint8) + img_bon[y_bon[0], np.arange(len(y_bon[0])), 1] = 255 + img_bon[y_bon[1], np.arange(len(y_bon[1])), 1] = 255 + img_bon[y_bon[0] - 1, np.arange(len(y_bon[0])), 1] = 255 + img_bon[y_bon[1] - 1, np.arange(len(y_bon[1])), 1] = 255 + img_bon[y_bon[0] + 1, np.arange(len(y_bon[0])), 1] = 255 + img_bon[y_bon[1] + 1, np.arange(len(y_bon[1])), 1] = 255 + return np.concatenate([gt_cor, img_pad, img_bon], 0) diff --git a/modelscope/models/cv/indoor_layout_estimation/panovit.py b/modelscope/models/cv/indoor_layout_estimation/panovit.py new file mode 100644 index 00000000..615ad5e4 --- /dev/null +++ b/modelscope/models/cv/indoor_layout_estimation/panovit.py @@ -0,0 +1,57 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp + +import numpy as np +import torch +from yacs.config import CfgNode as CN + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.indoor_layout_estimation.networks.panovit import \ + PanoVIT +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.indoor_layout_estimation, + module_name=Models.panovit_layout_estimation) +class LayoutEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + config = CN() + config.model = CN() + config.model.kwargs = CN(new_allowed=True) + config.defrost() + config_path = osp.join(model_dir, ModelFile.YAML_FILE) + config.merge_from_file(config_path) + config.freeze() + # build model + self.model = PanoVIT(**config.model.kwargs) + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + if torch.cuda.is_available(): + state_dict = torch.load(model_path) + else: + state_dict = torch.load(model_path, map_location='cpu') + + self.model.load_state_dict(state_dict) + self.model.eval() + + def forward(self, Inputs): + return self.model.infer(Inputs['images']) + + def postprocess(self, Inputs): + image, y_bon, y_cor = Inputs['image'], Inputs['y_bon_'], Inputs[ + 'y_cor_'] + layout_image = self.model.postprocess(image[0, 0:3], y_bon, y_cor) + return layout_image + + def inference(self, data): + results = self.forward(data) + return results diff --git a/modelscope/models/cv/language_guided_video_summarization/summarizer.py b/modelscope/models/cv/language_guided_video_summarization/summarizer.py index 654dc3ea..6e12ea2b 100755 --- a/modelscope/models/cv/language_guided_video_summarization/summarizer.py +++ b/modelscope/models/cv/language_guided_video_summarization/summarizer.py @@ -5,7 +5,6 @@ import argparse import os import os.path as osp -from copy import deepcopy from typing import Dict, Union import numpy as np @@ -129,37 +128,13 @@ class ClipItVideoSummarization(TorchModel): self._device = torch.device('cpu') self.model = self.model.to(self._device) - self.model = self.load_pretrained(self.model, model_path) + self.model = self._load_pretrained(self.model, model_path) if self.training: self.model.train() else: self.model.eval() - def load_pretrained(self, net, load_path, strict=True, param_key='params'): - if isinstance(net, (DataParallel, DistributedDataParallel)): - net = net.module - load_net = torch.load( - 
load_path, map_location=lambda storage, loc: storage) - if param_key is not None: - if param_key not in load_net and 'params' in load_net: - param_key = 'params' - logger.info( - f'Loading: {param_key} does not exist, use params.') - if param_key in load_net: - load_net = load_net[param_key] - logger.info( - f'Loading {net.__class__.__name__} model from {load_path}, with param key: [{param_key}].' - ) - # remove unnecessary 'module.' - for k, v in deepcopy(load_net).items(): - if k.startswith('module.'): - load_net[k[7:]] = v - load_net.pop(k) - net.load_state_dict(load_net, strict=strict) - logger.info('load model done.') - return net - def _train_forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: frame_features = input['frame_features'] txt_features = input['txt_features'] diff --git a/modelscope/models/cv/movie_scene_segmentation/model.py b/modelscope/models/cv/movie_scene_segmentation/model.py index 8117961a..f5d3e677 100644 --- a/modelscope/models/cv/movie_scene_segmentation/model.py +++ b/modelscope/models/cv/movie_scene_segmentation/model.py @@ -1,6 +1,7 @@ # The implementation here is modified based on BaSSL, # originally Apache 2.0 License and publicly avaialbe at https://github.com/kakaobrain/bassl +import math import os import os.path as osp from typing import Any, Dict @@ -74,7 +75,7 @@ class MovieSceneSegmentationModel(TorchModel): self.eps = 1e-5 def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]: - data = inputs['video'] + data = inputs.pop('video') labels = inputs['label'] outputs = self.shared_step(data) @@ -101,7 +102,7 @@ class MovieSceneSegmentationModel(TorchModel): inputs = batch['shot_feat'] shot_num = len(sids) - cnt = shot_num // bs + 1 + cnt = math.ceil(shot_num / bs) infer_sid, infer_pred = [], [] infer_result = {} diff --git a/modelscope/models/cv/panorama_depth_estimation/__init__.py b/modelscope/models/cv/panorama_depth_estimation/__init__.py new file mode 100644 index 00000000..b20f6209 --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .unifuse_model import PanoramaDepthEstimation + +else: + _import_structure = { + 'unifuse_model': ['PanoramaDepthEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/panorama_depth_estimation/networks/__init__.py b/modelscope/models/cv/panorama_depth_estimation/networks/__init__.py new file mode 100644 index 00000000..5f0fbd21 --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/networks/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .equi import Equi +from .unifuse import UniFuse diff --git a/modelscope/models/cv/panorama_depth_estimation/networks/equi.py b/modelscope/models/cv/panorama_depth_estimation/networks/equi.py new file mode 100644 index 00000000..3b8bb3d3 --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/networks/equi.py @@ -0,0 +1,133 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
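# Editorial sketch, not part of the patch: the movie_scene_segmentation change above
# replaces `shot_num // bs + 1` with `math.ceil(shot_num / bs)` when counting inference
# batches; the old expression yields one extra (empty) batch whenever shot_num is an
# exact multiple of the batch size:
import math

for shot_num, bs in [(100, 10), (101, 10)]:
    old = shot_num // bs + 1
    new = math.ceil(shot_num / bs)
    print(shot_num, bs, old, new)   # 100 10 -> 11 vs 10;  101 10 -> 11 vs 11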
+from __future__ import absolute_import, division, print_function +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn + +from .layers import Conv3x3, ConvBlock, upsample +from .mobilenet import mobilenet_v2 +from .resnet import resnet18, resnet34, resnet50, resnet101, resnet152 + + +class Equi(nn.Module): + """ Model: Resnet based Encoder + Decoder + """ + + def __init__(self, + num_layers, + equi_h, + equi_w, + pretrained=False, + max_depth=10.0, + **kwargs): + super(Equi, self).__init__() + + self.num_layers = num_layers + self.equi_h = equi_h + self.equi_w = equi_w + self.cube_h = equi_h // 2 + + # encoder + encoder = { + 2: mobilenet_v2, + 18: resnet18, + 34: resnet34, + 50: resnet50, + 101: resnet101, + 152: resnet152 + } + + if num_layers not in encoder: + raise ValueError( + '{} is not a valid number of resnet layers'.format(num_layers)) + self.equi_encoder = encoder[num_layers](pretrained) + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + if num_layers < 18: + self.num_ch_enc = np.array([16, 24, 32, 96, 320]) + + # decoder + self.num_ch_dec = np.array([16, 32, 64, 128, 256]) + self.equi_dec_convs = OrderedDict() + + self.equi_dec_convs['upconv_5'] = ConvBlock(self.num_ch_enc[4], + self.num_ch_dec[4]) + + self.equi_dec_convs['deconv_4'] = ConvBlock( + self.num_ch_dec[4] + self.num_ch_enc[3], self.num_ch_dec[4]) + self.equi_dec_convs['upconv_4'] = ConvBlock(self.num_ch_dec[4], + self.num_ch_dec[3]) + + self.equi_dec_convs['deconv_3'] = ConvBlock( + self.num_ch_dec[3] + self.num_ch_enc[2], self.num_ch_dec[3]) + self.equi_dec_convs['upconv_3'] = ConvBlock(self.num_ch_dec[3], + self.num_ch_dec[2]) + + self.equi_dec_convs['deconv_2'] = ConvBlock( + self.num_ch_dec[2] + self.num_ch_enc[1], self.num_ch_dec[2]) + self.equi_dec_convs['upconv_2'] = ConvBlock(self.num_ch_dec[2], + self.num_ch_dec[1]) + + self.equi_dec_convs['deconv_1'] = ConvBlock( + self.num_ch_dec[1] + self.num_ch_enc[0], self.num_ch_dec[1]) + self.equi_dec_convs['upconv_1'] = ConvBlock(self.num_ch_dec[1], + self.num_ch_dec[0]) + + self.equi_dec_convs['deconv_0'] = ConvBlock(self.num_ch_dec[0], + self.num_ch_dec[0]) + self.equi_dec_convs['depthconv_0'] = Conv3x3(self.num_ch_dec[0], 1) + + self.equi_decoder = nn.ModuleList(list(self.equi_dec_convs.values())) + + self.sigmoid = nn.Sigmoid() + self.max_depth = nn.Parameter( + torch.tensor(max_depth), requires_grad=False) + + def forward(self, input_equi_image, input_cube_image): + + # euqi image encoding + if self.num_layers < 18: + equi_enc_feat0, equi_enc_feat1, equi_enc_feat2, equi_enc_feat3, equi_enc_feat4 \ + = self.equi_encoder(input_equi_image) + else: + x = self.equi_encoder.conv1(input_equi_image) + x = self.equi_encoder.relu(self.equi_encoder.bn1(x)) + equi_enc_feat0 = x + + x = self.equi_encoder.maxpool(x) + equi_enc_feat1 = self.equi_encoder.layer1(x) + equi_enc_feat2 = self.equi_encoder.layer2(equi_enc_feat1) + equi_enc_feat3 = self.equi_encoder.layer3(equi_enc_feat2) + equi_enc_feat4 = self.equi_encoder.layer4(equi_enc_feat3) + + # euqi image decoding + outputs = {} + + equi_x = equi_enc_feat4 + equi_x = upsample(self.equi_dec_convs['upconv_5'](equi_x)) + + equi_x = torch.cat([equi_x, equi_enc_feat3], 1) + equi_x = self.equi_dec_convs['deconv_4'](equi_x) + equi_x = upsample(self.equi_dec_convs['upconv_4'](equi_x)) + + equi_x = torch.cat([equi_x, equi_enc_feat2], 1) + equi_x = self.equi_dec_convs['deconv_3'](equi_x) + equi_x = 
upsample(self.equi_dec_convs['upconv_3'](equi_x)) + + equi_x = torch.cat([equi_x, equi_enc_feat1], 1) + equi_x = self.equi_dec_convs['deconv_2'](equi_x) + equi_x = upsample(self.equi_dec_convs['upconv_2'](equi_x)) + + equi_x = torch.cat([equi_x, equi_enc_feat0], 1) + equi_x = self.equi_dec_convs['deconv_1'](equi_x) + equi_x = upsample(self.equi_dec_convs['upconv_1'](equi_x)) + + equi_x = self.equi_dec_convs['deconv_0'](equi_x) + equi_depth = self.equi_dec_convs['depthconv_0'](equi_x) + outputs['pred_depth'] = self.max_depth * self.sigmoid(equi_depth) + + return outputs diff --git a/modelscope/models/cv/panorama_depth_estimation/networks/layers.py b/modelscope/models/cv/panorama_depth_estimation/networks/layers.py new file mode 100644 index 00000000..99e166aa --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/networks/layers.py @@ -0,0 +1,235 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Conv3x3(nn.Module): + """Layer to pad and convolve input + """ + + def __init__(self, in_channels, out_channels, bias=True): + super(Conv3x3, self).__init__() + + self.pad = nn.ZeroPad2d(1) + self.conv = nn.Conv2d( + int(in_channels), int(out_channels), 3, bias=bias) + + def forward(self, x): + out = self.pad(x) + out = self.conv(out) + return out + + +class ConvBlock(nn.Module): + """Layer to perform a convolution followed by ELU + """ + + def __init__(self, in_channels, out_channels, bias=True): + super(ConvBlock, self).__init__() + + self.conv = Conv3x3(in_channels, out_channels, bias) + self.nonlin = nn.ELU(inplace=True) + + def forward(self, x): + out = self.conv(x) + out = self.nonlin(out) + return out + + +def upsample(x): + """Upsample input tensor by a factor of 2 + """ + return F.interpolate(x, scale_factor=2, mode='nearest') + + +# Based on https://github.com/sunset1995/py360convert +class Cube2Equirec(nn.Module): + + def __init__(self, face_w, equ_h, equ_w): + super(Cube2Equirec, self).__init__() + ''' + face_w: int, the length of each face of the cubemap + equ_h: int, height of the equirectangular image + equ_w: int, width of the equirectangular image + ''' + + self.face_w = face_w + self.equ_h = equ_h + self.equ_w = equ_w + + # Get face id to each pixel: 0F 1R 2B 3L 4U 5D + self._equirect_facetype() + self._equirect_faceuv() + + def _equirect_facetype(self): + ''' + 0F 1R 2B 3L 4U 5D + ''' + tp = np.roll( + np.arange(4).repeat(self.equ_w // 4)[None, :].repeat( + self.equ_h, 0), 3 * self.equ_w // 8, 1) + + # Prepare ceil mask + mask = np.zeros((self.equ_h, self.equ_w // 4), np.bool) + idx = np.linspace(-np.pi, np.pi, self.equ_w // 4) / 4 + idx = self.equ_h // 2 - np.round( + np.arctan(np.cos(idx)) * self.equ_h / np.pi).astype(int) + for i, j in enumerate(idx): + mask[:j, i] = 1 + mask = np.roll(np.concatenate([mask] * 4, 1), 3 * self.equ_w // 8, 1) + + tp[mask] = 4 + tp[np.flip(mask, 0)] = 5 + + self.tp = tp + self.mask = mask + + def _equirect_faceuv(self): + + lon = ( + (np.linspace(0, self.equ_w - 1, num=self.equ_w, dtype=np.float32) + + 0.5) / self.equ_w - 0.5) * 2 * np.pi + lat = -( + (np.linspace(0, self.equ_h - 1, num=self.equ_h, dtype=np.float32) + + 0.5) / self.equ_h - 0.5) * np.pi + + lon, lat = np.meshgrid(lon, lat) + + coor_u = np.zeros((self.equ_h, self.equ_w), dtype=np.float32) + coor_v = np.zeros((self.equ_h, self.equ_w), dtype=np.float32) + + for i in range(4): + mask = (self.tp == i) + coor_u[mask] = 0.5 * np.tan(lon[mask] - np.pi * i / 2) + coor_v[mask] 
= -0.5 * np.tan( + lat[mask]) / np.cos(lon[mask] - np.pi * i / 2) + + mask = (self.tp == 4) + c = 0.5 * np.tan(np.pi / 2 - lat[mask]) + coor_u[mask] = c * np.sin(lon[mask]) + coor_v[mask] = c * np.cos(lon[mask]) + + mask = (self.tp == 5) + c = 0.5 * np.tan(np.pi / 2 - np.abs(lat[mask])) + coor_u[mask] = c * np.sin(lon[mask]) + coor_v[mask] = -c * np.cos(lon[mask]) + + # Final renormalize + coor_u = (np.clip(coor_u, -0.5, 0.5)) * 2 + coor_v = (np.clip(coor_v, -0.5, 0.5)) * 2 + + # Convert to torch tensor + self.tp = torch.from_numpy(self.tp.astype(np.float32) / 2.5 - 1) + self.coor_u = torch.from_numpy(coor_u) + self.coor_v = torch.from_numpy(coor_v) + + sample_grid = torch.stack([self.coor_u, self.coor_v, self.tp], + dim=-1).view(1, 1, self.equ_h, self.equ_w, 3) + self.sample_grid = nn.Parameter(sample_grid, requires_grad=False) + + def forward(self, cube_feat): + + bs, ch, h, w = cube_feat.shape + assert h == self.face_w and w // 6 == self.face_w + + cube_feat = cube_feat.view(bs, ch, 1, h, w) + cube_feat = torch.cat( + torch.split(cube_feat, self.face_w, dim=-1), dim=2) + + cube_feat = cube_feat.view([bs, ch, 6, self.face_w, self.face_w]) + sample_grid = torch.cat(bs * [self.sample_grid], dim=0) + equi_feat = F.grid_sample( + cube_feat, sample_grid, padding_mode='border', align_corners=True) + + return equi_feat.squeeze(2) + + +class Concat(nn.Module): + + def __init__(self, channels, **kwargs): + super(Concat, self).__init__() + self.conv = nn.Conv2d(channels * 2, channels, 1, bias=False) + self.relu = nn.ReLU(inplace=True) + + def forward(self, equi_feat, c2e_feat): + + x = torch.cat([equi_feat, c2e_feat], 1) + x = self.relu(self.conv(x)) + return x + + +# Based on https://github.com/Yeh-yu-hsuan/BiFuse/blob/master/models/FCRN.py +class BiProj(nn.Module): + + def __init__(self, channels, **kwargs): + super(BiProj, self).__init__() + + self.conv_c2e = nn.Sequential( + nn.Conv2d(channels, channels, kernel_size=3, padding=1), + nn.ReLU(inplace=True)) + self.conv_e2c = nn.Sequential( + nn.Conv2d(channels, channels, kernel_size=3, padding=1), + nn.ReLU(inplace=True)) + self.conv_mask = nn.Sequential( + nn.Conv2d(channels * 2, 1, kernel_size=1, padding=0), nn.Sigmoid()) + + def forward(self, equi_feat, c2e_feat): + aaa = self.conv_e2c(equi_feat) + tmp_equi = self.conv_c2e(c2e_feat) + mask_equi = self.conv_mask(torch.cat([aaa, tmp_equi], dim=1)) + tmp_equi = tmp_equi.clone() * mask_equi + return equi_feat + tmp_equi + + +# from https://github.com/moskomule/senet.pytorch/blob/master/senet/se_module.py +class SELayer(nn.Module): + + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y.expand_as(x) + + +class CEELayer(nn.Module): + + def __init__(self, channels, SE=True): + super(CEELayer, self).__init__() + + self.res_conv1 = nn.Conv2d( + channels * 2, channels, kernel_size=1, padding=0, bias=False) + self.res_bn1 = nn.BatchNorm2d(channels) + + self.res_conv2 = nn.Conv2d( + channels, channels, kernel_size=3, padding=1, bias=False) + self.res_bn2 = nn.BatchNorm2d(channels) + self.relu = nn.ReLU(inplace=True) + + self.SE = SE + if self.SE: + self.selayer = SELayer(channels * 2) + + self.conv = nn.Conv2d(channels * 2, channels, 
1, bias=False) + + def forward(self, equi_feat, c2e_feat): + + x = torch.cat([equi_feat, c2e_feat], 1) + x = self.relu(self.res_bn1(self.res_conv1(x))) + shortcut = self.res_bn2(self.res_conv2(x)) + + x = c2e_feat + shortcut + x = torch.cat([equi_feat, x], 1) + if self.SE: + x = self.selayer(x) + x = self.relu(self.conv(x)) + return x diff --git a/modelscope/models/cv/panorama_depth_estimation/networks/mobilenet.py b/modelscope/models/cv/panorama_depth_estimation/networks/mobilenet.py new file mode 100644 index 00000000..d49b9a05 --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/networks/mobilenet.py @@ -0,0 +1,238 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Modified from https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenet.py +from torch import nn + +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url + +__all__ = ['MobileNetV2', 'mobilenet_v2'] + + +def _make_divisible(v, divisor, min_value=None): + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + :param v: + :param divisor: + :param min_value: + :return: + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNReLU(nn.Sequential): + + def __init__(self, + in_planes, + out_planes, + kernel_size=3, + stride=1, + groups=1, + norm_layer=None): + padding = (kernel_size - 1) // 2 + if norm_layer is None: + norm_layer = nn.BatchNorm2d + super(ConvBNReLU, self).__init__( + nn.Conv2d( + in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=groups, + bias=False), norm_layer(out_planes), nn.ReLU6(inplace=True)) + + +class InvertedResidual(nn.Module): + + def __init__(self, inp, oup, stride, expand_ratio, norm_layer=None): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + # pw + layers.append( + ConvBNReLU( + inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)) + layers.extend([ + # dw + ConvBNReLU( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + norm_layer=norm_layer), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + norm_layer(oup), + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + + def __init__(self, + width_mult=1.0, + inverted_residual_setting=None, + round_nearest=8, + block=None, + norm_layer=None): + """ + MobileNet V2 main class + + Args: + num_classes (int): Number of classes + width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount + inverted_residual_setting: Network structure + round_nearest (int): Round the number of channels in each layer to be a multiple of this number + Set to 1 to turn off rounding + block: Module specifying inverted residual building block for mobilenet + norm_layer: Module specifying the normalization 
layer to use + + """ + super(MobileNetV2, self).__init__() + + if block is None: + block = InvertedResidual + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + input_channel = 32 + + if inverted_residual_setting is None: + inverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # only check the first element, assuming user knows t,c,n,s are required + if len(inverted_residual_setting) == 0 or len( + inverted_residual_setting[0]) != 4: + raise ValueError('inverted_residual_setting should be non-empty ' + 'or a 4-element list, got {}'.format( + inverted_residual_setting)) + + # building first layer + input_channel = _make_divisible(input_channel * width_mult, + round_nearest) + features = [ + ConvBNReLU(3, input_channel, stride=2, norm_layer=norm_layer) + ] + # building inverted residual blocks + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * width_mult, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append( + block( + input_channel, + output_channel, + stride, + expand_ratio=t, + norm_layer=norm_layer)) + input_channel = output_channel + # building last several layers + # features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1, norm_layer=norm_layer)) + # make it nn.Sequential + self.features = nn.Sequential(*features) + """ + # remove fcn as we don't need it for a depth prediction task + # building classifier + self.classifier = nn.Sequential( + nn.Dropout(0.2), + nn.Linear(self.last_channel, num_classes), + ) + """ + + # weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.zeros_(m.bias) + + def _forward_impl(self, x): + # This exists since TorchScript doesn't support inheritance, so the superclass method + # (this one) needs to have a name other than `forward` that can be accessed in a subclass + # Cannot use "squeeze" as batch-size can be 1 => must use reshape with x.shape[0] + + st = 0 + for i in range(2): + x = self.features[st + i](x) + st = st + 2 + feat0 = x + + for i in range(2): + x = self.features[st + i](x) + st = st + 2 + feat1 = x + + for i in range(3): + x = self.features[st + i](x) + st = st + 3 + feat2 = x + + for i in range(7): + x = self.features[st + i](x) + st = st + 7 + feat3 = x + + for i in range(4): + x = self.features[st + i](x) + feat4 = x + + return feat0, feat1, feat2, feat3, feat4 + + def forward(self, x): + return self._forward_impl(x) + + +def mobilenet_v2(pretrained=False, progress=True, **kwargs): + """ + Constructs a MobileNetV2 architecture from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. 
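# Editorial sketch, not part of the patch: _make_divisible above rounds channel counts to
# a multiple of `divisor` while refusing to drop more than 10% below the requested value;
# restated standalone with a few spot checks (divisor 8, the default round_nearest above):
def make_divisible(v, divisor=8, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:   # never round down by more than 10%
        new_v += divisor
    return new_v


print([make_divisible(v) for v in (16, 24.0, 30, 33, 100)])   # [16, 24, 32, 32, 104]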
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet (deprecated) + progress (bool): If True, displays a progress bar of the download to stderr + """ + model = MobileNetV2(**kwargs) + + return model diff --git a/modelscope/models/cv/panorama_depth_estimation/networks/resnet.py b/modelscope/models/cv/panorama_depth_estimation/networks/resnet.py new file mode 100644 index 00000000..e6e6b1d3 --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/networks/resnet.py @@ -0,0 +1,424 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Modified from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py +import torch +import torch.nn as nn + +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url + +__all__ = [ + 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', + 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', + 'wide_resnet101_2' +] + + +def conv3x3(in_planes, out_planes, padding, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=padding, + groups=groups, + bias=False, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + 'Dilation > 1 not supported in BasicBlock') + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + + self.conv1 = conv3x3(inplanes, planes, 1, stride) + + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + + self.conv2 = conv3x3(planes, planes, 1) + + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(identity) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
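# Editorial sketch, not part of the patch: as the note above explains, this Bottleneck
# follows the ResNet V1.5 convention and places the stride on the 3x3 convolution rather
# than on the first 1x1. A quick shape check of the 1x1 -> strided 3x3 -> 1x1 path, with
# illustrative channel and spatial sizes:
import torch
import torch.nn as nn

x = torch.randn(1, 256, 56, 56)
x = nn.Conv2d(256, 64, 1, bias=False)(x)                       # 1x1 reduce, stride 1
x = nn.Conv2d(64, 64, 3, stride=2, padding=1, bias=False)(x)   # stride 2 lives on the 3x3
x = nn.Conv2d(64, 256, 1, bias=False)(x)                       # 1x1 expand (64 * expansion 4)
print(x.shape)   # torch.Size([1, 256, 28, 28])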
+ + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + + self.conv2 = conv3x3(width, width, 1, stride, groups, dilation) + + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, + block, + layers, + num_input_images=1, + zero_init_residual=False, + groups=1, + width_per_group=64, + replace_stride_with_dilation=None, + norm_layer=None): + super(ResNet, self).__init__() + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError('replace_stride_with_dilation should be None ' + 'or a 3-element tuple, got {}'.format( + replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + + self.conv1 = nn.Conv2d( + 3 * num_input_images, + self.inplanes, + kernel_size=7, + stride=2, + padding=3, + bias=False, + padding_mode='zeros') + + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer( + block, + 128, + layers[1], + stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer( + block, + 256, + layers[2], + stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer( + block, + 512, + layers[3], + stride=2, + dilate=replace_stride_with_dilation[2]) + """ + # remove fcn as we don't need it for a depth prediction task + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512 * block.expansion, num_classes) + """ + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x): + # See note [TorchScript super()] + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + """ + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + """ + + return x + + def forward(self, x): + return self._forward_impl(x) + + +def _resnet(arch, + block, + layers, + pretrained, + progress, + num_input_images=1, + **kwargs): + model = ResNet(block, layers, num_input_images=num_input_images, **kwargs) + + return model + + +def resnet18(pretrained=False, progress=True, **kwargs): + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet (deprecated) + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, + **kwargs) + + +def resnet34(pretrained=False, progress=True, **kwargs): + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet50(pretrained=False, progress=True, **kwargs): + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet101(pretrained=False, progress=True, **kwargs): + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, + progress, **kwargs) + + +def resnet152(pretrained=False, progress=True, **kwargs): + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar 
of the download to stderr + """ + return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, + progress, **kwargs) + + +def resnext50_32x4d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 4 + return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def resnext101_32x8d(pretrained=False, progress=True, **kwargs): + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 8 + return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], pretrained, + progress, **kwargs) + + +def wide_resnet50_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], pretrained, + progress, **kwargs) + + +def wide_resnet101_2(pretrained=False, progress=True, **kwargs): + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_ + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], pretrained, + progress, **kwargs) diff --git a/modelscope/models/cv/panorama_depth_estimation/networks/unifuse.py b/modelscope/models/cv/panorama_depth_estimation/networks/unifuse.py new file mode 100644 index 00000000..5715c89b --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/networks/unifuse.py @@ -0,0 +1,219 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
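Both encoder families added above expose the same five-level feature pyramid (strides 2, 4, 8, 16 and 32, with channels [64, 64, 128, 256, 512] for resnet18/34 and [16, 24, 32, 96, 320] for mobilenet_v2) that the UniFuse decoder in this file fuses with cubemap features. Below is a minimal CPU-only sketch of that contract, assuming the modules are importable from the package paths introduced above and using a dummy 512x1024 equirectangular input; the input size is an assumption for illustration, not a requirement of the code.

import torch

from modelscope.models.cv.panorama_depth_estimation.networks.mobilenet import \
    mobilenet_v2
from modelscope.models.cv.panorama_depth_estimation.networks.resnet import resnet18

equi = torch.randn(1, 3, 512, 1024)  # one equirectangular RGB frame (dummy size)

# ResNet encoders are driven stage by stage, as UniFuse.forward does below.
enc = resnet18()
x = enc.relu(enc.bn1(enc.conv1(equi)))  # stride 2  -> [1, 64, 256, 512]
f1 = enc.layer1(enc.maxpool(x))         # stride 4  -> [1, 64, 128, 256]
f2 = enc.layer2(f1)                     # stride 8  -> [1, 128, 64, 128]
f3 = enc.layer3(f2)                     # stride 16 -> [1, 256, 32, 64]
f4 = enc.layer4(f3)                     # stride 32 -> [1, 512, 16, 32]

# The MobileNetV2 encoder returns the same five scales in a single call.
feats = mobilenet_v2()(equi)            # (feat0, ..., feat4), strides 2..32
print([tuple(f.shape) for f in (x, f1, f2, f3, f4)])
print([tuple(f.shape) for f in feats])

This is why UniFuse.forward below has two encoding branches: ResNet backbones are unrolled explicitly (conv1/bn1/relu, maxpool, layer1..layer4), while the MobileNetV2 backbone already returns the five feature maps directly.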
+from __future__ import absolute_import, division, print_function +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn + +from .layers import (BiProj, CEELayer, Concat, Conv3x3, ConvBlock, + Cube2Equirec, upsample) +from .mobilenet import mobilenet_v2 +from .resnet import resnet18, resnet34, resnet50, resnet101, resnet152 + + +class UniFuse(nn.Module): + """ UniFuse Model: Resnet based Euqi Encoder and Cube Encoder + Euqi Decoder + """ + + def __init__(self, + num_layers, + equi_h, + equi_w, + pretrained=False, + max_depth=10.0, + fusion_type='cee', + se_in_fusion=True): + super(UniFuse, self).__init__() + + self.num_layers = num_layers + self.equi_h = equi_h + self.equi_w = equi_w + self.cube_h = equi_h // 2 + + self.fusion_type = fusion_type + self.se_in_fusion = se_in_fusion + + # encoder + encoder = { + 2: mobilenet_v2, + 18: resnet18, + 34: resnet34, + 50: resnet50, + 101: resnet101, + 152: resnet152 + } + + if num_layers not in encoder: + raise ValueError( + '{} is not a valid number of resnet layers'.format(num_layers)) + self.equi_encoder = encoder[num_layers](pretrained) + self.cube_encoder = encoder[num_layers](pretrained) + + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + if num_layers < 18: + self.num_ch_enc = np.array([16, 24, 32, 96, 320]) + + # decoder + self.num_ch_dec = np.array([16, 32, 64, 128, 256]) + self.equi_dec_convs = OrderedDict() + self.c2e = {} + + Fusion_dict = {'cat': Concat, 'biproj': BiProj, 'cee': CEELayer} + FusionLayer = Fusion_dict[self.fusion_type] + + self.c2e['5'] = Cube2Equirec(self.cube_h // 32, self.equi_h // 32, + self.equi_w // 32) + + self.equi_dec_convs['fusion_5'] = FusionLayer( + self.num_ch_enc[4], SE=self.se_in_fusion) + self.equi_dec_convs['upconv_5'] = ConvBlock(self.num_ch_enc[4], + self.num_ch_dec[4]) + + self.c2e['4'] = Cube2Equirec(self.cube_h // 16, self.equi_h // 16, + self.equi_w // 16) + self.equi_dec_convs['fusion_4'] = FusionLayer( + self.num_ch_enc[3], SE=self.se_in_fusion) + self.equi_dec_convs['deconv_4'] = ConvBlock( + self.num_ch_dec[4] + self.num_ch_enc[3], self.num_ch_dec[4]) + self.equi_dec_convs['upconv_4'] = ConvBlock(self.num_ch_dec[4], + self.num_ch_dec[3]) + + self.c2e['3'] = Cube2Equirec(self.cube_h // 8, self.equi_h // 8, + self.equi_w // 8) + self.equi_dec_convs['fusion_3'] = FusionLayer( + self.num_ch_enc[2], SE=self.se_in_fusion) + self.equi_dec_convs['deconv_3'] = ConvBlock( + self.num_ch_dec[3] + self.num_ch_enc[2], self.num_ch_dec[3]) + self.equi_dec_convs['upconv_3'] = ConvBlock(self.num_ch_dec[3], + self.num_ch_dec[2]) + + self.c2e['2'] = Cube2Equirec(self.cube_h // 4, self.equi_h // 4, + self.equi_w // 4) + self.equi_dec_convs['fusion_2'] = FusionLayer( + self.num_ch_enc[1], SE=self.se_in_fusion) + self.equi_dec_convs['deconv_2'] = ConvBlock( + self.num_ch_dec[2] + self.num_ch_enc[1], self.num_ch_dec[2]) + self.equi_dec_convs['upconv_2'] = ConvBlock(self.num_ch_dec[2], + self.num_ch_dec[1]) + + self.c2e['1'] = Cube2Equirec(self.cube_h // 2, self.equi_h // 2, + self.equi_w // 2) + self.equi_dec_convs['fusion_1'] = FusionLayer( + self.num_ch_enc[0], SE=self.se_in_fusion) + self.equi_dec_convs['deconv_1'] = ConvBlock( + self.num_ch_dec[1] + self.num_ch_enc[0], self.num_ch_dec[1]) + self.equi_dec_convs['upconv_1'] = ConvBlock(self.num_ch_dec[1], + self.num_ch_dec[0]) + + self.equi_dec_convs['deconv_0'] = ConvBlock(self.num_ch_dec[0], + self.num_ch_dec[0]) + + self.equi_dec_convs['depthconv_0'] = 
Conv3x3(self.num_ch_dec[0], 1) + + self.equi_decoder = nn.ModuleList(list(self.equi_dec_convs.values())) + self.projectors = nn.ModuleList(list(self.c2e.values())) + + self.sigmoid = nn.Sigmoid() + + self.max_depth = nn.Parameter( + torch.tensor(max_depth), requires_grad=False) + + def forward(self, input_equi_image, input_cube_image): + + # euqi image encoding + + if self.num_layers < 18: + equi_enc_feat0, equi_enc_feat1, equi_enc_feat2, equi_enc_feat3, equi_enc_feat4 \ + = self.equi_encoder(input_equi_image) + else: + x = self.equi_encoder.conv1(input_equi_image) + x = self.equi_encoder.relu(self.equi_encoder.bn1(x)) + equi_enc_feat0 = x + + x = self.equi_encoder.maxpool(x) + equi_enc_feat1 = self.equi_encoder.layer1(x) + equi_enc_feat2 = self.equi_encoder.layer2(equi_enc_feat1) + equi_enc_feat3 = self.equi_encoder.layer3(equi_enc_feat2) + equi_enc_feat4 = self.equi_encoder.layer4(equi_enc_feat3) + + # cube image encoding + cube_inputs = torch.cat( + torch.split(input_cube_image, self.cube_h, dim=-1), dim=0) + + if self.num_layers < 18: + cube_enc_feat0, cube_enc_feat1, cube_enc_feat2, cube_enc_feat3, cube_enc_feat4 \ + = self.cube_encoder(cube_inputs) + else: + + x = self.cube_encoder.conv1(cube_inputs) + x = self.cube_encoder.relu(self.cube_encoder.bn1(x)) + cube_enc_feat0 = x + + x = self.cube_encoder.maxpool(x) + + cube_enc_feat1 = self.cube_encoder.layer1(x) + cube_enc_feat2 = self.cube_encoder.layer2(cube_enc_feat1) + cube_enc_feat3 = self.cube_encoder.layer3(cube_enc_feat2) + cube_enc_feat4 = self.cube_encoder.layer4(cube_enc_feat3) + + # euqi image decoding fused with cubemap features + outputs = {} + + cube_enc_feat4 = torch.cat( + torch.split(cube_enc_feat4, input_equi_image.shape[0], dim=0), + dim=-1) + c2e_enc_feat4 = self.c2e['5'](cube_enc_feat4) + fused_feat4 = self.equi_dec_convs['fusion_5'](equi_enc_feat4, + c2e_enc_feat4) + equi_x = upsample(self.equi_dec_convs['upconv_5'](fused_feat4)) + + cube_enc_feat3 = torch.cat( + torch.split(cube_enc_feat3, input_equi_image.shape[0], dim=0), + dim=-1) + c2e_enc_feat3 = self.c2e['4'](cube_enc_feat3) + fused_feat3 = self.equi_dec_convs['fusion_4'](equi_enc_feat3, + c2e_enc_feat3) + equi_x = torch.cat([equi_x, fused_feat3], 1) + equi_x = self.equi_dec_convs['deconv_4'](equi_x) + equi_x = upsample(self.equi_dec_convs['upconv_4'](equi_x)) + + cube_enc_feat2 = torch.cat( + torch.split(cube_enc_feat2, input_equi_image.shape[0], dim=0), + dim=-1) + c2e_enc_feat2 = self.c2e['3'](cube_enc_feat2) + fused_feat2 = self.equi_dec_convs['fusion_3'](equi_enc_feat2, + c2e_enc_feat2) + equi_x = torch.cat([equi_x, fused_feat2], 1) + equi_x = self.equi_dec_convs['deconv_3'](equi_x) + equi_x = upsample(self.equi_dec_convs['upconv_3'](equi_x)) + + cube_enc_feat1 = torch.cat( + torch.split(cube_enc_feat1, input_equi_image.shape[0], dim=0), + dim=-1) + c2e_enc_feat1 = self.c2e['2'](cube_enc_feat1) + fused_feat1 = self.equi_dec_convs['fusion_2'](equi_enc_feat1, + c2e_enc_feat1) + equi_x = torch.cat([equi_x, fused_feat1], 1) + equi_x = self.equi_dec_convs['deconv_2'](equi_x) + equi_x = upsample(self.equi_dec_convs['upconv_2'](equi_x)) + + cube_enc_feat0 = torch.cat( + torch.split(cube_enc_feat0, input_equi_image.shape[0], dim=0), + dim=-1) + c2e_enc_feat0 = self.c2e['1'](cube_enc_feat0) + fused_feat0 = self.equi_dec_convs['fusion_1'](equi_enc_feat0, + c2e_enc_feat0) + equi_x = torch.cat([equi_x, fused_feat0], 1) + equi_x = self.equi_dec_convs['deconv_1'](equi_x) + equi_x = upsample(self.equi_dec_convs['upconv_1'](equi_x)) + + equi_x = 
self.equi_dec_convs['deconv_0'](equi_x) + + equi_depth = self.equi_dec_convs['depthconv_0'](equi_x) + outputs['pred_depth'] = self.max_depth * self.sigmoid(equi_depth) + + return outputs diff --git a/modelscope/models/cv/panorama_depth_estimation/networks/util.py b/modelscope/models/cv/panorama_depth_estimation/networks/util.py new file mode 100644 index 00000000..546ccc92 --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/networks/util.py @@ -0,0 +1,111 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import cv2 +import numpy as np +from scipy.ndimage import map_coordinates + + +class Equirec2Cube: + + def __init__(self, equ_h, equ_w, face_w): + ''' + equ_h: int, height of the equirectangular image + equ_w: int, width of the equirectangular image + face_w: int, the length of each face of the cubemap + ''' + + self.equ_h = equ_h + self.equ_w = equ_w + self.face_w = face_w + + self._xyzcube() + self._xyz2coor() + + # For convert R-distance to Z-depth for CubeMaps + cosmap = 1 / np.sqrt((2 * self.grid[..., 0])**2 + + (2 * self.grid[..., 1])**2 + 1) + self.cosmaps = np.concatenate(6 * [cosmap], axis=1)[..., np.newaxis] + + def _xyzcube(self): + ''' + Compute the xyz cordinates of the unit cube in [F R B L U D] format. + ''' + self.xyz = np.zeros((self.face_w, self.face_w * 6, 3), np.float32) + rng = np.linspace(-0.5, 0.5, num=self.face_w, dtype=np.float32) + self.grid = np.stack(np.meshgrid(rng, -rng), -1) + + # Front face (z = 0.5) + self.xyz[:, 0 * self.face_w:1 * self.face_w, [0, 1]] = self.grid + self.xyz[:, 0 * self.face_w:1 * self.face_w, 2] = 0.5 + + # Right face (x = 0.5) + self.xyz[:, 1 * self.face_w:2 * self.face_w, + [2, 1]] = self.grid[:, ::-1] + self.xyz[:, 1 * self.face_w:2 * self.face_w, 0] = 0.5 + + # Back face (z = -0.5) + self.xyz[:, 2 * self.face_w:3 * self.face_w, + [0, 1]] = self.grid[:, ::-1] + self.xyz[:, 2 * self.face_w:3 * self.face_w, 2] = -0.5 + + # Left face (x = -0.5) + self.xyz[:, 3 * self.face_w:4 * self.face_w, [2, 1]] = self.grid + self.xyz[:, 3 * self.face_w:4 * self.face_w, 0] = -0.5 + + # Up face (y = 0.5) + self.xyz[:, 4 * self.face_w:5 * self.face_w, + [0, 2]] = self.grid[::-1, :] + self.xyz[:, 4 * self.face_w:5 * self.face_w, 1] = 0.5 + + # Down face (y = -0.5) + self.xyz[:, 5 * self.face_w:6 * self.face_w, [0, 2]] = self.grid + self.xyz[:, 5 * self.face_w:6 * self.face_w, 1] = -0.5 + + def _xyz2coor(self): + + # x, y, z to longitude and latitude + x, y, z = np.split(self.xyz, 3, axis=-1) + lon = np.arctan2(x, z) + c = np.sqrt(x**2 + z**2) + lat = np.arctan2(y, c) + + # longitude and latitude to equirectangular coordinate + self.coor_x = (lon / (2 * np.pi) + 0.5) * self.equ_w - 0.5 + self.coor_y = (-lat / np.pi + 0.5) * self.equ_h - 0.5 + + def sample_equirec(self, e_img, order=0): + pad_u = np.roll(e_img[[0]], self.equ_w // 2, 1) + pad_d = np.roll(e_img[[-1]], self.equ_w // 2, 1) + e_img = np.concatenate([e_img, pad_d, pad_u], 0) + + return map_coordinates( + e_img, [self.coor_y, self.coor_x], order=order, mode='wrap')[..., + 0] + + def run(self, equ_img, equ_dep=None): + + h, w = equ_img.shape[:2] + if h != self.equ_h or w != self.equ_w: + equ_img = cv2.resize(equ_img, (self.equ_w, self.equ_h)) + if equ_dep is not None: + equ_dep = cv2.resize( + equ_dep, (self.equ_w, self.equ_h), + interpolation=cv2.INTER_NEAREST) + + cube_img = np.stack([ + self.sample_equirec(equ_img[..., i], order=1) + for i in range(equ_img.shape[2]) + ], + axis=-1) # noqa + + if equ_dep is not None: + cube_dep = np.stack([ + self.sample_equirec(equ_dep[..., 
i], order=0) + for i in range(equ_dep.shape[2]) + ], + axis=-1) # noqa + cube_dep = cube_dep * self.cosmaps + + if equ_dep is not None: + return cube_img, cube_dep + else: + return cube_img diff --git a/modelscope/models/cv/panorama_depth_estimation/unifuse_model.py b/modelscope/models/cv/panorama_depth_estimation/unifuse_model.py new file mode 100644 index 00000000..5fe33e52 --- /dev/null +++ b/modelscope/models/cv/panorama_depth_estimation/unifuse_model.py @@ -0,0 +1,89 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp + +import numpy as np +import torch +from torchvision import transforms + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.panorama_depth_estimation.networks import (Equi, + UniFuse) +from modelscope.models.cv.panorama_depth_estimation.networks.util import \ + Equirec2Cube +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@MODELS.register_module( + Tasks.panorama_depth_estimation, + module_name=Models.unifuse_depth_estimation) +class PanoramaDepthEstimation(TorchModel): + """ + UniFuse: Unidirectional Fusion for 360 Panorama Depth Estimation + https://arxiv.org/abs/2102.03550 + """ + + def __init__(self, model_dir: str, **kwargs): + """ + Args: + model_dir: the path of the pretrained model file + """ + super().__init__(model_dir, **kwargs) + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model {model_path}') + model_dict = torch.load(model_path, map_location=torch.device('cpu')) + Net_dict = {'UniFuse': UniFuse, 'Equi': Equi} + Net = Net_dict[model_dict['net']] + self.w = model_dict['width'] + self.h = model_dict['height'] + self.max_depth_meters = 10.0 + self.e2c = Equirec2Cube(self.h, self.w, self.h // 2) + self.to_tensor = transforms.ToTensor() + self.normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + # build model + self.model = Net( + model_dict['layers'], + model_dict['height'], + model_dict['width'], + max_depth=self.max_depth_meters, + fusion_type=model_dict['fusion'], + se_in_fusion=model_dict['se_in_fusion']) + + # load state dict + self.model.to(self.device) + model_state_dict = self.model.state_dict() + self.model.load_state_dict( + {k: v + for k, v in model_dict.items() if k in model_state_dict}) + self.model.eval() + + logger.info(f'model init done! 
Device:{self.device}') + + def forward(self, Inputs): + """ + Args: + Inputs: model inputs containning equirectangular panorama images and the corresponding cubmap images + The torch size of Inputs['rgb'] should be [n, 3, 512, 1024] + The torch size of Inputs['cube_rgb'] should be [n, 3, 256, 1536] + Returns: + Unifuse model outputs containing the predicted equirectangular depth images in metric + """ + equi_inputs = Inputs['rgb'].to(self.device) + cube_inputs = Inputs['cube_rgb'].to(self.device) + return self.model(equi_inputs, cube_inputs) + + def postprocess(self, Inputs): + depth_result = Inputs['pred_depth'][0] + results = {OutputKeys.DEPTHS: depth_result} + return results diff --git a/modelscope/models/cv/pointcloud_sceneflow_estimation/__init__.py b/modelscope/models/cv/pointcloud_sceneflow_estimation/__init__.py new file mode 100644 index 00000000..b9953f61 --- /dev/null +++ b/modelscope/models/cv/pointcloud_sceneflow_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .rcp_model import SceneFlowEstimation + +else: + _import_structure = { + 'rcp_model': ['SceneFlowEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/pointcloud_sceneflow_estimation/common.py b/modelscope/models/cv/pointcloud_sceneflow_estimation/common.py new file mode 100644 index 00000000..ee44b51f --- /dev/null +++ b/modelscope/models/cv/pointcloud_sceneflow_estimation/common.py @@ -0,0 +1,446 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from . 
import pointnet2_utils as pointutils + +RADIUS = 2.5 + + +def index_points_group(points, knn_idx): + """ + Input: + points: input points data, [B, N, C] + knn_idx: sample index data, [B, N, K] + Return: + new_points:, indexed points data, [B, N, K, C] + """ + points_flipped = points.permute(0, 2, 1).contiguous() + new_points = pointutils.grouping_operation(points_flipped, + knn_idx.int()).permute( + 0, 2, 3, 1) + + return new_points + + +def curvature(pc, nsample=10, radius=RADIUS): + # pc: B 3 N + assert pc.shape[1] == 3 + pc = pc.permute(0, 2, 1) + + dist, kidx = pointutils.knn(nsample, pc.contiguous(), + pc.contiguous()) # (B, N, 10) + + if radius is not None: + tmp_idx = kidx[:, :, 0].unsqueeze(2).repeat(1, 1, + nsample).to(kidx.device) + kidx[dist > radius] = tmp_idx[dist > radius] + + grouped_pc = index_points_group(pc, kidx) # B N 10 3 + pc_curvature = torch.sum(grouped_pc - pc.unsqueeze(2), dim=2) / 9.0 + return pc_curvature # B N 3 + + +class PointNetSetAbstractionRatio(nn.Module): + + def __init__(self, + ratio, + radius, + nsample, + in_channel, + mlp, + group_all, + return_fps=False, + use_xyz=True, + use_act=True, + act=F.relu, + mean_aggr=False, + use_instance_norm=False): + super(PointNetSetAbstractionRatio, self).__init__() + self.ratio = ratio + self.radius = radius + self.nsample = nsample + self.group_all = group_all + self.use_xyz = use_xyz + self.use_act = use_act + self.mean_aggr = mean_aggr + self.act = act + self.mlp_convs = nn.ModuleList() + self.mlp_bns = nn.ModuleList() + last_channel = (in_channel + 3) if use_xyz else in_channel + for out_channel in mlp: + self.mlp_convs.append( + nn.Conv2d(last_channel, out_channel, 1, bias=False)) + if use_instance_norm: + self.mlp_bns.append( + nn.InstanceNorm2d(out_channel, affine=True)) + else: + self.mlp_bns.append(nn.BatchNorm2d(out_channel)) + + last_channel = out_channel + + if group_all: + self.queryandgroup = pointutils.GroupAll(self.use_xyz) + else: + self.queryandgroup = pointutils.QueryAndGroup( + radius, nsample, self.use_xyz) + self.return_fps = return_fps + + def forward(self, xyz, points, fps_idx=None): + """ + Input: + xyz: input points position data, [B, C, N] + points: input points data, [B, D, N] + Return: + new_xyz: sampled points position data, [B, C, S] + new_points: sample points feature data, [B, D', S] + """ + B, C, N = xyz.shape + npoint = int(N * self.ratio) + + xyz = xyz.contiguous() + xyz_t = xyz.permute(0, 2, 1).contiguous() + + if (self.group_all is False) and (npoint != -1): + if fps_idx is None: + fps_idx = pointutils.furthest_point_sample(xyz_t, + npoint) # [B, N] + new_xyz = pointutils.gather_operation(xyz, fps_idx) # [B, C, N] + else: + new_xyz = xyz + new_points, _ = self.queryandgroup(xyz_t, + new_xyz.transpose(2, + 1).contiguous(), + points) # [B, 3+C, N, S] + + # new_xyz: sampled points position data, [B, C, npoint] + # new_points: sampled points data, [B, C+D, npoint, nsample] + for i, conv in enumerate(self.mlp_convs): + if self.use_act: + bn = self.mlp_bns[i] + new_points = self.act(bn(conv(new_points))) + else: + new_points = conv(new_points) + + if self.mean_aggr: + new_points = torch.mean(new_points, -1) + else: + new_points = torch.max(new_points, -1)[0] + + if self.return_fps: + return new_xyz, new_points, fps_idx + else: + return new_xyz, new_points + + +class PointNetSetAbstraction(nn.Module): + + def __init__(self, + npoint, + radius, + nsample, + in_channel, + mlp, + group_all, + return_fps=False, + use_xyz=True, + use_act=True, + act=F.relu, + mean_aggr=False, + 
use_instance_norm=False): + super(PointNetSetAbstraction, self).__init__() + self.npoint = npoint + self.radius = radius + self.nsample = nsample + self.group_all = group_all + self.use_xyz = use_xyz + self.use_act = use_act + self.mean_aggr = mean_aggr + self.act = act + self.mlp_convs = nn.ModuleList() + self.mlp_bns = nn.ModuleList() + last_channel = (in_channel + 3) if use_xyz else in_channel + for out_channel in mlp: + self.mlp_convs.append( + nn.Conv2d(last_channel, out_channel, 1, bias=False)) + if use_instance_norm: + self.mlp_bns.append( + nn.InstanceNorm2d(out_channel, affine=True)) + else: + self.mlp_bns.append(nn.BatchNorm2d(out_channel)) + + last_channel = out_channel + + if group_all: + self.queryandgroup = pointutils.GroupAll(self.use_xyz) + else: + self.queryandgroup = pointutils.QueryAndGroup( + radius, nsample, self.use_xyz) + self.return_fps = return_fps + + def forward(self, xyz, points, fps_idx=None): + """ + Input: + xyz: input points position data, [B, C, N] + points: input points data, [B, D, N] + Return: + new_xyz: sampled points position data, [B, S, C] + new_points: sample points feature data, [B, S, D'] + """ + # device = xyz.device + B, C, N = xyz.shape + xyz = xyz.contiguous() + xyz_t = xyz.permute(0, 2, 1).contiguous() + + if (self.group_all is False) and (self.npoint != -1): + if fps_idx is None: + fps_idx = pointutils.furthest_point_sample( + xyz_t, self.npoint) # [B, N] + new_xyz = pointutils.gather_operation(xyz, fps_idx) # [B, C, N] + else: + new_xyz = xyz + new_points, _ = self.queryandgroup(xyz_t, + new_xyz.transpose(2, + 1).contiguous(), + points) # [B, 3+C, N, S] + + # new_xyz: sampled points position data, [B, C, npoint] + # new_points: sampled points data, [B, C+D, npoint, nsample] + for i, conv in enumerate(self.mlp_convs): + if self.use_act: + bn = self.mlp_bns[i] + new_points = self.act(bn(conv(new_points))) + else: + new_points = conv(new_points) + + if self.mean_aggr: + new_points = torch.mean(new_points, -1) + else: + new_points = torch.max(new_points, -1)[0] + + if self.return_fps: + return new_xyz, new_points, fps_idx + else: + return new_xyz, new_points + + +class PointNetFeaturePropogation(nn.Module): + + def __init__(self, in_channel, mlp, learn_mask=False, nsample=3): + super(PointNetFeaturePropogation, self).__init__() + self.mlp_convs = nn.ModuleList() + self.mlp_bns = nn.ModuleList() + self.apply_mlp = mlp is not None + last_channel = in_channel + self.nsample = nsample + if self.apply_mlp: + for out_channel in mlp: + self.mlp_convs.append(nn.Conv1d(last_channel, out_channel, 1)) + self.mlp_bns.append(nn.BatchNorm1d(out_channel)) + last_channel = out_channel + + if learn_mask: + self.queryandgroup = pointutils.QueryAndGroup( + None, 9, use_xyz=True) + last_channel = (128 + 3) + for out_channel in [32, 1]: + self.mlp_convs.append( + nn.Conv2d(last_channel, out_channel, 1, bias=False)) + self.mlp_bns.append(nn.BatchNorm2d(out_channel)) + last_channel = out_channel + + def forward(self, pos1, pos2, feature1, feature2, hidden=None): + """ + Input: + pos1: input points position data, [B, C, N] + pos2: sampled input points position data, [B, C, S] + feature1: input points data, [B, D, N] + feature2: input points data, [B, D, S] + Return: + feat_new: upsampled points data, [B, D', N] + """ + pos1_t = pos1.permute(0, 2, 1).contiguous() + pos2_t = pos2.permute(0, 2, 1).contiguous() + B, C, N = pos1.shape + + if hidden is None: + if self.nsample == 3: + dists, idx = pointutils.three_nn(pos1_t, pos2_t) + else: + dists, idx = 
pointutils.knn(self.nsample, pos1_t, pos2_t) + dists[dists < 1e-10] = 1e-10 + weight = 1.0 / dists + weight = weight / torch.sum(weight, -1, keepdim=True) # [B,N,3] + interpolated_feat = torch.sum( + pointutils.grouping_operation(feature2, idx) + * weight.view(B, 1, N, self.nsample), + dim=-1) # [B,C,N,3] + else: + dist, idx = pointutils.knn(9, pos1_t, pos2_t) + + new_feat, _ = self.queryandgroup(pos2_t, pos1_t, + hidden) # [B, 3+C, N, 9] + + for i, conv in enumerate(self.mlp_convs): + new_feat = conv(new_feat) + weight = torch.softmax(new_feat, dim=-1) # [B, 1, N, 9] + interpolated_feat = torch.sum( + pointutils.grouping_operation(feature2, idx) * weight, + dim=-1) # [B, C, N] + + if feature1 is not None: + feat_new = torch.cat([interpolated_feat, feature1], 1) + else: + feat_new = interpolated_feat + + if self.apply_mlp: + for i, conv in enumerate(self.mlp_convs): + bn = self.mlp_bns[i] + feat_new = F.relu(bn(conv(feat_new))) + return feat_new + + +class Sinkhorn(nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, corr, epsilon, gamma, max_iter): + # Early return if no iteration + if max_iter == 0: + return corr + + # Init. of Sinkhorn algorithm + power = gamma / (gamma + epsilon) + a = ( + torch.ones((corr.shape[0], corr.shape[1], 1), + device=corr.device, + dtype=corr.dtype) / corr.shape[1]) + prob1 = ( + torch.ones((corr.shape[0], corr.shape[1], 1), + device=corr.device, + dtype=corr.dtype) / corr.shape[1]) + prob2 = ( + torch.ones((corr.shape[0], corr.shape[2], 1), + device=corr.device, + dtype=corr.dtype) / corr.shape[2]) + + # Sinkhorn algorithm + for _ in range(max_iter): + # Update b + KTa = torch.bmm(corr.transpose(1, 2), a) + b = torch.pow(prob2 / (KTa + 1e-8), power) + # Update a + Kb = torch.bmm(corr, b) + a = torch.pow(prob1 / (Kb + 1e-8), power) + + # Transportation map + T = torch.mul(torch.mul(a, corr), b.transpose(1, 2)) + + return T + + +class PointWiseOptimLayer(nn.Module): + + def __init__(self, nsample, radius, in_channel, mlp, use_curvature=True): + super().__init__() + self.nsample = nsample + self.radius = radius + self.use_curvature = use_curvature + + self.pos_embed = nn.Sequential( + nn.Conv1d(3, 32, 1), nn.ReLU(inplace=True), nn.Conv1d(32, 64, 1)) + + self.qk_net = nn.Sequential( + nn.Conv1d(in_channel + 64, in_channel + 64, 1), + nn.ReLU(inplace=True), + nn.Conv1d(in_channel + 64, in_channel + 64, 1)) + if self.use_curvature: + self.curvate_net = nn.Sequential( + nn.Conv1d(3, 32, 1), nn.ReLU(inplace=True), + nn.Conv1d(32, 32, 1)) + self.mlp_conv = nn.Conv1d( + in_channel + 64 + 32, mlp[-1], 1, bias=True) + else: + self.mlp_conv = nn.Conv1d(in_channel + 64, mlp[-1], 1, bias=True) + + def forward(self, + pos1, + pos2, + feature1, + feature2, + nsample, + radius=None, + pos1_raw=None, + return_score=False): + """ + Input: + pos1: (batch_size, 3, npoint) + pos2: (batch_size, 3, npoint) + feature1: (batch_size, channel, npoint) + feature2: (batch_size, channel, npoint) + Output: + pos1: (batch_size, 3, npoint) + cost: (batch_size, channel, npoint) + """ + pos1_t = pos1.permute(0, 2, 1).contiguous() + pos2_t = pos2.permute(0, 2, 1).contiguous() + self.nsample = nsample + self.radius = radius + + dist, idx = pointutils.knn(self.nsample, pos1_t, pos2_t) # [B, N, K] + if self.radius is not None: + tmp_idx = idx[:, :, + 0].unsqueeze(2).repeat(1, 1, + self.nsample).to(idx.device) + idx[dist > self.radius] = tmp_idx[dist > self.radius] + + pos1_embed_norm = self.pos_embed(pos1) + pos2_embed_norm = self.pos_embed(pos2) # [B, C1, N] + + 
feat1_w_pos = torch.cat([feature1, pos1_embed_norm], dim=1) + feat2_w_pos = torch.cat([feature2, pos2_embed_norm], + dim=1) # [B, C1+C2, N] + + feat1_w_pos = self.qk_net(feat1_w_pos) + feat2_w_pos = self.qk_net(feat2_w_pos) # [B, C1+C2, N] + + feat2_grouped = pointutils.grouping_operation(feat2_w_pos, + idx) # [B, C1+C2, N, S] + + score = torch.softmax( + feat1_w_pos.unsqueeze(-1) * feat2_grouped * 1. + / math.sqrt(feat1_w_pos.shape[1]), + dim=-1) # [B, C1+C2, N, S] + cost = (score * (feat1_w_pos.unsqueeze(-1) - feat2_grouped)**2).sum( + dim=-1) # [B, C1+C2, N] + + if self.use_curvature: + curvate1_raw = curvature(pos1_raw).permute(0, 2, 1) # [B, 3, N] + curvate1 = curvature(pos1).permute(0, 2, 1) # [B, 3, N] + curvate_cost = self.curvate_net(curvate1_raw) - self.curvate_net( + curvate1) + curvate_cost = curvate_cost**2 + cost = self.mlp_conv(torch.cat([cost, curvate_cost], + dim=1)) # [B, C, N] + else: + cost = self.mlp_conv(cost) # [B, C, N] + + if return_score: + pos2_grouped = pointutils.grouping_operation(pos2, + idx) # [B, 3, N, S] + # [B, N, K] + index = (dist > self.radius).sum( + dim=2, keepdim=True).float() > (dist.shape[2] - 0.1 + ) # [B, N, 1] + index = index.unsqueeze(1).repeat(1, score.shape[1], 1, + dist.shape[2]) # [B, N, K] + score_tmp = score.clone() + score_tmp[index] = 0.0 + score = score_tmp + return pos1, cost, score, pos2_grouped + else: + return pos1, cost diff --git a/modelscope/models/cv/pointcloud_sceneflow_estimation/pointnet2_utils.py b/modelscope/models/cv/pointcloud_sceneflow_estimation/pointnet2_utils.py new file mode 100644 index 00000000..ddc66229 --- /dev/null +++ b/modelscope/models/cv/pointcloud_sceneflow_estimation/pointnet2_utils.py @@ -0,0 +1,360 @@ +# The implementation is adopt from PointNet2, open-sourced under MIT license, +# made publicy available at https://github.com/sshaoshuai/Pointnet2.PyTorch +from typing import Tuple + +import pointnet2_cuda as pointnet2 +import torch +import torch.nn as nn +from torch.autograd import Function, Variable + + +class FurthestPointSampling(Function): + + @staticmethod + def forward(ctx, xyz: torch.Tensor, npoint: int) -> torch.Tensor: + """ + Uses iterative furthest point sampling to select a set of npoint features that have the largest + minimum distance + :param ctx: + :param xyz: (B, N, 3) where N > npoint + :param npoint: int, number of features in the sampled set + :return: + output: (B, npoint) tensor containing the set + """ + assert xyz.is_contiguous() + + B, N, _ = xyz.size() + output = torch.cuda.IntTensor(B, npoint) + temp = torch.cuda.FloatTensor(B, N).fill_(1e10) + + pointnet2.furthest_point_sampling_wrapper(B, N, npoint, xyz, temp, + output) + return output + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply + + +class GatherOperation(Function): + + @staticmethod + def forward(ctx, features: torch.Tensor, + idx: torch.Tensor) -> torch.Tensor: + """ + :param ctx: + :param features: (B, C, N) + :param idx: (B, npoint) index tensor of the features to gather + :return: + output: (B, C, npoint) + """ + assert features.is_contiguous() + assert idx.is_contiguous() + + B, npoint = idx.size() + _, C, N = features.size() + output = torch.cuda.FloatTensor(B, C, npoint) + + pointnet2.gather_points_wrapper(B, C, N, npoint, features, idx, output) + + ctx.for_backwards = (idx, C, N) + return output + + @staticmethod + def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + B, npoint = idx.size() + + grad_features = 
Variable(torch.cuda.FloatTensor(B, C, N).zero_()) + grad_out_data = grad_out.data.contiguous() + pointnet2.gather_points_grad_wrapper(B, C, N, npoint, grad_out_data, + idx, grad_features.data) + return grad_features, None + + +gather_operation = GatherOperation.apply + + +class KNN(Function): + + @staticmethod + def forward(ctx, k: int, unknown: torch.Tensor, + known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Find the three nearest neighbors of unknown in known + :param ctx: + :param unknown: (B, N, 3) + :param known: (B, M, 3) + :return: + dist: (B, N, k) l2 distance to the three nearest neighbors + idx: (B, N, k) index of 3 nearest neighbors + """ + assert unknown.is_contiguous() + assert known.is_contiguous() + + B, N, _ = unknown.size() + m = known.size(1) + dist2 = torch.cuda.FloatTensor(B, N, k) + idx = torch.cuda.IntTensor(B, N, k) + + pointnet2.knn_wrapper(B, N, m, k, unknown, known, dist2, idx) + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None, None + + +knn = KNN.apply + + +class ThreeNN(Function): + + @staticmethod + def forward(ctx, unknown: torch.Tensor, + known: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Find the three nearest neighbors of unknown in known + :param ctx: + :param unknown: (B, N, 3) + :param known: (B, M, 3) + :return: + dist: (B, N, 3) l2 distance to the three nearest neighbors + idx: (B, N, 3) index of 3 nearest neighbors + """ + assert unknown.is_contiguous() + assert known.is_contiguous() + + B, N, _ = unknown.size() + m = known.size(1) + dist2 = torch.cuda.FloatTensor(B, N, 3) + idx = torch.cuda.IntTensor(B, N, 3) + + pointnet2.three_nn_wrapper(B, N, m, unknown, known, dist2, idx) + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None + + +three_nn = ThreeNN.apply + + +class ThreeInterpolate(Function): + + @staticmethod + def forward(ctx, features: torch.Tensor, idx: torch.Tensor, + weight: torch.Tensor) -> torch.Tensor: + """ + Performs weight linear interpolation on 3 features + :param ctx: + :param features: (B, C, M) Features descriptors to be interpolated from + :param idx: (B, n, 3) three nearest neighbors of the target features in features + :param weight: (B, n, 3) weights + :return: + output: (B, C, N) tensor of the interpolated features + """ + assert features.is_contiguous() + assert idx.is_contiguous() + assert weight.is_contiguous() + + B, c, m = features.size() + n = idx.size(1) + ctx.three_interpolate_for_backward = (idx, weight, m) + output = torch.cuda.FloatTensor(B, c, n) + + pointnet2.three_interpolate_wrapper(B, c, m, n, features, idx, weight, + output) + return output + + @staticmethod + def backward( + ctx, grad_out: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + :param ctx: + :param grad_out: (B, C, N) tensor with gradients of outputs + :return: + grad_features: (B, C, M) tensor with gradients of features + None: + None: + """ + idx, weight, m = ctx.three_interpolate_for_backward + B, c, n = grad_out.size() + + grad_features = Variable(torch.cuda.FloatTensor(B, c, m).zero_()) + grad_out_data = grad_out.data.contiguous() + + pointnet2.three_interpolate_grad_wrapper(B, c, n, m, grad_out_data, + idx, weight, + grad_features.data) + return grad_features, None, None + + +three_interpolate = ThreeInterpolate.apply + + +class GroupingOperation(Function): + + @staticmethod + def forward(ctx, features: torch.Tensor, + idx: torch.Tensor) -> torch.Tensor: + """ + :param 
ctx: + :param features: (B, C, N) tensor of features to group + :param idx: (B, npoint, nsample) tensor containing the indicies of features to group with + :return: + output: (B, C, npoint, nsample) tensor + """ + assert features.is_contiguous() + assert idx.is_contiguous() + idx = idx.int() + B, nfeatures, nsample = idx.size() + _, C, N = features.size() + output = torch.cuda.FloatTensor(B, C, nfeatures, nsample) + + pointnet2.group_points_wrapper(B, C, N, nfeatures, nsample, features, + idx, output) + + ctx.for_backwards = (idx, N) + return output + + @staticmethod + def backward(ctx, + grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + :param ctx: + :param grad_out: (B, C, npoint, nsample) tensor of the gradients of the output from forward + :return: + grad_features: (B, C, N) gradient of the features + """ + idx, N = ctx.for_backwards + + B, C, npoint, nsample = grad_out.size() + grad_features = Variable(torch.cuda.FloatTensor(B, C, N).zero_()) + + grad_out_data = grad_out.data.contiguous() + pointnet2.group_points_grad_wrapper(B, C, N, npoint, nsample, + grad_out_data, idx, + grad_features.data) + return grad_features, None + + +grouping_operation = GroupingOperation.apply + + +class BallQuery(Function): + + @staticmethod + def forward(ctx, radius: float, nsample: int, xyz: torch.Tensor, + new_xyz: torch.Tensor) -> torch.Tensor: + """ + :param ctx: + :param radius: float, radius of the balls + :param nsample: int, maximum number of features in the balls + :param xyz: (B, N, 3) xyz coordinates of the features + :param new_xyz: (B, npoint, 3) centers of the ball query + :return: + idx: (B, npoint, nsample) tensor with the indicies of the features that form the query balls + """ + assert new_xyz.is_contiguous() + assert xyz.is_contiguous() + + B, N, _ = xyz.size() + npoint = new_xyz.size(1) + idx = torch.cuda.IntTensor(B, npoint, nsample).zero_() + + pointnet2.ball_query_wrapper(B, N, npoint, radius, nsample, new_xyz, + xyz, idx) + return idx + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None + + +ball_query = BallQuery.apply + + +class QueryAndGroup(nn.Module): + + def __init__(self, radius: float, nsample: int, use_xyz: bool = True): + """ + :param radius: float, radius of ball + :param nsample: int, maximum number of features to gather in the ball + :param use_xyz: + """ + super().__init__() + self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz + + def forward(self, + xyz: torch.Tensor, + new_xyz: torch.Tensor, + features: torch.Tensor = None) -> Tuple[torch.Tensor]: + """ + :param xyz: (B, N, 3) xyz coordinates of the features + :param new_xyz: (B, npoint, 3) centroids + :param features: (B, C, N) descriptors of the features + :return: + new_features: (B, 3 + C, npoint, nsample) + """ + # idx = ball_query(self.radius, self.nsample, xyz, new_xyz) + B, N, C = new_xyz.shape + dist, idx = knn(self.nsample, new_xyz, xyz) + if self.radius is not None: + tmp_idx = idx[:, :, + 0].unsqueeze(2).repeat(1, 1, + self.nsample).to(idx.device) + idx[dist > self.radius] = tmp_idx[dist > self.radius] + xyz_trans = xyz.transpose(1, 2).contiguous() + grouped_xyz = grouping_operation(xyz_trans, + idx) # (B, 3, npoint, nsample) + grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + new_features = torch.cat([grouped_xyz, grouped_features], + dim=1) # (B, C + 3, npoint, nsample) + else: + new_features = grouped_features + else: + assert 
self.use_xyz, 'Cannot have not features and not use xyz as a feature!' + new_features = grouped_xyz + + return new_features, grouped_xyz + + +class GroupAll(nn.Module): + + def __init__(self, use_xyz: bool = True): + super().__init__() + self.use_xyz = use_xyz + + def forward(self, + xyz: torch.Tensor, + new_xyz: torch.Tensor, + features: torch.Tensor = None): + """ + :param xyz: (B, N, 3) xyz coordinates of the features + :param new_xyz: ignored + :param features: (B, C, N) descriptors of the features + :return: + new_features: (B, C + 3, 1, N) + """ + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + new_features = torch.cat([grouped_xyz, grouped_features], + dim=1) # (B, 3 + C, 1, N) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + return new_features, grouped_xyz diff --git a/modelscope/models/cv/pointcloud_sceneflow_estimation/rcp_model.py b/modelscope/models/cv/pointcloud_sceneflow_estimation/rcp_model.py new file mode 100644 index 00000000..8d2e5b2e --- /dev/null +++ b/modelscope/models/cv/pointcloud_sceneflow_estimation/rcp_model.py @@ -0,0 +1,64 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp + +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .sf_rcp import SF_RCP + +logger = get_logger() + + +@MODELS.register_module( + Tasks.pointcloud_sceneflow_estimation, + module_name=Models.rcp_sceneflow_estimation) +class SceneFlowEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + assert torch.cuda.is_available( + ), 'current model only support run in gpu' + + # build model + self.model = SF_RCP( + npoint=8192, + use_instance_norm=False, + model_name='SF_RCP', + use_insrance_norm=False, + use_curvature=True) + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + + logger.info(f'load ckpt from:{model_path}') + + checkpoint = torch.load(model_path, map_location='cpu') + + self.model.load_state_dict({k: v for k, v in checkpoint.items()}) + self.model.cuda() + self.model.eval() + + def forward(self, Inputs): + + return self.model(Inputs['pcd1'], Inputs['pcd2'], Inputs['pcd1'], + Inputs['pcd2'])[-1] + + def postprocess(self, Inputs): + output = Inputs['output'] + + results = {OutputKeys.OUTPUT: output.detach().cpu().numpy()[0]} + + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/pointcloud_sceneflow_estimation/sf_rcp.py b/modelscope/models/cv/pointcloud_sceneflow_estimation/sf_rcp.py new file mode 100644 index 00000000..f8ec6057 --- /dev/null +++ b/modelscope/models/cv/pointcloud_sceneflow_estimation/sf_rcp.py @@ -0,0 +1,523 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
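SF_RCP, defined in this file, obtains its initial flow from a soft feature-matching step: per-point features are correlated, turned into an entropic transport kernel, balanced with the Sinkhorn iterations from common.py above, and the barycenter of the second cloud under that transport (minus the first cloud) becomes the flow estimate that the GRU refinement then iterates on. The following is a self-contained CPU-only sketch of that initialization with random tensors; the sizes and the epsilon/gamma values are illustrative assumptions (the model learns both as parameters, and its neighborhood kernels run through the CUDA pointnet2 ops above).

import torch

B, N, C = 1, 64, 32
pc1 = torch.randn(B, N, 3)  # source points  [B, N1, 3]
pc2 = torch.randn(B, N, 3)  # target points  [B, N2, 3]
feat1 = torch.nn.functional.normalize(torch.randn(B, N, C), dim=-1)
feat2 = torch.nn.functional.normalize(torch.randn(B, N, C), dim=-1)

# Correlation -> transport kernel (cf. FeatureMatching.calc_corr_mat below).
epsilon = 1.03  # matches exp(0) + 0.03 at initialization
gamma = 1.0     # illustrative; a learned scalar in the model
corr = torch.bmm(feat1, feat2.transpose(1, 2))  # [B, N1, N2]
K = torch.exp(-(1.0 - corr) / epsilon)

# Sinkhorn iterations (cf. common.Sinkhorn): balance row/column masses of K.
power = gamma / (gamma + epsilon)
a = torch.full((B, N, 1), 1.0 / N)
prob1 = torch.full((B, N, 1), 1.0 / N)
prob2 = torch.full((B, N, 1), 1.0 / N)
for _ in range(10):
    b = torch.pow(prob2 / (torch.bmm(K.transpose(1, 2), a) + 1e-8), power)
    a = torch.pow(prob1 / (torch.bmm(K, b) + 1e-8), power)
T = a * K * b.transpose(1, 2)  # transport map [B, N1, N2]

# Soft correspondence: barycenter of pc2 under T, minus pc1
# (cf. FeatureMatching.get_flow_init below).
flow_init = (T @ pc2) / (T.sum(-1, keepdim=True) + 1e-6) - pc1  # [B, N1, 3]
print(flow_init.shape)  # torch.Size([1, 64, 3])

The same pattern appears below in FeatureMatching.calc_corr_mat and FeatureMatching.get_flow_init, where a support mask additionally zeroes correspondences between points farther apart than support_th.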
+import torch +import torch.nn as nn +import torch.nn.functional as F + +from .common import (PointNetFeaturePropogation, PointNetSetAbstraction, + PointWiseOptimLayer, Sinkhorn) + + +class FeatureMatching(nn.Module): + + def __init__(self, npoint, use_instance_norm, supporth_th, feature_norm, + max_iter): + super(FeatureMatching, self).__init__() + self.support_th = supporth_th**2 # 10m + self.feature_norm = feature_norm + self.max_iter = max_iter + # Mass regularisation + self.gamma = torch.nn.Parameter(torch.zeros(1)) + # Entropic regularisation + self.epsilon = torch.nn.Parameter(torch.zeros(1)) + self.sinkhorn = Sinkhorn() + + self.extract_glob = FeatureExtractionGlobal(npoint, use_instance_norm) + + # upsample flow + self.fp0 = PointNetFeaturePropogation(in_channel=3, mlp=[]) + self.sa1 = PointNetSetAbstraction( + npoint=int(npoint / 16), + radius=None, + nsample=16, + in_channel=3, + mlp=[32, 32, 64], + group_all=False, + use_instance_norm=use_instance_norm) + self.fp1 = PointNetFeaturePropogation(in_channel=64, mlp=[]) + self.sa2 = PointNetSetAbstraction( + npoint=int(npoint / 8), + radius=None, + nsample=16, + in_channel=64, + mlp=[64, 64, 128], + group_all=False, + use_instance_norm=use_instance_norm) + self.fp2 = PointNetFeaturePropogation(in_channel=128, mlp=[]) + + self.flow_regressor = FlowRegressor(npoint, use_instance_norm) + self.flow_up_sample = PointNetFeaturePropogation(in_channel=3, mlp=[]) + + def upsample_flow(self, pc1_l, pc1_l_glob, flow_inp): + """ + flow_inp: [B, N, 3] + return: [B, 3, N] + """ + + flow_inp = flow_inp.permute(0, 2, 1).contiguous() # [B, 3, N] + + flow_feat = self.fp0(pc1_l_glob['s16'], pc1_l_glob['s32'], None, + flow_inp) + _, corr_feats_l2 = self.sa1(pc1_l_glob['s16'], flow_feat) + + flow_feat = self.fp1(pc1_l_glob['s8'], pc1_l_glob['s16'], None, + corr_feats_l2) + _, flow_feat = self.sa2(pc1_l_glob['s8'], flow_feat) + + flow_feat = self.fp2(pc1_l['s4'], pc1_l_glob['s8'], None, flow_feat) + + flow, flow_lr = self.flow_regressor(pc1_l, flow_feat) + + flow_up = self.flow_up_sample(pc1_l['s1'], pc1_l_glob['s32'], None, + flow_inp) + flow_lr_up = self.flow_up_sample(pc1_l['s4'], pc1_l_glob['s32'], None, + flow_inp) + + flow, flow_lr = flow + flow_up, flow_lr + flow_lr_up + + return flow, flow_lr + + def calc_feats_corr(self, pcloud1, pcloud2, feature1, feature2, norm): + """ + pcloud1, pcloud2: [B, N, 3] + feature1, feature2: [B, N, C] + """ + if norm: + feature1 = feature1 / torch.sqrt( + torch.sum(feature1**2, -1, keepdim=True) + 1e-6) + feature2 = feature2 / torch.sqrt( + torch.sum(feature2**2, -1, keepdim=True) + 1e-6) + corr_mat = torch.bmm(feature1, + feature2.transpose(1, 2)) # [B, N1, N2] + else: + corr_mat = torch.bmm(feature1, feature2.transpose( + 1, 2)) / feature1.shape[2]**.5 # [B, N1, N2] + + if self.support_th is not None: + distance_matrix = torch.sum( + pcloud1**2, -1, keepdim=True) # [B, N1, 1] + distance_matrix = distance_matrix + torch.sum( + pcloud2**2, -1, keepdim=True).transpose(1, 2) # [B, N1, N2] + distance_matrix = distance_matrix - 2 * torch.bmm( + pcloud1, pcloud2.transpose(1, 2)) # [B, N1, N2] + support = (distance_matrix < self.support_th) # [B, N1, N2] + support = support.float() + else: + support = torch.ones_like(corr_mat) + return corr_mat, support + + def calc_corr_mat(self, pcloud1, pcloud2, feature1, feature2): + """ + pcloud1, pcloud2: [B, N, 3] + feature1, feature2: [B, N, C] + corr_mat: [B, N1, N2] + """ + epsilon = torch.exp(self.epsilon) + 0.03 + corr_mat, support = self.calc_feats_corr( + pcloud1, pcloud2, 
feature1, feature2, norm=self.feature_norm) + C = 1.0 - corr_mat + corr_mat = torch.exp(-C / epsilon) * support + return corr_mat + + def get_flow_init(self, pcloud1, pcloud2, feats1, feats2): + """ + pcloud1, pcloud2: [B, 3, N] + feats1, feats2: [B, C, N] + """ + + corr_mat = self.calc_corr_mat( + pcloud1.permute(0, 2, 1), pcloud2.permute(0, 2, 1), + feats1.permute(0, 2, 1), feats2.permute(0, 2, 1)) + + corr_mat = self.sinkhorn(corr_mat, + torch.exp(self.epsilon) + 0.03, self.gamma, + self.max_iter) + + row_sum = corr_mat.sum(-1, keepdim=True) # [B, N1, 1] + flow_init = (corr_mat @ pcloud2.permute(0, 2, 1).contiguous()) / ( + row_sum + 1e-6) - pcloud1.permute(0, 2, + 1).contiguous() # [B, N1, 3] + + return flow_init + + def forward(self, pc1_l, pc2_l, feats1, feats2): + """ + pc1_l, pc2_l: dict([B, 3, N]) + feats1, feats2: [B, C, N] + """ + pc1_l_glob, feats1_glob = self.extract_glob(pc1_l['s4'], feats1) + pc2_l_glob, feats2_glob = self.extract_glob(pc2_l['s4'], feats2) + + flow_init_s32 = self.get_flow_init(pc1_l_glob['s32'], + pc2_l_glob['s32'], feats1_glob, + feats2_glob) + + flow_init, flow_init_s4 = self.upsample_flow(pc1_l, pc1_l_glob, + flow_init_s32) + + return flow_init, flow_init_s4 + + +class FlowRegressor(nn.Module): + + def __init__(self, npoint, use_instance_norm, input_dim=128, nsample=32): + super(FlowRegressor, self).__init__() + self.sa1 = PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=nsample, + in_channel=input_dim, + mlp=[input_dim, input_dim], + group_all=False, + use_instance_norm=use_instance_norm) + self.sa2 = PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=nsample, + in_channel=input_dim, + mlp=[input_dim, input_dim], + group_all=False, + use_instance_norm=use_instance_norm) + + self.fc = nn.Sequential( + nn.Linear(input_dim, input_dim), nn.ReLU(inplace=True), + nn.Linear(input_dim, 3)) + + self.up_sample = PointNetFeaturePropogation(in_channel=3, mlp=[]) + + def forward(self, pc1_l, feats): + """ + pc1_l: dict([B, 3, N]) + feats: [B, C, N] + return: [B, 3, N] + """ + _, x = self.sa1(pc1_l['s4'], feats) + _, x = self.sa2(pc1_l['s4'], x) + x = x.permute(0, 2, 1).contiguous() # [B, N, C] + x = self.fc(x) + flow_lr = x.permute(0, 2, 1).contiguous() # [B, 3, N] + + flow = self.up_sample(pc1_l['s1'], pc1_l['s4'], None, + flow_lr) # [B, 3, N] + + return flow, flow_lr + + +class FeatureExtractionGlobal(nn.Module): + + def __init__(self, npoint, use_instance_norm): + super(FeatureExtractionGlobal, self).__init__() + self.sa1 = PointNetSetAbstraction( + npoint=int(npoint / 8), + radius=None, + nsample=32, + in_channel=64, + mlp=[128, 128, 128], + group_all=False, + use_instance_norm=use_instance_norm) + self.sa2 = PointNetSetAbstraction( + npoint=int(npoint / 16), + radius=None, + nsample=24, + in_channel=128, + mlp=[128, 128, 128], + group_all=False, + use_instance_norm=use_instance_norm) + self.sa3 = PointNetSetAbstraction( + npoint=int(npoint / 32), + radius=None, + nsample=16, + in_channel=128, + mlp=[256, 256, 256], + group_all=False, + use_instance_norm=use_instance_norm) + + def forward(self, pc, feature): + pc_l1, feat_l1 = self.sa1(pc, feature) + pc_l2, feat_l2 = self.sa2(pc_l1, feat_l1) + pc_l3, feat_l3 = self.sa3(pc_l2, feat_l2) + + pc_l = dict(s8=pc_l1, s16=pc_l2, s32=pc_l3) + return pc_l, feat_l3 + + +class FeatureExtraction(nn.Module): + + def __init__(self, npoint, use_instance_norm): + super(FeatureExtraction, self).__init__() + self.sa1 = PointNetSetAbstraction( + npoint=int(npoint / 2), + 
radius=None, + nsample=32, + in_channel=3, + mlp=[32, 32, 32], + group_all=False, + return_fps=True, + use_instance_norm=use_instance_norm) + self.sa2 = PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=32, + in_channel=32, + mlp=[64, 64, 64], + group_all=False, + return_fps=True, + use_instance_norm=use_instance_norm) + + def forward(self, pc, feature, fps_idx=None): + """ + pc: [B, 3, N] + feature: [B, 3, N] + """ + fps_idx1 = fps_idx['s2'] if fps_idx is not None else None + pc_l1, feat_l1, fps_idx1 = self.sa1(pc, feature, fps_idx=fps_idx1) + fps_idx2 = fps_idx['s4'] if fps_idx is not None else None + pc_l2, feat_l2, fps_idx2 = self.sa2(pc_l1, feat_l1, fps_idx=fps_idx2) + pc_l = dict(s1=pc, s2=pc_l1, s4=pc_l2) + fps_idx = dict(s2=fps_idx1, s4=fps_idx2) + return pc_l, feat_l2, fps_idx + + +class HiddenInitNet(nn.Module): + + def __init__(self, npoint, use_instance_norm): + super(HiddenInitNet, self).__init__() + self.sa1 = PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=8, + in_channel=64, + mlp=[128, 128, 128], + group_all=False, + use_instance_norm=use_instance_norm) + self.sa2 = PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=8, + in_channel=128, + mlp=[128], + group_all=False, + use_act=False, + use_instance_norm=use_instance_norm) + + def forward(self, pc, feature): + _, feat_l1 = self.sa1(pc, feature) + _, feat_l2 = self.sa2(pc, feat_l1) + + h_init = torch.tanh(feat_l2) + return h_init + + +class GRUReg(nn.Module): + + def __init__(self, npoint, hidden_dim, input_dim, use_instance_norm): + super().__init__() + in_ch = hidden_dim + input_dim + + self.flow_proj = nn.ModuleList([ + PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=16, + in_channel=3, + mlp=[32, 32, 32], + group_all=False, + use_instance_norm=use_instance_norm), + PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=8, + in_channel=32, + mlp=[16, 16, 16], + group_all=False, + use_instance_norm=use_instance_norm) + ]) + + self.hidden_init_net = HiddenInitNet(npoint, use_instance_norm) + + self.gru_layers = nn.ModuleList([ + PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=4, + in_channel=in_ch, + mlp=[hidden_dim], + group_all=False, + use_act=False, + use_instance_norm=use_instance_norm), + PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=4, + in_channel=in_ch, + mlp=[hidden_dim], + group_all=False, + use_act=False, + use_instance_norm=use_instance_norm), + PointNetSetAbstraction( + npoint=int(npoint / 4), + radius=None, + nsample=4, + in_channel=in_ch, + mlp=[hidden_dim], + group_all=False, + use_act=False, + use_instance_norm=use_instance_norm) + ]) + + def gru(self, h, gru_inp, pc): + hx = torch.cat([h, gru_inp], dim=1) + z = torch.sigmoid(self.gru_layers[0](pc, hx)[1]) + r = torch.sigmoid(self.gru_layers[1](pc, hx)[1]) + q = torch.tanh(self.gru_layers[2](pc, torch.cat([r * h, gru_inp], + dim=1))[1]) + h = (1 - z) * h + z * q + return h + + def get_gru_input(self, feats1_new, cost, flow, pc): + flow_feats = flow + for flow_conv in self.flow_proj: + _, flow_feats = flow_conv(pc, flow_feats) + + gru_inp = torch.cat([feats1_new, cost, flow_feats, flow], + dim=1) # [64, 128, 16, 3] + + return gru_inp + + def forward(self, h, feats1_new, cost, flow_lr, pc1_l): + gru_inp = self.get_gru_input(feats1_new, cost, flow_lr, pc=pc1_l['s4']) + + h = self.gru(h, gru_inp, pc1_l['s4']) + return h + + +class SF_RCP(nn.Module): + + def __init__(self, 
npoint=8192, use_instance_norm=False, **kwargs): + super().__init__() + self.radius = kwargs.get('radius', 3.5) + self.nsample = kwargs.get('nsample', 6) + self.radius_min = kwargs.get('radius_min', 3.5) + self.nsample_min = kwargs.get('nsample_min', 6) + self.use_curvature = kwargs.get('use_curvature', True) + self.flow_ratio = kwargs.get('flow_ratio', 0.1) + self.init_max_iter = kwargs.get('init_max_iter', 0) + self.init_feature_norm = kwargs.get('init_feature_norm', True) + self.support_th = kwargs.get('support_th', 10) + + self.feature_extraction = FeatureExtraction(npoint, use_instance_norm) + self.feature_matching = FeatureMatching( + npoint, + use_instance_norm, + supporth_th=self.support_th, + feature_norm=self.init_feature_norm, + max_iter=self.init_max_iter) + + self.pointwise_optim_layer = PointWiseOptimLayer( + nsample=self.nsample, + radius=self.radius, + in_channel=64, + mlp=[128, 128, 128], + use_curvature=self.use_curvature) + + self.gru = GRUReg( + npoint, + hidden_dim=128, + input_dim=128 + 64 + 16 + 3, + use_instance_norm=use_instance_norm) + + self.flow_regressor = FlowRegressor(npoint, use_instance_norm) + + def initialization(self, pc1_l, pc2_l, feats1, feats2): + """ + pc1_l: dict([B, 3, N]) + pc2_l: dict([B, 3, N]) + feats1: [B, C, N] + feats2: [B, C, N] + """ + flow, flow_lr = self.feature_matching(pc1_l, pc2_l, feats1, feats2) + + return flow, flow_lr + + def pointwise_optimization(self, pc1_l_new, pc2_l, feats1_new, feats2, + pc1_l, flow_lr, iter): + _, cost, score, pos2_grouped = self.pointwise_optim_layer( + pc1_l_new['s4'], + pc2_l['s4'], + feats1_new, + feats2, + nsample=max(self.nsample_min, self.nsample // (2**iter)), + radius=max(self.radius_min, self.radius / (2**iter)), + pos1_raw=pc1_l['s4'], + return_score=True) + + # pc1_new_l_loc: [B, 3, N, S] + # pos2_grouped: [B, C, N, S] + delta_flow_tmp = ((pos2_grouped - pc1_l_new['s4'].unsqueeze(-1)) + * score.mean(dim=1, keepdim=True)).sum( + dim=-1) # [B, 3, N] + flow_lr = flow_lr + self.flow_ratio * delta_flow_tmp + + return flow_lr, cost + + def update_pos(self, pc, pc_lr, flow, flow_lr): + pc = pc + flow + pc_lr = pc_lr + flow_lr + return pc, pc_lr + + def forward(self, pc1, pc2, feature1, feature2, iters=1): + """ + pc1: [B, N, 3] + pc2: [B, N, 3] + feature1: [B, N, 3] + feature2: [B, N, 3] + """ + # prepare + flow_predictions = [] + pc1 = pc1.permute(0, 2, 1).contiguous() # B 3 N + pc2 = pc2.permute(0, 2, 1).contiguous() # B 3 N + feature1 = feature1.permute(0, 2, 1).contiguous() # B 3 N + feature2 = feature2.permute(0, 2, 1).contiguous() # B 3 N + + # feature extraction + pc1_l, feats1, fps_idx1 = self.feature_extraction(pc1, feature1) + pc2_l, feats2, _ = self.feature_extraction(pc2, feature2) + + # initialization, flow_lr_init(flow_low_resolution) + flow_init, flow_lr_init = self.initialization(pc1_l, pc2_l, feats1, + feats2) + flow_predictions.append(flow_init.permute(0, 2, 1)) + + # gru init hidden state + h = self.gru.hidden_init_net(pc1_l['s4'], feats1) + + # update position + pc1_lr_raw = pc1_l['s4'] + pc1_new, pc1_lr_new = self.update_pos(pc1, pc1_lr_raw, flow_init, + flow_lr_init) + + # iterative optim + for iter in range(iters - 1): + pc1_new = pc1_new.detach() + pc1_lr_new = pc1_lr_new.detach() + flow_lr = pc1_lr_new - pc1_lr_raw + + pc1_l_new, feats1_new, _ = self.feature_extraction( + pc1_new, pc1_new, fps_idx1) + + # pointwise optimization to get updated flow_lr and cost + flow_lr_update, cost = self.pointwise_optimization( + pc1_l_new, pc2_l, feats1_new, feats2, pc1_l, flow_lr, iter) + flow_lr =
flow_lr_update + + # gru regularization + h = self.gru(h, feats1_new, cost, flow_lr, pc1_l) + # pred flow_lr + delta_flow, delta_flow_lr = self.flow_regressor(pc1_l, h) + + pc1_new, pc1_lr_new = self.update_pos(pc1_new, pc1_lr_new, + delta_flow, delta_flow_lr) + + flow = pc1_new - pc1 + flow_predictions.append(flow.permute(0, 2, 1)) + + return flow_predictions diff --git a/modelscope/models/cv/tinynas_detection/core/__init__.py b/modelscope/models/cv/tinynas_detection/core/__init__.py deleted file mode 100644 index 50a10d0b..00000000 --- a/modelscope/models/cv/tinynas_detection/core/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. diff --git a/modelscope/models/cv/tinynas_detection/core/utils.py b/modelscope/models/cv/tinynas_detection/core/utils.py deleted file mode 100644 index 29f08f05..00000000 --- a/modelscope/models/cv/tinynas_detection/core/utils.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. - -import numpy as np -import torch -import torchvision - -__all__ = [ - 'filter_box', - 'postprocess_airdet', - 'bboxes_iou', - 'matrix_iou', - 'adjust_box_anns', - 'xyxy2xywh', - 'xyxy2cxcywh', -] - - -def multiclass_nms(multi_bboxes, - multi_scores, - score_thr, - iou_thr, - max_num=100, - score_factors=None): - """NMS for multi-class bboxes. - - Args: - multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) - multi_scores (Tensor): shape (n, #class), where the last column - contains scores of the background class, but this will be ignored. - score_thr (float): bbox threshold, bboxes with scores lower than it - will not be considered. - nms_thr (float): NMS IoU threshold - max_num (int): if there are more than max_num bboxes after NMS, - only top max_num will be kept. - score_factors (Tensor): The factors multiplied to scores before - applying NMS - - Returns: - tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels \ - are 0-based. 
- """ - num_classes = multi_scores.size(1) - # exclude background category - if multi_bboxes.shape[1] > 4: - bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) - else: - bboxes = multi_bboxes[:, None].expand( - multi_scores.size(0), num_classes, 4) - scores = multi_scores - # filter out boxes with low scores - valid_mask = scores > score_thr # 1000 * 80 bool - - # We use masked_select for ONNX exporting purpose, - # which is equivalent to bboxes = bboxes[valid_mask] - # (TODO): as ONNX does not support repeat now, - # we have to use this ugly code - # bboxes -> 1000, 4 - bboxes = torch.masked_select( - bboxes, - torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), - -1)).view(-1, 4) # mask-> 1000*80*4, 80000*4 - if score_factors is not None: - scores = scores * score_factors[:, None] - scores = torch.masked_select(scores, valid_mask) - labels = valid_mask.nonzero(as_tuple=False)[:, 1] - - if bboxes.numel() == 0: - bboxes = multi_bboxes.new_zeros((0, 5)) - labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) - scores = multi_bboxes.new_zeros((0, )) - - return bboxes, scores, labels - - keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr) - - if max_num > 0: - keep = keep[:max_num] - - return bboxes[keep], scores[keep], labels[keep] - - -def filter_box(output, scale_range): - """ - output: (N, 5+class) shape - """ - min_scale, max_scale = scale_range - w = output[:, 2] - output[:, 0] - h = output[:, 3] - output[:, 1] - keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale) - return output[keep] - - -def filter_results(boxlist, num_classes, nms_thre): - boxes = boxlist.bbox - scores = boxlist.get_field('scores') - cls = boxlist.get_field('labels') - nms_out_index = torchvision.ops.batched_nms( - boxes, - scores, - cls, - nms_thre, - ) - boxlist = boxlist[nms_out_index] - - return boxlist - - -def postprocess_airdet(prediction, - num_classes, - conf_thre=0.7, - nms_thre=0.45, - imgs=None): - box_corner = prediction.new(prediction.shape) - box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 - box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 - box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 - box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 - prediction[:, :, :4] = box_corner[:, :, :4] - output = [None for _ in range(len(prediction))] - for i, image_pred in enumerate(prediction): - # If none are remaining => process next image - if not image_pred.size(0): - continue - multi_bboxes = image_pred[:, :4] - multi_scores = image_pred[:, 5:] - detections, scores, labels = multiclass_nms(multi_bboxes, multi_scores, - conf_thre, nms_thre, 500) - detections = torch.cat( - (detections, scores[:, None], scores[:, None], labels[:, None]), - dim=1) - - if output[i] is None: - output[i] = detections - else: - output[i] = torch.cat((output[i], detections)) - return output - - -def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): - if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: - raise IndexError - - if xyxy: - tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) - br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) - area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) - area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) - else: - tl = torch.max( - (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), - (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), - ) - br = torch.min( - (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), - (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), - ) - - 
area_a = torch.prod(bboxes_a[:, 2:], 1) - area_b = torch.prod(bboxes_b[:, 2:], 1) - en = (tl < br).type(tl.type()).prod(dim=2) - area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) - return area_i / (area_a[:, None] + area_b - area_i) - - -def matrix_iou(a, b): - """ - return iou of a and b, numpy version for data augenmentation - """ - lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) - rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) - - area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) - area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) - area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) - return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12) - - -def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): - bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max) - bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) - return bbox - - -def xyxy2xywh(bboxes): - bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] - bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] - return bboxes - - -def xyxy2cxcywh(bboxes): - bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] - bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] - bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5 - bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5 - return bboxes diff --git a/modelscope/models/cv/tinynas_detection/damo/__init__.py b/modelscope/models/cv/tinynas_detection/damo/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/damo/apis/__init__.py b/modelscope/models/cv/tinynas_detection/damo/apis/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/apis/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py b/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py new file mode 100644 index 00000000..1db526f2 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/apis/detector_evaluater.py @@ -0,0 +1,63 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os + +import torch + +from modelscope.models.cv.tinynas_detection.damo.apis.detector_inference import \ + inference +from modelscope.models.cv.tinynas_detection.damo.detectors.detector import \ + build_local_model +from modelscope.msdatasets.task_datasets.damoyolo import (build_dataloader, + build_dataset) + + +def mkdir(path): + if not os.path.exists(path): + os.makedirs(path) + + +class Evaluater: + + def __init__(self, cfg): + self.cfg = cfg + self.output_dir = cfg.miscs.output_dir + self.exp_name = cfg.miscs.exp_name + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + + self.ckpt = torch.load( + self.cfg.test.checkpoint_path, map_location=self.device) + self.model = build_local_model(self.cfg, self.device) + self.model.load_state_dict(self.ckpt['model']) + self.val_loader = self.get_data_loader(self.cfg, False) + + def get_data_loader(self, cfg, distributed=False): + + val_dataset = build_dataset( + cfg, + cfg.dataset.val_image_dir, + cfg.dataset.val_ann, + is_train=False) + + val_loader = build_dataloader( + val_dataset, + cfg.test.augment, + batch_size=cfg.test.batch_size, + num_workers=cfg.miscs.num_workers, + is_train=False, + size_div=32, + distributed=distributed) + + return val_loader + + def evaluate(self): + + output_folder = os.path.join(self.output_dir, self.exp_name, + 'inference') + for data_loader_val in self.val_loader: + inference( + self.model, + data_loader_val, + device=self.device, + output_folder=output_folder, + ) diff --git a/modelscope/models/cv/tinynas_detection/damo/apis/detector_inference.py b/modelscope/models/cv/tinynas_detection/damo/apis/detector_inference.py new file mode 100644 index 00000000..47c1fb1b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/apis/detector_inference.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. +import os + +import torch +from tqdm import tqdm + +from modelscope.msdatasets.task_datasets.damoyolo.evaluation import evaluate +from modelscope.utils.logger import get_logger +from modelscope.utils.timer import Timer, get_time_str +from modelscope.utils.torch_utils import (all_gather, get_world_size, + is_master, synchronize) + +logger = get_logger() + + +def compute_on_dataset(model, data_loader, device, timer=None, tta=False): + model.eval() + results_dict = {} + cpu_device = torch.device('cpu') + for _, batch in enumerate(tqdm(data_loader)): + images, targets, image_ids = batch + with torch.no_grad(): + if timer: + timer.tic() + output = model(images.to(device)) + if timer: + # torch.cuda.synchronize() consume much time + timer.toc() + output = [o.to(cpu_device) if o is not None else o for o in output] + results_dict.update( + {img_id: result + for img_id, result in zip(image_ids, output)}) + return results_dict + + +def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu, + multi_gpu_infer): + if multi_gpu_infer: + all_predictions = all_gather(predictions_per_gpu) + else: + all_predictions = [predictions_per_gpu] + if not is_master(): + return + # merge the list of dicts + predictions = {} + for p in all_predictions: + predictions.update(p) + # convert a dict where the key is the index in a list + image_ids = list(sorted(predictions.keys())) + if len(image_ids) != image_ids[-1] + 1: + logger.warning( + 'Number of images that were gathered from multiple processes is' + 'not a contiguous set. 
Some images might be missing from the' + 'evaluation') + + # convert to a list + predictions = [predictions[i] for i in image_ids] + return predictions + + +def inference( + model, + data_loader, + iou_types=('bbox', ), + box_only=False, + device='cuda', + expected_results=(), + expected_results_sigma_tol=4, + output_folder=None, + multi_gpu_infer=True, +): + # convert to a torch.device for efficiency + device = torch.device(device) + num_devices = get_world_size() + dataset = data_loader.dataset + logger.info('Start evaluation ({} images).'.format(len(dataset))) + total_timer = Timer() + inference_timer = Timer() + total_timer.tic() + predictions = compute_on_dataset(model, data_loader, device, + inference_timer) + # wait for all processes to complete before measuring the time + if multi_gpu_infer: + synchronize() + total_time = total_timer.toc() + total_time_str = get_time_str(total_time) + logger.info( + 'Total run time: {} ({} s / img per device, on {} devices)'.format( + total_time_str, total_time * num_devices / len(dataset), + num_devices)) + total_infer_time = get_time_str(inference_timer.total_time) + logger.info( + 'Model inference time: {} ({} s / img per device, on {} devices)'. + format( + total_infer_time, + inference_timer.total_time * num_devices / len(dataset), + num_devices, + )) + + predictions = _accumulate_predictions_from_multiple_gpus( + predictions, multi_gpu_infer) + if not is_master(): + return + + if output_folder: + torch.save(predictions, os.path.join(output_folder, 'predictions.pth')) + + extra_args = dict( + box_only=box_only, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + ) + + return evaluate( + dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **extra_args) diff --git a/modelscope/models/cv/tinynas_detection/damo/augmentations/__init__.py b/modelscope/models/cv/tinynas_detection/damo/augmentations/__init__.py new file mode 100644 index 00000000..14e2370d --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/augmentations/__init__.py @@ -0,0 +1 @@ +# Copyright © Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/__init__.py b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/__init__.py new file mode 100644 index 00000000..14e2370d --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/__init__.py @@ -0,0 +1 @@ +# Copyright © Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/box_level_augs.py b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/box_level_augs.py new file mode 100644 index 00000000..f7eb2582 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/box_level_augs.py @@ -0,0 +1,104 @@ +# This file mainly comes from +# https://github.com/dvlab-research/SA-AutoAug/blob/master/FCOS/fcos_core/augmentations/box_level_augs/box_level_augs.py +# Copyright © Alibaba, Inc. and its affiliates. 
+ +import random + +import numpy as np + +from .color_augs import color_aug_func +from .geometric_augs import geometric_aug_func + + +def _box_sample_prob(bbox, scale_ratios_splits, box_prob=0.3): + scale_ratios, scale_splits = scale_ratios_splits + + ratios = np.array(scale_ratios) + ratios = ratios / ratios.sum() + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area == 0: + return 0 + if area < scale_splits[0]: + scale_ratio = ratios[0] + elif area < scale_splits[1]: + scale_ratio = ratios[1] + else: + scale_ratio = ratios[2] + return box_prob * scale_ratio + + +def _box_aug_per_img(img, + target, + aug_type=None, + scale_ratios=None, + scale_splits=None, + img_prob=0.1, + box_prob=0.3, + level=1): + if random.random() > img_prob: + return img, target + img /= 255.0 + + tag = 'prob' if aug_type in geometric_aug_func else 'area' + scale_ratios_splits = [scale_ratios[tag], scale_splits] + if scale_ratios is None: + box_sample_prob = [box_prob] * len(target.bbox) + else: + box_sample_prob = [ + _box_sample_prob(bbox, scale_ratios_splits, box_prob=box_prob) + for bbox in target.bbox + ] + + if aug_type in color_aug_func: + img_aug = color_aug_func[aug_type]( + img, level, target, [scale_ratios['area'], scale_splits], + box_sample_prob) + elif aug_type in geometric_aug_func: + img_aug, target = geometric_aug_func[aug_type](img, level, target, + box_sample_prob) + else: + raise ValueError('Unknown box-level augmentation function %s.' % + (aug_type)) + out = img_aug * 255.0 + + return out, target + + +class Box_augs(object): + + def __init__(self, box_augs_dict, max_iters, scale_splits, box_prob=0.3): + self.max_iters = max_iters + self.box_prob = box_prob + self.scale_splits = scale_splits + self.policies = box_augs_dict['policies'] + self.scale_ratios = box_augs_dict['scale_ratios'] + + def __call__(self, tensor, target, iteration): + iter_ratio = float(iteration) / self.max_iters + sub_policy = random.choice(self.policies) + + h, w = tensor.shape[-2:] + ratio = min(h, w) / 800 + + scale_splits = [area * ratio for area in self.scale_splits] + if iter_ratio <= 1: + tensor, _ = _box_aug_per_img( + tensor, + target, + aug_type=sub_policy[0][0], + scale_ratios=self.scale_ratios, + scale_splits=scale_splits, + img_prob=sub_policy[0][1] * iter_ratio, + box_prob=self.box_prob, + level=sub_policy[0][2]) + tensor, target = _box_aug_per_img( + tensor, + target, + aug_type=sub_policy[1][0], + scale_ratios=self.scale_ratios, + scale_splits=scale_splits, + img_prob=sub_policy[1][1] * iter_ratio, + box_prob=self.box_prob, + level=sub_policy[1][2]) + + return tensor, target diff --git a/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/color_augs.py b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/color_augs.py new file mode 100644 index 00000000..41c46e86 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/color_augs.py @@ -0,0 +1,234 @@ +# This file mainly comes from +# https://github.com/dvlab-research/SA-AutoAug/blob/master/FCOS/fcos_core/augmentations/box_level_augs/color_augs.py +# Copyright © Alibaba, Inc. and its affiliates. + +import random + +import torch +import torch.nn.functional as F + +from .gaussian_maps import _merge_gaussian + +_MAX_LEVEL = 10.0 + + +def blend(image1, image2, factor): + """Blend image1 and image2 using 'factor'. + Factor can be above 0.0. A value of 0.0 means only image1 is used. + A value of 1.0 means only image2 is used. 
A value between 0.0 and + 1.0 means we linearly interpolate the pixel values between the two + images. A value greater than 1.0 "extrapolates" the difference + between the two pixel values, and we clip the results to values + between 0 and 1.0. + """ + + if factor == 0.0: + return image1 + if factor == 1.0: + return image2 + + difference = image2 - image1 + scaled = factor * difference + + # Do addition in float. + temp = image1 + scaled + + # Interpolate + if factor > 0.0 and factor < 1.0: + # Interpolation means we always stay within 0 and 255. + return temp + + # Extrapolate: + # + # We need to clip and then cast. + return torch.clamp(temp, 0.0, 1.0) + + +def solarize(image, threshold=0.5): + # For each pixel in the image, select the pixel + # if the value is less than the threshold. + # Otherwise, subtract 255 from the pixel. + return torch.where(image <= threshold, image, 1.0 - image) + + +def solarize_add(image, addition=0, threshold=0.5): + # For each pixel in the image less than threshold + # we add 'addition' amount to it and then clip the + # pixel value to be between 0 and 255. The value + # of 'addition' is between -128 and 128. + added_image = image + addition + added_image = torch.clamp(added_image, 0.0, 1.0) + return torch.where(image <= threshold, added_image, image) + + +def rgb2gray(rgb): + gray = rgb[0] * 0.2989 + rgb[1] * 0.5870 + rgb[2] * 0.1140 + gray = gray.unsqueeze(0).repeat((3, 1, 1)) + return gray + + +def color(img, factor): + """Equivalent of PIL Color.""" + if img.shape[0] == 0 or img.shape[1] == 0: + return img + + degenerate = rgb2gray(img) + return blend(degenerate, img, factor) + + +def contrast(img, factor): + dtype = img.dtype if torch.is_floating_point(img) else torch.float32 + mean = torch.mean(rgb2gray(img).to(dtype), dim=(-3, -2, -1), keepdim=True) + return blend(mean, img, max(factor, 1e-6)) + + +def brightness(image, factor): + """Equivalent of PIL Brightness.""" + degenerate = torch.zeros(image.shape) + return blend(degenerate, image, factor) + + +def sharpness(image, factor): + """Implements Sharpness function from PIL using TF ops.""" + if image.shape[0] == 0 or image.shape[1] == 0: + return image + channels = image.shape[0] + kernel = torch.Tensor([[1, 1, 1], [1, 5, 1], [1, 1, 1]]).reshape( + 1, 1, 3, 3) / 13.0 + kernel = kernel.repeat((3, 1, 1, 1)) + image_newaxis = image.unsqueeze(0) + image_pad = F.pad(image_newaxis, (1, 1, 1, 1), mode='reflect') + degenerate = F.conv2d(image_pad, weight=kernel, groups=channels).squeeze(0) + return blend(degenerate, image, factor) + + +def equalize(image): + """Implements Equalize function from PIL using PyTorch ops based on: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/ + autoaugment.py#L352""" + image = image * 255 + + def scale_channel(im, c): + """Scale the data in the channel to implement equalize.""" + im = im[c, :, :] + # Compute the histogram of the image channel. + histo = torch.histc(im, bins=256, min=0, max=255) # .type(torch.int32) + # For the purposes of computing the step, filter out the nonzeros. + nonzero_histo = torch.reshape(histo[histo != 0], [-1]) + step = (torch.sum(nonzero_histo) - nonzero_histo[-1]) // 255 + + def build_lut(histo, step): + # Compute the cumulative sum, shifting by step // 2 + # and then normalization by step. + lut = (torch.cumsum(histo, 0) + (step // 2)) // step + # Shift lut, prepending with 0. + lut = torch.cat([torch.zeros(1), lut[:-1]]) + # Clip the counts to be in range. This is done + # in the C code for image.point. 
+ return torch.clamp(lut, 0, 255) + + # If step is zero, return the original image. Otherwise, build + # lut from the full histogram and step and then index from it. + if step == 0: + result = im + else: + # can't index using 2d index. Have to flatten and then reshape + result = torch.gather( + build_lut(histo, step), 0, + im.flatten().long()) + result = result.reshape_as(im) + + return result # .type(torch.uint8) + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image, 0) + s2 = scale_channel(image, 1) + s3 = scale_channel(image, 2) + image = torch.stack([s1, s2, s3], 0) / 255.0 + return image + + +def autocontrast(image): + + def scale_channel(image): + """Scale the 2D image using the autocontrast rule.""" + lo = torch.min(image) + hi = torch.max(image) + + # Scale the image, making the lowest value 0 and the highest value 1. + def scale_values(im): + scale = 1.0 / (hi - lo) + offset = -lo * scale + im = im * scale + offset + im = torch.clamp(im, 0.0, 1.0) + return im + + if hi > lo: + result = scale_values(image) + else: + result = image + + return result + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image[0, :, :]) + s2 = scale_channel(image[1, :, :]) + s3 = scale_channel(image[2, :, :]) + image = torch.stack([s1, s2, s3], 0) + return image + + +def posterize(image, bits): + """Equivalent of PIL Posterize.""" + image *= 255 + image = image.long() + shift = bits # 8 - bits + image_rightshift = image >> shift + image_leftshift = image_rightshift << shift + image_leftshift = image_leftshift.float() / 255.0 + return image_leftshift + + +def _color_aug_func(img, img_aug, target, scale_ratios_splits, + box_sample_probs): + scale_ratios, scale_splits = scale_ratios_splits + boxes = [ + bbox for i, bbox in enumerate(target.bbox) + if random.random() < box_sample_probs[i] + ] + img_aug = _merge_gaussian(img, img_aug, boxes, scale_ratios, scale_splits) + return img_aug + + +color_aug_func = { + 'AutoContrast': + lambda x, level, target, + scale_ratios_splits, box_sample_probs: _color_aug_func( + x, autocontrast(x), target, scale_ratios_splits, box_sample_probs), + 'Equalize': + lambda x, leve, target, + scale_ratios_splits, box_sample_probs: _color_aug_func( + x, equalize(x), target, scale_ratios_splits, box_sample_probs), + 'SolarizeAdd': + lambda x, level, target, scale_ratios_splits, box_sample_probs: + _color_aug_func(x, solarize_add(x, level / _MAX_LEVEL * 0.4296875), target, + scale_ratios_splits, box_sample_probs), + 'Color': + lambda x, level, target, scale_ratios_splits, box_sample_probs: + _color_aug_func(x, color(x, level / _MAX_LEVEL * 1.8 + 0.1), target, + scale_ratios_splits, box_sample_probs), + 'Contrast': + lambda x, level, target, scale_ratios_splits, box_sample_probs: + _color_aug_func(x, contrast(x, level / _MAX_LEVEL * 1.8 + 0.1), target, + scale_ratios_splits, box_sample_probs), + 'Brightness': + lambda x, level, target, scale_ratios_splits, box_sample_probs: + _color_aug_func(x, brightness(x, level / _MAX_LEVEL * 1.8 + 0.1), target, + scale_ratios_splits, box_sample_probs), + 'Sharpness': + lambda x, level, target, scale_ratios_splits, box_sample_probs: + _color_aug_func(x, sharpness(x, level / _MAX_LEVEL * 1.8 + 0.1), target, + scale_ratios_splits, box_sample_probs), +} diff --git a/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/gaussian_maps.py 
b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/gaussian_maps.py new file mode 100644 index 00000000..08acaa21 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/gaussian_maps.py @@ -0,0 +1,59 @@ +# This file mainly comes from +# https://github.com/dvlab-research/SA-AutoAug/blob/master/FCOS/fcos_core/augmentations/box_level_augs/gaussian_maps.py +# Copyright © Alibaba, Inc. and its affiliates. + +import math + +import torch + + +def _gaussian_map(img, boxes, scale_splits=None, scale_ratios=None): + g_maps = torch.zeros(*img.shape[1:]).to(img.device) + height, width = img.shape[1], img.shape[2] + + x_range = torch.arange(0, height, 1).to(img.device) + y_range = torch.arange(0, width, 1).to(img.device) + xx, yy = torch.meshgrid(x_range, y_range) + pos = torch.empty(xx.shape + (2, )).to(img.device) + pos[:, :, 0] = xx + pos[:, :, 1] = yy + + for j, box in enumerate(boxes): + y1, x1, y2, x2 = box + x, y, h, w = x1, y1, x2 - x1, y2 - y1 + mean_torch = torch.tensor([x + h // 2, y + w // 2]).to(img.device) + if scale_ratios is None: + scale_ratio = 1.0 + else: + ratio_list = [0.2, 0.4, 0.6, 0.8, 1.0, 2, 4, 6, 8, 10] + if h * w < scale_splits[0]: + scale_ratio = ratio_list[scale_ratios[0]] * scale_splits[0] / ( + h * w) + elif h * w < scale_splits[1]: + scale_ratio = ratio_list[scale_ratios[1]] * ( + scale_splits[0] + scale_splits[1]) / 2.0 / ( + h * w) + elif h * w < scale_splits[2]: + scale_ratio = ratio_list[scale_ratios[2]] * scale_splits[2] / ( + h * w) + else: + scale_ratio = ratio_list[scale_ratios[2]] + + r_var = (scale_ratio * height * width / (2 * math.pi))**0.5 + var_x = torch.tensor([(h / height) * r_var], + dtype=torch.float32).to(img.device) + var_y = torch.tensor([(w / width) * r_var], + dtype=torch.float32).to(img.device) + g_map = torch.exp(-( + ((xx.float() - mean_torch[0])**2 / (2.0 * var_x**2) + + (yy.float() - mean_torch[1])**2 / (2.0 * var_y**2)))).to( + img.device) + g_maps += g_map + return g_maps + + +def _merge_gaussian(img, img_aug, boxes, scale_ratios, scale_splits): + g_maps = _gaussian_map(img, boxes, scale_splits, scale_ratios) + g_maps = g_maps.clamp(min=0, max=1.0) + out = img * (1 - g_maps) + img_aug * g_maps + return out diff --git a/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/geometric_augs.py b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/geometric_augs.py new file mode 100644 index 00000000..b7984a8b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/augmentations/box_level_augs/geometric_augs.py @@ -0,0 +1,157 @@ +# This file mainly comes from +# https://github.com/dvlab-research/SA-AutoAug/blob/master/FCOS/fcos_core/augmentations/box_level_augs/geometric_augs.py +# Copyright © Alibaba, Inc. and its affiliates. 
+ +import copy +import random + +import torch +import torchvision.transforms as transforms + +from .gaussian_maps import _gaussian_map + +_MAX_LEVEL = 10.0 +pixel_mean = [102.9801, 115.9465, 122.7717] + + +def scale_area(box, height, width, scale_ratio=1.0): + y1, x1, y2, x2 = box + h, w = x2 - x1, y2 - y1 + h_new, w_new = h * scale_ratio, w * scale_ratio + x1, y1 = max(x1 + h / 2 - h_new / 2, 0), max(y1 + w / 2 - w_new / 2, 0) + x2, y2 = min(x1 + h_new, height), min(y1 + w_new, width) + box_new = torch.Tensor([y1, x1, y2, x2]) + return box_new + + +def _geometric_aug_func(x, + target, + angle=0, + translate=(0, 0), + scale=1, + shear=(0, 0), + hflip=False, + boxes_sample_prob=[], + scale_ratio=1.0): + boxes_and_labels = [(target.bbox[i], target.extra_fields['labels'][i]) + for i in range(len(target.bbox)) + if random.random() < boxes_sample_prob[i]] + boxes = [b_and_l[0] for b_and_l in boxes_and_labels] + labels = [b_and_l[1] for b_and_l in boxes_and_labels] + + if random.random() < 0.5: + angle *= -1 + translate = (-translate[0], -translate[1]) + shear = (-shear[0], -shear[1]) + + height, width = x.shape[1], x.shape[2] + + x_crops = [] + boxes_crops = [] + boxes_new = [] + labels_new = [] + for i, box in enumerate(boxes): + box_crop = scale_area(box, height, width, scale_ratio) + y1, x1, y2, x2 = box_crop.long() + + x_crop = x[:, x1:x2, y1:y2] + boxes_crops.append(box_crop) + + if x1 >= x2 or y1 >= y2: + x_crops.append(x_crop) + continue + + if hflip: + x_crop = x_crop.flip(-1) + elif translate[0] + translate[1] != 0: + offset_y = (y2 + translate[0]).clamp(0, width).long().tolist() - y2 + offset_x = (x2 + translate[1]).clamp(0, + height).long().tolist() - x2 + if offset_x != 0 or offset_y != 0: + offset = [offset_y, offset_x] + boxes_new.append(box + torch.Tensor(offset * 2)) + labels_new.append(labels[i]) + else: + x_crop = transforms.functional.to_pil_image(x_crop.cpu()) + x_crop = transforms.functional.affine( + x_crop, + angle, + translate, + scale, + shear, + resample=2, + fillcolor=tuple([int(i) for i in pixel_mean])) + x_crop = transforms.functional.to_tensor(x_crop).to(x.device) + x_crops.append(x_crop) + y = _transform(x, x_crops, boxes_crops, translate) + + if translate[0] + translate[1] != 0 and len(boxes_new) > 0: + target.bbox = torch.cat((target.bbox, torch.stack(boxes_new))) + target.extra_fields['labels'] = torch.cat( + (target.extra_fields['labels'], torch.Tensor(labels_new).long())) + + return y, target + + +def _transform(x, x_crops, boxes_crops, translate=(0, 0)): + y = copy.deepcopy(x) + height, width = x.shape[1], x.shape[2] + + for i, box in enumerate(boxes_crops): + y1_c, x1_c, y2_c, x2_c = boxes_crops[i].long() + + y1_c = (y1_c + translate[0]).clamp(0, width).long().tolist() + x1_c = (x1_c + translate[1]).clamp(0, height).long().tolist() + y2_c = (y2_c + translate[0]).clamp(0, width).long().tolist() + x2_c = (x2_c + translate[1]).clamp(0, height).long().tolist() + + y_crop = copy.deepcopy(y[:, x1_c:x2_c, y1_c:y2_c]) + x_crop = x_crops[i][:, :y_crop.shape[1], :y_crop.shape[2]] + + if y_crop.shape[1] * y_crop.shape[2] == 0: + continue + + g_maps = _gaussian_map(x_crop, + [[0, 0, y_crop.shape[2], y_crop.shape[1]]]) + _, _h, _w = y[:, x1_c:x2_c, y1_c:y2_c].shape + y[:, x1_c:x1_c + x_crop.shape[1], + y1_c:y1_c + x_crop.shape[2]] = g_maps * x_crop + ( + 1 - g_maps) * y_crop[:, :x_crop.shape[1], :x_crop.shape[2]] + return y + + +geometric_aug_func = { + 'hflip': + lambda x, level, target, boxes_sample_probs: _geometric_aug_func( + x, target, hflip=True, 
boxes_sample_prob=boxes_sample_probs), + 'rotate': + lambda x, level, target, boxes_sample_probs: _geometric_aug_func( + x, + target, + level / _MAX_LEVEL * 30, + boxes_sample_prob=boxes_sample_probs), + 'shearX': + lambda x, level, target, boxes_sample_probs: _geometric_aug_func( + x, + target, + shear=(level / _MAX_LEVEL * 15, 0), + boxes_sample_prob=boxes_sample_probs), + 'shearY': + lambda x, level, target, boxes_sample_probs: _geometric_aug_func( + x, + target, + shear=(0, level / _MAX_LEVEL * 15), + boxes_sample_prob=boxes_sample_probs), + 'translateX': + lambda x, level, target, boxes_sample_probs: _geometric_aug_func( + x, + target, + translate=(level / _MAX_LEVEL * 120.0, 0), + boxes_sample_prob=boxes_sample_probs), + 'translateY': + lambda x, level, target, boxes_sample_probs: _geometric_aug_func( + x, + target, + translate=(0, level / _MAX_LEVEL * 120.0), + boxes_sample_prob=boxes_sample_probs) +} diff --git a/modelscope/models/cv/tinynas_detection/damo/augmentations/scale_aware_aug.py b/modelscope/models/cv/tinynas_detection/damo/augmentations/scale_aware_aug.py new file mode 100644 index 00000000..ec105526 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/augmentations/scale_aware_aug.py @@ -0,0 +1,77 @@ +# This file mainly comes from +# https://github.com/dvlab-research/SA-AutoAug/blob/master/FCOS/fcos_core/augmentations/scale_aware_aug.py +# Copyright © Alibaba, Inc. and its affiliates. + +import copy + +from .box_level_augs.box_level_augs import Box_augs +from .box_level_augs.color_augs import color_aug_func +from .box_level_augs.geometric_augs import geometric_aug_func + + +class SA_Aug(object): + + def __init__(self, iters_per_epoch, start_epoch, total_epochs, + no_aug_epochs, batch_size, num_gpus, num_workers, sada_cfg): + + autoaug_list = sada_cfg.autoaug_params + num_policies = sada_cfg.num_subpolicies + scale_splits = sada_cfg.scale_splits + box_prob = sada_cfg.box_prob + + self.batch_size = batch_size / num_gpus + self.num_workers = num_workers + self.max_iters = (total_epochs - no_aug_epochs) * iters_per_epoch + self.count = start_epoch * iters_per_epoch + if self.num_workers == 0: + self.num_workers += 1 + + box_aug_list = autoaug_list[4:] + color_aug_types = list(color_aug_func.keys()) + geometric_aug_types = list(geometric_aug_func.keys()) + policies = [] + for i in range(num_policies): + _start_pos = i * 6 + sub_policy = [ + ( + color_aug_types[box_aug_list[_start_pos + 0] + % len(color_aug_types)], + box_aug_list[_start_pos + 1] * 0.1, + box_aug_list[_start_pos + 2], + ), # box_color policy + (geometric_aug_types[box_aug_list[_start_pos + 3] + % len(geometric_aug_types)], + box_aug_list[_start_pos + 4] * 0.1, + box_aug_list[_start_pos + 5]) + ] # box_geometric policy + policies.append(sub_policy) + + _start_pos = num_policies * 6 + scale_ratios = { + 'area': [ + box_aug_list[_start_pos + 0], box_aug_list[_start_pos + 1], + box_aug_list[_start_pos + 2] + ], + 'prob': [ + box_aug_list[_start_pos + 3], box_aug_list[_start_pos + 4], + box_aug_list[_start_pos + 5] + ] + } + + box_augs_dict = {'policies': policies, 'scale_ratios': scale_ratios} + + self.box_augs = Box_augs( + box_augs_dict=box_augs_dict, + max_iters=self.max_iters, + scale_splits=scale_splits, + box_prob=box_prob) + + def __call__(self, tensor, target): + iteration = self.count // self.batch_size * self.num_workers + tensor = copy.deepcopy(tensor) + target = copy.deepcopy(target) + tensor, target = self.box_augs(tensor, target, iteration=iteration) + + self.count += 1 + + return tensor, 
target diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/__init__.py b/modelscope/models/cv/tinynas_detection/damo/base_models/__init__.py new file mode 100644 index 00000000..04654d8a --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .backbones import build_backbone +from .heads import build_head +from .necks import build_neck diff --git a/modelscope/models/cv/tinynas_detection/backbone/__init__.py b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/__init__.py similarity index 74% rename from modelscope/models/cv/tinynas_detection/backbone/__init__.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/backbones/__init__.py index 22a7654f..cf6d0705 100644 --- a/modelscope/models/cv/tinynas_detection/backbone/__init__.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/__init__.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. import copy @@ -11,9 +10,11 @@ from .tinynas_res import load_tinynas_net as load_tinynas_net_res def build_backbone(cfg): backbone_cfg = copy.deepcopy(cfg) name = backbone_cfg.pop('name') - if name == 'CSPDarknet': - return CSPDarknet(**backbone_cfg) + if name == 'TinyNAS_res': + return load_tinynas_net_res(backbone_cfg) elif name == 'TinyNAS_csp': return load_tinynas_net_csp(backbone_cfg) - elif name == 'TinyNAS_res': - return load_tinynas_net_res(backbone_cfg) + elif name == 'CSPDarknet': + return CSPDarknet(**backbone_cfg) + else: + print(f'{name} is not supported yet!') diff --git a/modelscope/models/cv/tinynas_detection/backbone/darknet.py b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/darknet.py similarity index 97% rename from modelscope/models/cv/tinynas_detection/backbone/darknet.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/backbones/darknet.py index d8f80e76..27936ff8 100644 --- a/modelscope/models/cv/tinynas_detection/backbone/darknet.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/darknet.py @@ -4,7 +4,7 @@ import torch from torch import nn -from modelscope.models.cv.tinynas_detection.core.base_ops import ( +from modelscope.models.cv.tinynas_detection.damo.base_models.core.base_ops import ( BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck) diff --git a/modelscope/models/cv/tinynas_detection/backbone/tinynas_csp.py b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/tinynas_csp.py similarity index 96% rename from modelscope/models/cv/tinynas_detection/backbone/tinynas_csp.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/backbones/tinynas_csp.py index 903b6900..4b1bb510 100644 --- a/modelscope/models/cv/tinynas_detection/backbone/tinynas_csp.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/tinynas_csp.py @@ -1,13 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors, and available -# at https://github.com/tinyvision/damo-yolo. 
import torch import torch.nn as nn -from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv, - SPPBottleneck, - get_activation) +from modelscope.models.cv.tinynas_detection.damo.base_models.core.ops import ( + Focus, RepConv, SPPBottleneck, get_activation) from modelscope.utils.file_utils import read_file @@ -27,6 +24,9 @@ class ConvKXBN(nn.Module): def forward(self, x): return self.bn1(self.conv1(x)) + def fuseforward(self, x): + return self.conv1(x) + class ConvKXBNRELU(nn.Module): diff --git a/modelscope/models/cv/tinynas_detection/backbone/tinynas_res.py b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/tinynas_res.py similarity index 95% rename from modelscope/models/cv/tinynas_detection/backbone/tinynas_res.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/backbones/tinynas_res.py index 3fb9e573..5ca844a8 100644 --- a/modelscope/models/cv/tinynas_detection/backbone/tinynas_res.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/backbones/tinynas_res.py @@ -1,13 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors, and available -# at https://github.com/tinyvision/damo-yolo. import torch import torch.nn as nn -from modelscope.models.cv.tinynas_detection.core.ops import (Focus, RepConv, - SPPBottleneck, - get_activation) +from modelscope.models.cv.tinynas_detection.damo.base_models.core.ops import ( + Focus, RepConv, SPPBottleneck, get_activation) from modelscope.utils.file_utils import read_file @@ -27,6 +24,9 @@ class ConvKXBN(nn.Module): def forward(self, x): return self.bn1(self.conv1(x)) + def fuseforward(self, x): + return self.conv1(x) + class ConvKXBNRELU(nn.Module): diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/core/__init__.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/core/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/core/base_ops.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/base_ops.py similarity index 100% rename from modelscope/models/cv/tinynas_detection/core/base_ops.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/core/base_ops.py diff --git a/modelscope/models/cv/tinynas_detection/core/neck_ops.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/neck_ops.py similarity index 100% rename from modelscope/models/cv/tinynas_detection/core/neck_ops.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/core/neck_ops.py diff --git a/modelscope/models/cv/tinynas_detection/core/ops.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/ops.py similarity index 99% rename from modelscope/models/cv/tinynas_detection/core/ops.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/core/ops.py index 07a96c13..6ca86a90 100644 --- a/modelscope/models/cv/tinynas_detection/core/ops.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/core/ops.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. 
import numpy as np import torch diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/core/ota_assigner.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/ota_assigner.py new file mode 100755 index 00000000..90bd4c22 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/core/ota_assigner.py @@ -0,0 +1,435 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. + +import warnings + +import torch +import torch.nn.functional as F + +from modelscope.models.cv.tinynas_detection.damo.utils.boxes import \ + bbox_overlaps + + +class BaseAssigner(object): + """Base assigner that assigns boxes to ground truth boxes.""" + + def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + """Assign boxes to either a ground truth box or a negative box.""" + + +class AssignResult(object): + """Stores assignments between predicted and truth boxes. + Attributes: + num_gts (int): the number of truth boxes considered when computing this + assignment + gt_inds (LongTensor): for each predicted box indicates the 1-based + index of the assigned truth box. 0 means unassigned and -1 means + ignore. + max_overlaps (FloatTensor): the iou between the predicted box and its + assigned truth box. + labels (None | LongTensor): If specified, for each predicted box + indicates the category label of the assigned truth box. + Example: + >>> # An assign result between 4 predicted boxes and 9 true boxes + >>> # where only two boxes were assigned. + >>> num_gts = 9 + >>> max_overlaps = torch.LongTensor([0, .5, .9, 0]) + >>> gt_inds = torch.LongTensor([-1, 1, 2, 0]) + >>> labels = torch.LongTensor([0, 3, 4, 0]) + >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + >>> # Force addition of gt labels (when adding gt as proposals) + >>> new_labels = torch.LongTensor([3, 4, 5]) + >>> self.add_gt_(new_labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + """ + + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + # Interface for possible user-defined properties + self._extra_properties = {} + + @property + def num_preds(self): + """int: the number of predictions in this assignment""" + return len(self.gt_inds) + + def set_extra_property(self, key, value): + """Set user-defined new property.""" + assert key not in self.info + self._extra_properties[key] = value + + def get_extra_property(self, key): + """Get user-defined property.""" + return self._extra_properties.get(key, None) + + @property + def info(self): + """dict: a dictionary of info about the object""" + basic_info = { + 'num_gts': self.num_gts, + 'num_preds': self.num_preds, + 'gt_inds': self.gt_inds, + 'max_overlaps': self.max_overlaps, + 'labels': self.labels, + } + basic_info.update(self._extra_properties) + return basic_info + + @classmethod + def random(cls, **kwargs): + """Create random AssignResult for tests or debugging. + Args: + num_preds: number of predicted boxes + num_gts: number of true boxes + p_ignore (float): probability of a predicted box assigned to an + ignored truth + p_assigned (float): probability of a predicted box not being + assigned + p_use_label (float | bool): with labels or not + rng (None | int | numpy.random.RandomState): seed or state + Returns: + :obj:`AssignResult`: Randomly generated assign results.
+ Example: + >>> from mmdet.core.bbox.assigners.assign_result import * # NOQA + >>> self = AssignResult.random() + >>> print(self.info) + """ + from mmdet.core.bbox import demodata + rng = demodata.ensure_rng(kwargs.get('rng', None)) + + num_gts = kwargs.get('num_gts', None) + num_preds = kwargs.get('num_preds', None) + p_ignore = kwargs.get('p_ignore', 0.3) + p_assigned = kwargs.get('p_assigned', 0.7) + p_use_label = kwargs.get('p_use_label', 0.5) + num_classes = kwargs.get('p_use_label', 3) + + if num_gts is None: + num_gts = rng.randint(0, 8) + if num_preds is None: + num_preds = rng.randint(0, 16) + + if num_gts == 0: + max_overlaps = torch.zeros(num_preds, dtype=torch.float32) + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + if p_use_label is True or p_use_label < rng.rand(): + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = None + else: + import numpy as np + # Create an overlap for each predicted box + max_overlaps = torch.from_numpy(rng.rand(num_preds)) + + # Construct gt_inds for each predicted box + is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned) + # maximum number of assignments constraints + n_assigned = min(num_preds, min(num_gts, is_assigned.sum())) + + assigned_idxs = np.where(is_assigned)[0] + rng.shuffle(assigned_idxs) + assigned_idxs = assigned_idxs[0:n_assigned] + assigned_idxs.sort() + + is_assigned[:] = 0 + is_assigned[assigned_idxs] = True + + is_ignore = torch.from_numpy( + rng.rand(num_preds) < p_ignore) & is_assigned + + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + + true_idxs = np.arange(num_gts) + rng.shuffle(true_idxs) + true_idxs = torch.from_numpy(true_idxs) + gt_inds[is_assigned] = true_idxs[:n_assigned] + + gt_inds = torch.from_numpy( + rng.randint(1, num_gts + 1, size=num_preds)) + gt_inds[is_ignore] = -1 + gt_inds[~is_assigned] = 0 + max_overlaps[~is_assigned] = 0 + + if p_use_label is True or p_use_label < rng.rand(): + if num_classes == 0: + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = torch.from_numpy( + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + rng.randint(0, num_classes, size=num_preds)) + labels[~is_assigned] = 0 + else: + labels = None + + self = cls(num_gts, gt_inds, max_overlaps, labels) + return self + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. + Args: + gt_labels (torch.Tensor): Labels of gt boxes + """ + self_inds = torch.arange( + 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) + self.gt_inds = torch.cat([self_inds, self.gt_inds]) + + self.max_overlaps = torch.cat( + [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps]) + + if self.labels is not None: + self.labels = torch.cat([gt_labels, self.labels]) + + +class AlignOTAAssigner(BaseAssigner): + """Computes matching between predictions and ground truth. + Args: + center_radius (int | float, optional): Ground truth center size + to judge whether a prior is in center. Default 2.5. + candidate_topk (int, optional): The candidate top-k which used to + get top-k ious to calculate dynamic-k. Default 10. + iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 3.0. + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. 
+ """ + + def __init__(self, + center_radius=2.5, + candidate_topk=10, + iou_weight=3.0, + cls_weight=1.0): + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + + def assign(self, + pred_scores, + priors, + decoded_bboxes, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + eps=1e-7): + """Assign gt to priors using SimOTA. It will switch to CPU mode when + GPU is out of memory. + Args: + pred_scores (Tensor): Classification scores of one image, + a 2D-Tensor with shape [num_priors, num_classes] + priors (Tensor): All priors of one image, a 2D-Tensor with shape + [num_priors, 4] in [cx, xy, stride_w, stride_y] format. + decoded_bboxes (Tensor): Predicted bboxes, a 2D-Tensor with shape + [num_priors, 4] in [tl_x, tl_y, br_x, br_y] format. + gt_bboxes (Tensor): Ground truth bboxes of one image, a 2D-Tensor + with shape [num_gts, 4] in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth labels of one image, a Tensor + with shape [num_gts]. + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + eps (float): A value added to the denominator for numerical + stability. Default 1e-7. + Returns: + assign_result (obj:`AssignResult`): The assigned result. + """ + try: + assign_result = self._assign(pred_scores, priors, decoded_bboxes, + gt_bboxes, gt_labels, + gt_bboxes_ignore, eps) + return assign_result + except RuntimeError: + origin_device = pred_scores.device + warnings.warn('OOM RuntimeError is raised due to the huge memory ' + 'cost during label assignment. CPU mode is applied ' + 'in this batch. If you want to avoid this issue, ' + 'try to reduce the batch size or image size.') + torch.cuda.empty_cache() + + pred_scores = pred_scores.cpu() + priors = priors.cpu() + decoded_bboxes = decoded_bboxes.cpu() + gt_bboxes = gt_bboxes.cpu().float() + gt_labels = gt_labels.cpu() + + assign_result = self._assign(pred_scores, priors, decoded_bboxes, + gt_bboxes, gt_labels, + gt_bboxes_ignore, eps) + assign_result.gt_inds = assign_result.gt_inds.to(origin_device) + assign_result.max_overlaps = assign_result.max_overlaps.to( + origin_device) + assign_result.labels = assign_result.labels.to(origin_device) + + return assign_result + + def _assign(self, + pred_scores, + priors, + decoded_bboxes, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + eps=1e-7): + """Assign gt to priors using SimOTA. + Args: + pred_scores (Tensor): Classification scores of one image, + a 2D-Tensor with shape [num_priors, num_classes] + priors (Tensor): All priors of one image, a 2D-Tensor with shape + [num_priors, 4] in [cx, xy, stride_w, stride_y] format. + decoded_bboxes (Tensor): Predicted bboxes, a 2D-Tensor with shape + [num_priors, 4] in [tl_x, tl_y, br_x, br_y] format. + gt_bboxes (Tensor): Ground truth bboxes of one image, a 2D-Tensor + with shape [num_gts, 4] in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth labels of one image, a Tensor + with shape [num_gts]. + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`, e.g., crowd boxes in COCO. + eps (float): A value added to the denominator for numerical + stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. 
+ """ + INF = 100000000 + num_gt = gt_bboxes.size(0) + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + priors, gt_bboxes) + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + num_valid = valid_decoded_bbox.size(0) + + if num_gt == 0 or num_bboxes == 0 or num_valid == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + if num_gt == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + if gt_labels is None: + assigned_labels = None + else: + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + pairwise_ious = bbox_overlaps(valid_decoded_bbox, gt_bboxes) + iou_cost = -torch.log(pairwise_ious + eps) + iou_cost = iou_cost * self.iou_weight + + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1) + + soft_label = gt_onehot_label * pairwise_ious[..., None] + scale_factor = soft_label - valid_pred_scores + + cls_cost = F.binary_cross_entropy( + valid_pred_scores, soft_label, + reduction='none') * scale_factor.abs().pow(2.0) + + cls_cost = cls_cost.sum(dim=-1) + cls_cost = cls_cost * self.cls_weight + cost_matrix = (cls_cost + iou_cost + (~is_in_boxes_and_center) * INF) + matched_pred_ious, matched_gt_inds = self.dynamic_k_matching( + cost_matrix, pairwise_ious, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_ious + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + def get_in_gt_and_in_center_info(self, priors, gt_bboxes): + num_gt = gt_bboxes.size(0) + + repeated_x = priors[:, 0].unsqueeze(1).repeat(1, num_gt) + repeated_y = priors[:, 1].unsqueeze(1).repeat(1, num_gt) + repeated_stride_x = priors[:, 2].unsqueeze(1).repeat(1, num_gt) + repeated_stride_y = priors[:, 3].unsqueeze(1).repeat(1, num_gt) + + # is prior centers in gt bboxes, shape: [n_prior, n_gt] + l_ = repeated_x - gt_bboxes[:, 0] + t_ = repeated_y - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - repeated_x + b_ = gt_bboxes[:, 3] - repeated_y + + deltas = torch.stack([l_, t_, r_, b_], dim=1) + is_in_gts = deltas.min(dim=1).values > 0 + is_in_gts_all = is_in_gts.sum(dim=1) > 0 + + # is prior centers in gt centers + gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 + gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 + ct_box_l = gt_cxs - self.center_radius * repeated_stride_x + ct_box_t = gt_cys - self.center_radius * repeated_stride_y + ct_box_r = gt_cxs + self.center_radius * repeated_stride_x + ct_box_b = gt_cys + self.center_radius * repeated_stride_y + + cl_ = repeated_x - ct_box_l + ct_ = repeated_y - ct_box_t + cr_ = ct_box_r - repeated_x + cb_ = ct_box_b - repeated_y + + ct_deltas = torch.stack([cl_, ct_, cr_, cb_], dim=1) + is_in_cts = ct_deltas.min(dim=1).values > 0 + is_in_cts_all = 
is_in_cts.sum(dim=1) > 0 + + # in boxes or in centers, shape: [num_priors] + is_in_gts_or_centers = is_in_gts_all | is_in_cts_all + + # both in boxes and centers, shape: [num_fg, num_gt] + is_in_boxes_and_centers = ( + is_in_gts[is_in_gts_or_centers, :] + & is_in_cts[is_in_gts_or_centers, :]) + return is_in_gts_or_centers, is_in_boxes_and_centers + + def dynamic_k_matching(self, cost, pairwise_ious, num_gt, valid_mask): + matching_matrix = torch.zeros_like(cost) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1.0 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0.0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1.0 + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0.0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_ious = (matching_matrix + * pairwise_ious).sum(1)[fg_mask_inboxes] + return matched_pred_ious, matched_gt_inds diff --git a/modelscope/models/cv/tinynas_detection/core/repvgg_block.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/repvgg_block.py similarity index 100% rename from modelscope/models/cv/tinynas_detection/core/repvgg_block.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/core/repvgg_block.py diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/core/utils.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/utils.py new file mode 100644 index 00000000..04b587e1 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/core/utils.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. + +from functools import partial + +import torch +import torch.distributed as dist +import torch.nn as nn + + +class Scale(nn.Module): + """A learnable scale parameter. + This layer scales the input by a learnable factor. It multiplies a + learnable scale parameter of shape (1,) with input of any shape. + Args: + scale (float): Initial value of scale factor. Default: 1.0 + """ + + def __init__(self, scale=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x): + return x * self.scale + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. 
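A tiny sketch of the `multi_apply` helper whose docstring begins above: it maps a function over several parallel input lists and transposes the per-input tuples into per-output lists. `forward_level` is a made-up function used only for illustration.

```python
from modelscope.models.cv.tinynas_detection.damo.base_models.core.utils import multi_apply


def forward_level(feat, stride):
    # hypothetical per-level computation that returns two values
    return feat * 2, stride + 1


doubled, bumped = multi_apply(forward_level, [1, 2, 3], [8, 16, 32])
# doubled == [2, 4, 6], bumped == [9, 17, 33]
```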
+ Args: + func (Function): A function that will be applied to a list of + arguments + Returns: + tuple(list): A tuple containing multiple list, each list contains \ + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def unmap(data, count, inds, fill=0): + """Unmap a subset of item (data) back to the original set of items (of size + count)""" + if data.dim() == 1: + ret = data.new_full((count, ), fill) + ret[inds.type(torch.bool)] = data + else: + new_size = (count, ) + data.size()[1:] + ret = data.new_full(new_size, fill) + ret[inds.type(torch.bool), :] = data + return ret + + +def reduce_mean(tensor): + """"Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor + + +def images_to_levels(target, num_levels): + """Convert targets by image to targets by feature level. + [target_img0, target_img1] -> [target_level0, target_level1, ...] + """ + target = torch.stack(target, 0) + level_targets = [] + start = 0 + for n in num_levels: + end = start + n + # level_targets.append(target[:, start:end].squeeze(0)) + level_targets.append(target[:, start:end]) + start = end + return level_targets diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/core/weight_init.py b/modelscope/models/cv/tinynas_detection/damo/base_models/core/weight_init.py new file mode 100644 index 00000000..92548b10 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/core/weight_init.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import numpy as np +import torch.nn as nn + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def bias_init_with_prob(prior_prob): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init diff --git a/modelscope/models/cv/tinynas_detection/head/__init__.py b/modelscope/models/cv/tinynas_detection/damo/base_models/heads/__init__.py similarity index 67% rename from modelscope/models/cv/tinynas_detection/head/__init__.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/heads/__init__.py index b522ef8a..8c264404 100644 --- a/modelscope/models/cv/tinynas_detection/head/__init__.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/heads/__init__.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. 
import copy @@ -11,9 +10,9 @@ def build_head(cfg): head_cfg = copy.deepcopy(cfg) name = head_cfg.pop('name') - if name == 'GFocalV2': - return GFocalHead_Tiny(**head_cfg) - elif name == 'ZeroHead': + if name == 'ZeroHead': return ZeroHead(**head_cfg) + elif name == 'GFocalV2': + return GFocalHead_Tiny(**head_cfg) else: raise NotImplementedError diff --git a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py b/modelscope/models/cv/tinynas_detection/damo/base_models/heads/gfocal_v2_tiny.py similarity index 98% rename from modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/heads/gfocal_v2_tiny.py index 822efd2a..017846d3 100644 --- a/modelscope/models/cv/tinynas_detection/head/gfocal_v2_tiny.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/heads/gfocal_v2_tiny.py @@ -9,8 +9,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from modelscope.models.cv.tinynas_detection.core.base_ops import (BaseConv, - DWConv) +from modelscope.models.cv.tinynas_detection.damo.base_models.core.base_ops import ( + BaseConv, DWConv) class Scale(nn.Module): @@ -370,6 +370,4 @@ class GFocalHead_Tiny(nn.Module): dis_preds = self.integral(reg_preds) * mlvl_center_priors[..., 2, None] bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds) - res = torch.cat([bboxes, cls_preds[..., 0:self.num_classes]], dim=-1) - - return res + return cls_preds[..., 0:self.num_classes], bboxes diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/heads/zero_head.py b/modelscope/models/cv/tinynas_detection/damo/base_models/heads/zero_head.py new file mode 100644 index 00000000..8026c301 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/heads/zero_head.py @@ -0,0 +1,526 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.tinynas_detection.damo.base_models.core.ops import \ + ConvBNAct +from modelscope.models.cv.tinynas_detection.damo.base_models.core.ota_assigner import \ + AlignOTAAssigner +from modelscope.models.cv.tinynas_detection.damo.base_models.core.utils import ( + Scale, multi_apply, reduce_mean) +from modelscope.models.cv.tinynas_detection.damo.base_models.core.weight_init import ( + bias_init_with_prob, normal_init) +from modelscope.models.cv.tinynas_detection.damo.base_models.losses.gfocal_loss import ( + DistributionFocalLoss, GIoULoss, QualityFocalLoss) +from modelscope.models.cv.tinynas_detection.damo.utils import postprocess + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + """ + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1]) + y1 = y1.clamp(min=0, max=max_shape[0]) + x2 = x2.clamp(min=0, max=max_shape[1]) + y2 = y2.clamp(min=0, max=max_shape[0]) + return torch.stack([x1, y1, x2, y2], -1) + + +def bbox2distance(points, bbox, max_dis=None, eps=0.1): + """Decode bounding box based on distances. 
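A small round-trip sketch for the two box codecs in this file (`distance2bbox` above, `bbox2distance` whose body follows). Note that `bbox2distance` converts corner boxes into per-side distances from the prior centers, i.e. the inverse of `distance2bbox`. The import path is the new `zero_head.py` module path from this diff; the values are illustrative.

```python
import torch
from modelscope.models.cv.tinynas_detection.damo.base_models.heads.zero_head import (
    bbox2distance, distance2bbox)

points = torch.tensor([[50., 60.], [120., 80.]])                       # prior centers (x, y)
boxes = torch.tensor([[40., 50., 70., 90.], [100., 60., 150., 130.]])  # x1, y1, x2, y2

dists = bbox2distance(points, boxes)       # left, top, right, bottom offsets
restored = distance2bbox(points, dists)    # back to x1, y1, x2, y2
assert torch.allclose(restored, boxes)
```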
+ """ + left = points[:, 0] - bbox[:, 0] + top = points[:, 1] - bbox[:, 1] + right = bbox[:, 2] - points[:, 0] + bottom = bbox[:, 3] - points[:, 1] + if max_dis is not None: + left = left.clamp(min=0, max=max_dis - eps) + top = top.clamp(min=0, max=max_dis - eps) + right = right.clamp(min=0, max=max_dis - eps) + bottom = bottom.clamp(min=0, max=max_dis - eps) + return torch.stack([left, top, right, bottom], -1) + + +class Integral(nn.Module): + """A fixed layer for calculating integral result from distribution. + """ + + def __init__(self, reg_max=16): + super(Integral, self).__init__() + self.reg_max = reg_max + self.register_buffer('project', + torch.linspace(0, self.reg_max, self.reg_max + 1)) + + def forward(self, x): + """Forward feature from the regression head to get integral result of + bounding box location. + """ + b, hw, _, _ = x.size() + x = x.reshape(b * hw * 4, self.reg_max + 1) + y = self.project.type_as(x).unsqueeze(1) + x = torch.matmul(x, y).reshape(b, hw, 4) + return x + + +class ZeroHead(nn.Module): + """Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality + Estimation for Dense Object Detection. + """ + + def __init__( + self, + num_classes, + in_channels, + stacked_convs=4, # 4 + feat_channels=256, + reg_max=12, + strides=[8, 16, 32], + norm='gn', + act='relu', + nms_conf_thre=0.05, + nms_iou_thre=0.7, + nms=True, + **kwargs): + self.in_channels = in_channels + self.num_classes = num_classes + self.stacked_convs = stacked_convs + self.act = act + self.strides = strides + if stacked_convs == 0: + feat_channels = in_channels + if isinstance(feat_channels, list): + self.feat_channels = feat_channels + else: + self.feat_channels = [feat_channels] * len(self.strides) + # add 1 for keep consistance with former models + self.cls_out_channels = num_classes + 1 + self.reg_max = reg_max + + self.nms = nms + self.nms_conf_thre = nms_conf_thre + self.nms_iou_thre = nms_iou_thre + + self.assigner = AlignOTAAssigner( + center_radius=2.5, cls_weight=1.0, iou_weight=3.0) + + self.feat_size = [torch.zeros(4) for _ in strides] + + super(ZeroHead, self).__init__() + self.integral = Integral(self.reg_max) + self.loss_dfl = DistributionFocalLoss(loss_weight=0.25) + self.loss_cls = QualityFocalLoss( + use_sigmoid=False, beta=2.0, loss_weight=1.0) + self.loss_bbox = GIoULoss(loss_weight=2.0) + + self._init_layers() + + def _build_not_shared_convs(self, in_channel, feat_channels): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + + for i in range(self.stacked_convs): + chn = feat_channels if i > 0 else in_channel + kernel_size = 3 if i > 0 else 1 + cls_convs.append( + ConvBNAct( + chn, + feat_channels, + kernel_size, + stride=1, + groups=1, + norm='bn', + act=self.act)) + reg_convs.append( + ConvBNAct( + chn, + feat_channels, + kernel_size, + stride=1, + groups=1, + norm='bn', + act=self.act)) + + return cls_convs, reg_convs + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + + for i in range(len(self.strides)): + cls_convs, reg_convs = self._build_not_shared_convs( + self.in_channels[i], self.feat_channels[i]) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.gfl_cls = nn.ModuleList([ + nn.Conv2d( + self.feat_channels[i], self.cls_out_channels, 3, padding=1) + for i in range(len(self.strides)) + ]) + + self.gfl_reg = nn.ModuleList([ + nn.Conv2d( + self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1) + for i in range(len(self.strides)) + 
]) + + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def init_weights(self): + """Initialize weights of the head.""" + for cls_conv in self.cls_convs: + for m in cls_conv: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.01) + for reg_conv in self.reg_convs: + for m in reg_conv: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.01) + bias_cls = bias_init_with_prob(0.01) + for i in range(len(self.strides)): + normal_init(self.gfl_cls[i], std=0.01, bias=bias_cls) + normal_init(self.gfl_reg[i], std=0.01) + + def forward(self, xin, labels=None, imgs=None, aux_targets=None): + if self.training: + return self.forward_train(xin=xin, labels=labels, imgs=imgs) + else: + return self.forward_eval(xin=xin, labels=labels, imgs=imgs) + + def forward_train(self, xin, labels=None, imgs=None, aux_targets=None): + + # prepare labels during training + b, c, h, w = xin[0].shape + if labels is not None: + gt_bbox_list = [] + gt_cls_list = [] + for label in labels: + gt_bbox_list.append(label.bbox) + gt_cls_list.append((label.get_field('labels') + - 1).long()) # labels starts from 1 + + # prepare priors for label assignment and bbox decode + mlvl_priors_list = [ + self.get_single_level_center_priors( + xin[i].shape[0], + xin[i].shape[-2:], + stride, + dtype=torch.float32, + device=xin[0].device) for i, stride in enumerate(self.strides) + ] + mlvl_priors = torch.cat(mlvl_priors_list, dim=1) + + # forward for bboxes and classification prediction + cls_scores, bbox_preds, bbox_before_softmax = multi_apply( + self.forward_single, + xin, + self.cls_convs, + self.reg_convs, + self.gfl_cls, + self.gfl_reg, + self.scales, + ) + cls_scores = torch.cat(cls_scores, dim=1) + bbox_preds = torch.cat(bbox_preds, dim=1) + bbox_before_softmax = torch.cat(bbox_before_softmax, dim=1) + + # calculating losses + loss = self.loss( + cls_scores, + bbox_preds, + bbox_before_softmax, + gt_bbox_list, + gt_cls_list, + mlvl_priors, + ) + return loss + + def forward_eval(self, xin, labels=None, imgs=None): + + # prepare priors for label assignment and bbox decode + if self.feat_size[0] != xin[0].shape: + mlvl_priors_list = [ + self.get_single_level_center_priors( + xin[i].shape[0], + xin[i].shape[-2:], + stride, + dtype=torch.float32, + device=xin[0].device) + for i, stride in enumerate(self.strides) + ] + self.mlvl_priors = torch.cat(mlvl_priors_list, dim=1) + self.feat_size[0] = xin[0].shape + + # forward for bboxes and classification prediction + cls_scores, bbox_preds = multi_apply( + self.forward_single, + xin, + self.cls_convs, + self.reg_convs, + self.gfl_cls, + self.gfl_reg, + self.scales, + ) + cls_scores = torch.cat(cls_scores, dim=1)[:, :, :self.num_classes] + bbox_preds = torch.cat(bbox_preds, dim=1) + # batch bbox decode + bbox_preds = self.integral(bbox_preds) * self.mlvl_priors[..., 2, None] + bbox_preds = distance2bbox(self.mlvl_priors[..., :2], bbox_preds) + + if self.nms: + output = postprocess(cls_scores, bbox_preds, self.num_classes, + self.nms_conf_thre, self.nms_iou_thre, imgs) + return output + return cls_scores, bbox_preds + + def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg, scale): + """Forward feature of a single scale level. 
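A sketch of what the `Integral` layer defined earlier in this file produces: the expected value of a per-side distribution over the bins `{0, ..., reg_max}`. In `forward_eval` above this expectation is then multiplied by the prior stride before `distance2bbox`. Shapes are illustrative and the import path is the new module path from this diff.

```python
import torch
import torch.nn.functional as F

from modelscope.models.cv.tinynas_detection.damo.base_models.heads.zero_head import Integral

reg_max = 12
integral = Integral(reg_max)

logits = torch.randn(2, 100, 4, reg_max + 1)   # (batch, priors, 4 sides, bins)
probs = F.softmax(logits, dim=-1)              # per-side distribution over the bins
offsets = integral(probs)                      # (2, 100, 4) expected offsets in bin units
```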
+ + """ + cls_feat = x + reg_feat = x + + for cls_conv, reg_conv in zip(cls_convs, reg_convs): + cls_feat = cls_conv(cls_feat) + reg_feat = reg_conv(reg_feat) + + bbox_pred = scale(gfl_reg(reg_feat)).float() + N, C, H, W = bbox_pred.size() + if self.training: + bbox_before_softmax = bbox_pred.reshape(N, 4, self.reg_max + 1, H, + W) + bbox_before_softmax = bbox_before_softmax.flatten( + start_dim=3).permute(0, 3, 1, 2) + bbox_pred = F.softmax( + bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2) + + cls_score = gfl_cls(cls_feat).sigmoid() + + cls_score = cls_score.flatten(start_dim=2).permute( + 0, 2, 1) # N, h*w, self.num_classes+1 + bbox_pred = bbox_pred.flatten(start_dim=3).permute( + 0, 3, 1, 2) # N, h*w, 4, self.reg_max+1 + if self.training: + return cls_score, bbox_pred, bbox_before_softmax + else: + return cls_score, bbox_pred + + def get_single_level_center_priors(self, batch_size, featmap_size, stride, + dtype, device): + + h, w = featmap_size + x_range = (torch.arange(0, int(w), dtype=dtype, + device=device)) * stride + y_range = (torch.arange(0, int(h), dtype=dtype, + device=device)) * stride + + x = x_range.repeat(h, 1) + y = y_range.unsqueeze(-1).repeat(1, w) + + y = y.flatten() + x = x.flatten() + strides = x.new_full((x.shape[0], ), stride) + priors = torch.stack([x, y, strides, strides], dim=-1) + + return priors.unsqueeze(0).repeat(batch_size, 1, 1) + + def loss( + self, + cls_scores, + bbox_preds, + bbox_before_softmax, + gt_bboxes, + gt_labels, + mlvl_center_priors, + gt_bboxes_ignore=None, + ): + """Compute losses of the head. + + """ + device = cls_scores[0].device + + # get decoded bboxes for label assignment + dis_preds = self.integral(bbox_preds) * mlvl_center_priors[..., 2, + None] + decoded_bboxes = distance2bbox(mlvl_center_priors[..., :2], dis_preds) + cls_reg_targets = self.get_targets( + cls_scores, + decoded_bboxes, + gt_bboxes, + mlvl_center_priors, + gt_labels_list=gt_labels) + + if cls_reg_targets is None: + return None + + (labels_list, label_scores_list, label_weights_list, bbox_targets_list, + bbox_weights_list, dfl_targets_list, num_pos) = cls_reg_targets + + num_total_pos = max( + reduce_mean(torch.tensor(num_pos).type( + torch.float).to(device)).item(), 1.0) + + labels = torch.cat(labels_list, dim=0) + label_scores = torch.cat(label_scores_list, dim=0) + bbox_targets = torch.cat(bbox_targets_list, dim=0) + dfl_targets = torch.cat(dfl_targets_list, dim=0) + + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # bbox_preds = bbox_preds.reshape(-1, 4 * (self.reg_max + 1)) + bbox_before_softmax = bbox_before_softmax.reshape( + -1, 4 * (self.reg_max + 1)) + decoded_bboxes = decoded_bboxes.reshape(-1, 4) + + loss_qfl = self.loss_cls( + cls_scores, (labels, label_scores), avg_factor=num_total_pos) + + pos_inds = torch.nonzero( + (labels >= 0) & (labels < self.num_classes), + as_tuple=False).squeeze(1) + + weight_targets = cls_scores.detach() + weight_targets = weight_targets.max(dim=1)[0][pos_inds] + norm_factor = max(reduce_mean(weight_targets.sum()).item(), 1.0) + + if len(pos_inds) > 0: + loss_bbox = self.loss_bbox( + decoded_bboxes[pos_inds], + bbox_targets[pos_inds], + weight=weight_targets, + avg_factor=1.0 * norm_factor, + ) + loss_dfl = self.loss_dfl( + bbox_before_softmax[pos_inds].reshape(-1, self.reg_max + 1), + dfl_targets[pos_inds].reshape(-1), + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0 * norm_factor, + ) + else: + loss_bbox = bbox_preds.sum() / norm_factor * 0.0 + loss_dfl = bbox_preds.sum() / 
norm_factor * 0.0 + + total_loss = loss_qfl + loss_bbox + loss_dfl + + return dict( + total_loss=total_loss, + loss_cls=loss_qfl, + loss_bbox=loss_bbox, + loss_dfl=loss_dfl, + ) + + def get_targets(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + mlvl_center_priors, + gt_labels_list=None, + unmap_outputs=True): + """Get targets for GFL head. + + """ + num_imgs = mlvl_center_priors.shape[0] + + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + + (all_labels, all_label_scores, all_label_weights, all_bbox_targets, + all_bbox_weights, all_dfl_targets, + all_pos_num) = multi_apply(self.get_target_single, mlvl_center_priors, + cls_scores, bbox_preds, gt_bboxes_list, + gt_labels_list) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + all_pos_num = sum(all_pos_num) + + return (all_labels, all_label_scores, all_label_weights, + all_bbox_targets, all_bbox_weights, all_dfl_targets, + all_pos_num) + + def get_target_single(self, + center_priors, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + unmap_outputs=True, + gt_bboxes_ignore=None): + """Compute regression, classification targets for anchors in a single + image. + + """ + # assign gt and sample anchors + + num_valid_center = center_priors.shape[0] + + labels = center_priors.new_full((num_valid_center, ), + self.num_classes, + dtype=torch.long) + label_weights = center_priors.new_zeros( + num_valid_center, dtype=torch.float) + label_scores = center_priors.new_zeros( + num_valid_center, dtype=torch.float) + + bbox_targets = torch.zeros_like(center_priors) + bbox_weights = torch.zeros_like(center_priors) + dfl_targets = torch.zeros_like(center_priors) + + if gt_labels.size(0) == 0: + + return (labels, label_scores, label_weights, bbox_targets, + bbox_weights, dfl_targets, 0) + + assign_result = self.assigner.assign(cls_scores.detach(), + center_priors, + bbox_preds.detach(), gt_bboxes, + gt_labels) + + pos_inds, neg_inds, pos_bbox_targets, pos_assign_gt_inds = self.sample( + assign_result, gt_bboxes) + pos_ious = assign_result.max_overlaps[pos_inds] + + if len(pos_inds) > 0: + labels[pos_inds] = gt_labels[pos_assign_gt_inds] + label_scores[pos_inds] = pos_ious + label_weights[pos_inds] = 1.0 + + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + dfl_targets[pos_inds, :] = ( + bbox2distance( + center_priors[pos_inds, :2] + / center_priors[pos_inds, None, 2], + pos_bbox_targets / center_priors[pos_inds, None, 2], + self.reg_max)) + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + # map up to original set of anchors + + return (labels, label_scores, label_weights, bbox_targets, + bbox_weights, dfl_targets, pos_inds.size(0)) + + def sample(self, assign_result, gt_bboxes): + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_bboxes.numel() == 0: + # hack for index error case + assert pos_assigned_gt_inds.numel() == 0 + pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] + + return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/losses/__init__.py 
b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/losses/distill_loss.py b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/distill_loss.py new file mode 100644 index 00000000..fda2fc47 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/distill_loss.py @@ -0,0 +1,173 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FeatureLoss(nn.Module): + + def __init__(self, + channels_s, + channels_t, + distiller='cwd', + loss_weight=1.0): + super(FeatureLoss, self).__init__() + self.loss_weight = loss_weight + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.align_module = nn.ModuleList([ + nn.Conv2d( + channel, tea_channel, kernel_size=1, stride=1, + padding=0).to(device) + for channel, tea_channel in zip(channels_s, channels_t) + ]) + self.norm = [ + nn.BatchNorm2d(tea_channel, affine=False).to(device) + for tea_channel in channels_t + ] + + if (distiller == 'mimic'): + self.feature_loss = MimicLoss(channels_s, channels_t) + elif (distiller == 'mgd'): + self.feature_loss = MGDLoss(channels_s, channels_t) + elif (distiller == 'cwd'): + self.feature_loss = CWDLoss(channels_s, channels_t) + else: + raise NotImplementedError + + def forward(self, y_s, y_t): + assert len(y_s) == len(y_t) + tea_feats = [] + stu_feats = [] + + for idx, (s, t) in enumerate(zip(y_s, y_t)): + s = self.align_module[idx](s) + s = self.norm[idx](s) + t = self.norm[idx](t) + tea_feats.append(t) + stu_feats.append(s) + + loss = self.feature_loss(stu_feats, tea_feats) + return self.loss_weight * loss + + +class MimicLoss(nn.Module): + + def __init__(self, channels_s, channels_t): + super(MimicLoss, self).__init__() + self.mse = nn.MSELoss() + + def forward(self, y_s, y_t): + """Forward computation. + Args: + y_s (list): The student model prediction with + shape (N, C, H, W) in list. + y_t (list): The teacher model prediction with + shape (N, C, H, W) in list. + Return: + torch.Tensor: The calculated loss value of all stages. + """ + assert len(y_s) == len(y_t) + losses = [] + for idx, (s, t) in enumerate(zip(y_s, y_t)): + assert s.shape == t.shape + losses.append(self.mse(s, t)) + loss = sum(losses) + return loss + + +class MGDLoss(nn.Module): + + def __init__(self, + channels_s, + channels_t, + alpha_mgd=0.00002, + lambda_mgd=0.65): + super(MGDLoss, self).__init__() + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.alpha_mgd = alpha_mgd + self.lambda_mgd = lambda_mgd + + self.generation = [ + nn.Sequential( + nn.Conv2d(channel, channel, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(channel, channel, kernel_size=3, + padding=1)).to(device) for channel in channels_t + ] + + def forward(self, y_s, y_t): + """Forward computation. + Args: + y_s (list): The student model prediction with + shape (N, C, H, W) in list. + y_t (list): The teacher model prediction with + shape (N, C, H, W) in list. + Return: + torch.Tensor: The calculated loss value of all stages. 
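A hypothetical end-to-end call of the `FeatureLoss` wrapper above with the `'cwd'` distiller: three student FPN levels are projected to the teacher channel counts, normalized, and scored with the channel-wise distillation loss. Channel counts, feature sizes and the batch are placeholders.

```python
import torch
from modelscope.models.cv.tinynas_detection.damo.base_models.losses.distill_loss import \
    FeatureLoss

device = 'cuda' if torch.cuda.is_available() else 'cpu'  # FeatureLoss builds its layers on this device
channels_s, channels_t = [64, 128, 256], [96, 192, 384]
criterion = FeatureLoss(channels_s, channels_t, distiller='cwd', loss_weight=1.0)

# three FPN levels for student and teacher (e.g. strides 8/16/32 of a 640 input)
y_s = [torch.randn(2, c, 80 // 2**i, 80 // 2**i, device=device) for i, c in enumerate(channels_s)]
y_t = [torch.randn(2, c, 80 // 2**i, 80 // 2**i, device=device) for i, c in enumerate(channels_t)]

loss = criterion(y_s, y_t)   # scalar tensor; loss.backward() trains the student and the align convs
```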
+ """ + assert len(y_s) == len(y_t) + losses = [] + for idx, (s, t) in enumerate(zip(y_s, y_t)): + assert s.shape == t.shape + losses.append(self.get_dis_loss(s, t, idx) * self.alpha_mgd) + loss = sum(losses) + return loss + + def get_dis_loss(self, preds_S, preds_T, idx): + loss_mse = nn.MSELoss(reduction='sum') + N, C, H, W = preds_T.shape + + device = preds_S.device + mat = torch.rand((N, 1, H, W)).to(device) + mat = torch.where(mat > 1 - self.lambda_mgd, 0, 1).to(device) + + masked_fea = torch.mul(preds_S, mat) + new_fea = self.generation[idx](masked_fea) + + dis_loss = loss_mse(new_fea, preds_T) / N + + return dis_loss + + +class CWDLoss(nn.Module): + """PyTorch version of `Channel-wise Distillation for Semantic Segmentation. + `_. + """ + + def __init__(self, channels_s, channels_t, tau=1.0): + super(CWDLoss, self).__init__() + self.tau = tau + + def forward(self, y_s, y_t): + """Forward computation. + Args: + y_s (list): The student model prediction with + shape (N, C, H, W) in list. + y_t (list): The teacher model prediction with + shape (N, C, H, W) in list. + Return: + torch.Tensor: The calculated loss value of all stages. + """ + assert len(y_s) == len(y_t) + losses = [] + + for idx, (s, t) in enumerate(zip(y_s, y_t)): + assert s.shape == t.shape + N, C, H, W = s.shape + # normalize in channel diemension + softmax_pred_T = F.softmax( + t.view(-1, W * H) / self.tau, dim=1) # [N*C, H*W] + + logsoftmax = torch.nn.LogSoftmax(dim=1) + cost = torch.sum(softmax_pred_T + * logsoftmax(t.view(-1, W * H) / self.tau) + - softmax_pred_T + * logsoftmax(s.view(-1, W * H) / self.tau)) * ( + self.tau**2) + + losses.append(cost / (C * N)) + loss = sum(losses) + + return loss diff --git a/modelscope/models/cv/tinynas_detection/damo/base_models/losses/gfocal_loss.py b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/gfocal_loss.py new file mode 100644 index 00000000..5314e2e7 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/losses/gfocal_loss.py @@ -0,0 +1,328 @@ +# This file mainly comes from +# https://github.com/implus/GFocalV2/blob/master/mmdet/models/losses/gfocal_loss.py + +import functools + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.tinynas_detection.damo.utils.boxes import \ + bbox_overlaps + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. 
+ + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper + + +def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Avarage factor when computing the mean of losses. + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +@weighted_loss +def giou_loss(pred, target, eps=1e-7): + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + Args: + pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (torch.Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + Return: + Tensor: Loss tensor. + """ + gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps) + loss = 1 - gious + return loss + + +class GIoULoss(nn.Module): + + def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): + super(GIoULoss, self).__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * giou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@weighted_loss +def distribution_focal_loss(pred, label): + r"""Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. 
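A tiny numeric sketch of the DFL formula implemented just below: a fractional target of 2.3 is supervised by the two neighbouring bins 2 and 3 with weights 0.7 and 0.3. The import path is the new `gfocal_loss.py` module path from this diff.

```python
import torch
import torch.nn.functional as F

from modelscope.models.cv.tinynas_detection.damo.base_models.losses.gfocal_loss import \
    distribution_focal_loss

pred = torch.randn(1, 13)        # logits over the bins {0, ..., 12}
label = torch.tensor([2.3])      # fractional distance target

left = label.long()              # bin 2
right = left + 1                 # bin 3
manual = (F.cross_entropy(pred, left, reduction='none') * (right.float() - label)
          + F.cross_entropy(pred, right, reduction='none') * (label - left.float()))

assert torch.allclose(manual, distribution_focal_loss(pred, label, reduction='none'))
```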
+ Args: + pred (torch.Tensor): Predicted general distribution of bounding boxes + (before softmax) with shape (N, n+1), n is the max value of the + integral set `{0, ..., n}` in paper. + label (torch.Tensor): Target distance label for bounding boxes with + shape (N,). + Returns: + torch.Tensor: Loss tensor with shape (N,). + """ + dis_left = label.long() + dis_right = dis_left + 1 + weight_left = dis_right.float() - label + weight_right = label - dis_left.float() + loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \ + + F.cross_entropy(pred, dis_right, reduction='none') * weight_right + return loss + + +class DistributionFocalLoss(nn.Module): + r"""Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + Args: + reduction (str): Options are `'none'`, `'mean'` and `'sum'`. + loss_weight (float): Loss weight of current loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(DistributionFocalLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + Args: + pred (torch.Tensor): Predicted general distribution of bounding + boxes (before softmax) with shape (N, n+1), n is the max value + of the integral set `{0, ..., n}` in paper. + target (torch.Tensor): Target distance label for bounding boxes + with shape (N,). + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_cls = self.loss_weight * distribution_focal_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_cls + + +@weighted_loss +def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True): + r"""Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + Args: + pred (torch.Tensor): Predicted joint representation of classification + and quality (IoU) estimation with shape (N, C), C is the number of + classes. + target (tuple([torch.Tensor])): Target category label with shape (N,) + and target quality label with shape (N,). + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + Returns: + torch.Tensor: Loss tensor with shape (N,). 
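A hypothetical call of `QualityFocalLoss` in the `use_sigmoid=False` mode that `ZeroHead` wires up earlier in this diff: predictions are probabilities and the target is a `(labels, IoU scores)` pair. All numbers are illustrative.

```python
import torch
from modelscope.models.cv.tinynas_detection.damo.base_models.losses.gfocal_loss import \
    QualityFocalLoss

num_priors, num_classes = 8, 4
qfl = QualityFocalLoss(use_sigmoid=False, beta=2.0, loss_weight=1.0)

pred = torch.rand(num_priors, num_classes)                          # probabilities, not logits
labels = torch.full((num_priors,), num_classes, dtype=torch.long)   # background index = num_classes
labels[:2] = torch.tensor([1, 3])                                   # two positive priors
scores = torch.zeros(num_priors)
scores[:2] = torch.tensor([0.7, 0.9])                               # IoU quality targets for the positives

loss = qfl(pred, (labels, scores), avg_factor=2.0)                  # scalar tensor
```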
+ """ + assert len(target) == 2, """target for QFL must be a tuple of two elements, + including category label and quality label, respectively""" + # label denotes the category id, score denotes the quality score + label, score = target + if use_sigmoid: + func = F.binary_cross_entropy_with_logits + else: + func = F.binary_cross_entropy + # negatives are supervised by 0 quality score + pred_sigmoid = pred.sigmoid() if use_sigmoid else pred + scale_factor = pred_sigmoid # 8400, 81 + zerolabel = scale_factor.new_zeros(pred.shape) + loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + bg_class_ind = pred.size(1) + pos = ((label >= 0) + & (label < bg_class_ind)).nonzero(as_tuple=False).squeeze(1) + pos_label = label[pos].long() + # positives are supervised by bbox quality (IoU) score + scale_factor = score[pos] - pred_sigmoid[pos, pos_label] + loss[pos, pos_label] = func( + pred[pos, pos_label], score[pos], + reduction='none') * scale_factor.abs().pow(beta) + + loss = loss.sum(dim=1, keepdim=False) + return loss + + +class QualityFocalLoss(nn.Module): + r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + Args: + use_sigmoid (bool): Whether sigmoid operation is conducted in QFL. + Defaults to True. + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Loss weight of current loss. + """ + + def __init__(self, + use_sigmoid=True, + beta=2.0, + reduction='mean', + loss_weight=1.0): + super(QualityFocalLoss, self).__init__() + # assert use_sigmoid is True, 'Only sigmoid in QFL supported now.' + self.use_sigmoid = use_sigmoid + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + Args: + pred (torch.Tensor): Predicted joint representation of + classification and quality (IoU) estimation with shape (N, C), + C is the number of classes. + target (tuple([torch.Tensor])): Target category label with shape + (N,) and target quality label with shape (N,). + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_cls = self.loss_weight * quality_focal_loss( + pred, + target, + weight, + beta=self.beta, + use_sigmoid=self.use_sigmoid, + reduction=reduction, + avg_factor=avg_factor) + return loss_cls diff --git a/modelscope/models/cv/tinynas_detection/neck/__init__.py b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/__init__.py similarity index 64% rename from modelscope/models/cv/tinynas_detection/neck/__init__.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/necks/__init__.py index e5b9e72a..a1b17fd7 100644 --- a/modelscope/models/cv/tinynas_detection/neck/__init__.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/__init__.py @@ -1,5 +1,4 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. import copy @@ -10,7 +9,9 @@ from .giraffe_fpn_btn import GiraffeNeckV2 def build_neck(cfg): neck_cfg = copy.deepcopy(cfg) name = neck_cfg.pop('name') - if name == 'GiraffeNeck': - return GiraffeNeck(**neck_cfg) - elif name == 'GiraffeNeckV2': + if name == 'GiraffeNeckV2': return GiraffeNeckV2(**neck_cfg) + elif name == 'GiraffeNeck': + return GiraffeNeck(**neck_cfg) + else: + raise NotImplementedError diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_config.py similarity index 97% rename from modelscope/models/cv/tinynas_detection/neck/giraffe_config.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_config.py index 23994356..1e610b16 100644 --- a/modelscope/models/cv/tinynas_detection/neck/giraffe_config.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_config.py @@ -1,12 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. import collections -import itertools -import os import networkx as nx -from omegaconf import OmegaConf Node = collections.namedtuple('Node', ['id', 'inputs', 'type']) diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_fpn.py similarity index 99% rename from modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_fpn.py index 1b7db26e..987358e8 100644 --- a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_fpn.py @@ -1,7 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. 
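`build_head` earlier in this diff and `build_neck` here follow the same convention: pop the `'name'` key from the config and pass the remaining entries as constructor kwargs. A hypothetical `ZeroHead` config is sketched below; the values are illustrative and would normally come from the model's configuration file.

```python
from modelscope.models.cv.tinynas_detection.damo.base_models.heads import build_head

head = build_head({
    'name': 'ZeroHead',          # dispatch key, removed before instantiation
    'num_classes': 80,
    'in_channels': [64, 128, 256],
    'stacked_convs': 0,          # 0 reuses in_channels as feat_channels
    'strides': [8, 16, 32],
    'nms': True,
})
```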
-import logging import math from collections import OrderedDict from functools import partial @@ -15,7 +14,7 @@ from timm import create_model from timm.models.layers import (Swish, create_conv2d, create_pool2d, get_act_layer) -from modelscope.models.cv.tinynas_detection.core.base_ops import ( +from modelscope.models.cv.tinynas_detection.damo.base_models.core.base_ops import ( CSPLayer, ShuffleBlock, ShuffleCSPLayer) from .giraffe_config import get_graph_config @@ -473,7 +472,6 @@ class GiraffeLayer(nn.Module): self.fnode = nn.ModuleList() reduction_base = feature_info[0]['reduction'] for i, fnode_cfg in fpn_config.items(): - logging.debug('fnode {} : {}'.format(i, fnode_cfg)) if fnode_cfg['is_out'] == 1: fpn_channels = outer_fpn_channels @@ -623,7 +621,6 @@ class GiraffeNeck(nn.Module): feature_info.append(dict(num_chs=in_chs, reduction=reduction)) self.cell = SequentialList() - logging.debug('building giraffeNeck') giraffe_layer = GiraffeLayer( feature_info=feature_info, fpn_config=fpn_config, diff --git a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_btn.py b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_fpn_btn.py similarity index 94% rename from modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_btn.py rename to modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_fpn_btn.py index f8519df0..9ce58f7c 100644 --- a/modelscope/models/cv/tinynas_detection/neck/giraffe_fpn_btn.py +++ b/modelscope/models/cv/tinynas_detection/damo/base_models/necks/giraffe_fpn_btn.py @@ -1,10 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors at https://github.com/tinyvision/damo-yolo. import torch import torch.nn as nn -from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct, CSPStage +from modelscope.models.cv.tinynas_detection.damo.base_models.core.ops import ( + ConvBNAct, CSPStage) class GiraffeNeckV2(nn.Module): diff --git a/modelscope/models/cv/tinynas_detection/damo/detectors/__init__.py b/modelscope/models/cv/tinynas_detection/damo/detectors/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/detectors/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/damo/detectors/detector.py b/modelscope/models/cv/tinynas_detection/damo/detectors/detector.py new file mode 100644 index 00000000..5f6b216b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/detectors/detector.py @@ -0,0 +1,99 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import torch +import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel as DDP + +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.cv.tinynas_detection.damo.base_models.backbones import \ + build_backbone +from modelscope.models.cv.tinynas_detection.damo.base_models.heads import \ + build_head +from modelscope.models.cv.tinynas_detection.damo.base_models.necks import \ + build_neck +from modelscope.models.cv.tinynas_detection.damo.structures.image_list import \ + to_image_list +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class Detector(TorchModel): + + def __init__(self, config): + super().__init__() + + self.backbone = build_backbone(config.model.backbone) + self.neck = build_neck(config.model.neck) + self.head = build_head(config.model.head) + + self.config = config + + def init_bn(self, M): + + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + def init_model(self): + + self.apply(self.init_bn) + + self.backbone.init_weights() + self.neck.init_weights() + self.head.init_weights() + + def load_pretrain_detector(self, pretrain_model): + ckpt = torch.load(pretrain_model, map_location='cpu') + if 'model' in ckpt: + state_dict = ckpt['model'] + elif 'state_dict' in ckpt: + state_dict = ckpt['state_dict'] + logger.info(f'Finetune from {pretrain_model}................') + new_state_dict = {} + for k, v in self.state_dict().items(): + k = k.replace('module.', '') + if 'head' in k: + new_state_dict[k] = self.state_dict()[k] + continue + new_state_dict[k] = state_dict[k] + + self.load_state_dict(new_state_dict, strict=True) + + def forward(self, x, targets=None, tea=False, stu=False): + images = to_image_list(x) + feature_outs = self.backbone(images.tensors) # list of tensor + fpn_outs = self.neck(feature_outs) + + if tea: + return fpn_outs + else: + outputs = self.head( + fpn_outs, + targets, + imgs=images, + ) + if stu: + return outputs, fpn_outs + else: + return outputs + + +def build_local_model(config, device): + model = Detector(config) + model.init_model() + model.to(device) + + return model + + +def build_ddp_model(model, local_rank): + model = DDP( + model, + device_ids=[local_rank], + output_device=local_rank, + broadcast_buffers=False, + find_unused_parameters=False) + + return model diff --git a/modelscope/models/cv/tinynas_detection/damo/structures/__init__.py b/modelscope/models/cv/tinynas_detection/damo/structures/__init__.py new file mode 100644 index 00000000..14e2370d --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/structures/__init__.py @@ -0,0 +1 @@ +# Copyright © Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/tinynas_detection/damo/structures/bounding_box.py b/modelscope/models/cv/tinynas_detection/damo/structures/bounding_box.py new file mode 100644 index 00000000..49fa646b --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/structures/bounding_box.py @@ -0,0 +1,245 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright © Alibaba, Inc. and its affiliates. + +import torch + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +class BoxList(object): + """ + This class represents a set of bounding boxes. + The bounding boxes are represented as a Nx4 Tensor. + In order to uniquely determine the bounding boxes with respect + to an image, we also store the corresponding image dimensions. 
+ They can contain extra information that is specific to each bounding box, + such as labels. + """ + + def __init__(self, bbox, image_size, mode='xyxy'): + device = bbox.device if isinstance( + bbox, torch.Tensor) else torch.device('cpu') + bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device) + if bbox.ndimension() != 2: + raise ValueError('bbox should have 2 dimensions, got {}'.format( + bbox.ndimension())) + if bbox.size(-1) != 4: + raise ValueError('last dimension of bbox should have a ' + 'size of 4, got {}'.format(bbox.size(-1))) + if mode not in ('xyxy', 'xywh'): + raise ValueError("mode should be 'xyxy' or 'xywh'") + + self.bbox = bbox + self.size = image_size # (image_width, image_height) + self.mode = mode + self.extra_fields = {} + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def has_field(self, field): + return field in self.extra_fields + + def fields(self): + return list(self.extra_fields.keys()) + + def _copy_extra_fields(self, bbox): + for k, v in bbox.extra_fields.items(): + self.extra_fields[k] = v + + def convert(self, mode): + if mode not in ('xyxy', 'xywh'): + raise ValueError("mode should be 'xyxy' or 'xywh'") + if mode == self.mode: + return self + # we only have two modes, so don't need to check + # self.mode + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if mode == 'xyxy': + bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1) + bbox = BoxList(bbox, self.size, mode=mode) + else: + TO_REMOVE = 0 + bbox = torch.cat( + (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), + dim=-1) + bbox = BoxList(bbox, self.size, mode=mode) + bbox._copy_extra_fields(self) + return bbox + + def _split_into_xyxy(self): + if self.mode == 'xyxy': + xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1) + return xmin, ymin, xmax, ymax + elif self.mode == 'xywh': + TO_REMOVE = 0 + xmin, ymin, w, h = self.bbox.split(1, dim=-1) + return ( + xmin, + ymin, + xmin + (w - TO_REMOVE).clamp(min=0), + ymin + (h - TO_REMOVE).clamp(min=0), + ) + else: + raise RuntimeError('Should not be here') + + def resize(self, size, *args, **kwargs): + """ + Returns a resized copy of this bounding box + :param size: The requested size in pixels, as a 2-tuple: + (width, height). 
+ """ + ratios = tuple( + float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_box = self.bbox * ratio + bbox = BoxList(scaled_box, size, mode=self.mode) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + return bbox + + ratio_width, ratio_height = ratios + xmin, ymin, xmax, ymax = self._split_into_xyxy() + scaled_xmin = xmin * ratio_width + scaled_xmax = xmax * ratio_width + scaled_ymin = ymin * ratio_height + scaled_ymax = ymax * ratio_height + scaled_box = torch.cat( + (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1) + bbox = BoxList(scaled_box, size, mode='xyxy') + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + + return bbox.convert(self.mode) + + def transpose(self, method): + """ + Transpose bounding box (flip or rotate in 90 degree steps) + :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`, + :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`, + :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`, + :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`. + """ + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + 'Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented') + + image_width, image_height = self.size + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if method == FLIP_LEFT_RIGHT: + TO_REMOVE = 0 + transposed_xmin = image_width - xmax - TO_REMOVE + transposed_xmax = image_width - xmin - TO_REMOVE + transposed_ymin = ymin + transposed_ymax = ymax + elif method == FLIP_TOP_BOTTOM: + transposed_xmin = xmin + transposed_xmax = xmax + transposed_ymin = image_height - ymax + transposed_ymax = image_height - ymin + + transposed_boxes = torch.cat((transposed_xmin, transposed_ymin, + transposed_xmax, transposed_ymax), + dim=-1) + bbox = BoxList(transposed_boxes, self.size, mode='xyxy') + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.transpose(method) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + def crop(self, box): + """ + Cropss a rectangular region from this bounding box. The box is a + 4-tuple defining the left, upper, right, and lower pixel + coordinate. 
+ """ + xmin, ymin, xmax, ymax = self._split_into_xyxy() + w, h = box[2] - box[0], box[3] - box[1] + cropped_xmin = (xmin - box[0]).clamp(min=0, max=w) + cropped_ymin = (ymin - box[1]).clamp(min=0, max=h) + cropped_xmax = (xmax - box[0]).clamp(min=0, max=w) + cropped_ymax = (ymax - box[1]).clamp(min=0, max=h) + + cropped_box = torch.cat( + (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1) + bbox = BoxList(cropped_box, (w, h), mode='xyxy') + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.crop(box) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + # Tensor-like methods + + def to(self, device): + bbox = BoxList(self.bbox.to(device), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, 'to'): + v = v.to(device) + bbox.add_field(k, v) + return bbox + + def __getitem__(self, item): + bbox = BoxList(self.bbox[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + bbox.add_field(k, v[item]) + return bbox + + def __len__(self): + return self.bbox.shape[0] + + def clip_to_image(self, remove_empty=True): + TO_REMOVE = 0 + self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) + self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE) + if remove_empty: + box = self.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + return self[keep] + return self + + def area(self): + box = self.bbox + if self.mode == 'xyxy': + TO_REMOVE = 0 + area = (box[:, 2] - box[:, 0] + TO_REMOVE) * ( + box[:, 3] - box[:, 1] + TO_REMOVE) + elif self.mode == 'xywh': + area = box[:, 2] * box[:, 3] + else: + raise RuntimeError('Should not be here') + + return area + + def copy_with_fields(self, fields, skip_missing=False): + bbox = BoxList(self.bbox, self.size, self.mode) + if not isinstance(fields, (list, tuple)): + fields = [fields] + for field in fields: + if self.has_field(field): + bbox.add_field(field, self.get_field(field)) + elif not skip_missing: + raise KeyError("Field '{}' not found in {}".format( + field, self)) + return bbox + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_boxes={}, '.format(len(self)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={}, '.format(self.size[1]) + s += 'mode={})'.format(self.mode) + return s diff --git a/modelscope/models/cv/tinynas_detection/damo/structures/boxlist_ops.py b/modelscope/models/cv/tinynas_detection/damo/structures/boxlist_ops.py new file mode 100644 index 00000000..6d00a89d --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/structures/boxlist_ops.py @@ -0,0 +1,92 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright © Alibaba, Inc. and its affiliates. + +import torch + +from .bounding_box import BoxList + + +def remove_small_boxes(boxlist, min_size): + """ + Only keep boxes with both sides >= min_size + Arguments: + boxlist (Boxlist) + min_size (int) + """ + xywh_boxes = boxlist.convert('xywh').bbox + _, _, ws, hs = xywh_boxes.unbind(dim=1) + keep = ((ws >= min_size) & (hs >= min_size)).nonzero().squeeze(1) + return boxlist[keep] + + +def boxlist_iou(boxlist1, boxlist2): + """Compute the intersection over union of two set of boxes. + The box order must be (xmin, ymin, xmax, ymax). + Arguments: + box1: (BoxList) bounding boxes, sized [N,4]. + box2: (BoxList) bounding boxes, sized [M,4]. + Returns: + (tensor) iou, sized [N,M]. 
+ Reference: + https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py + """ + if boxlist1.size != boxlist2.size: + raise RuntimeError( + 'boxlists should have same image size, got {}, {}'.format( + boxlist1, boxlist2)) + + area1 = boxlist1.area() + area2 = boxlist2.area() + + box1, box2 = boxlist1.bbox, boxlist2.bbox + + lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] + rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] + + TO_REMOVE = 1 + + wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + iou = inter / (area1[:, None] + area2 - inter) + return iou + + +def _cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only + a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) + + +def cat_boxlist(bboxes): + """ + Concatenates a list of BoxList (having the same image size) into a + single BoxList + Arguments: + bboxes (list[BoxList]) + """ + assert isinstance(bboxes, (list, tuple)) + assert all(isinstance(bbox, BoxList) for bbox in bboxes) + + size = bboxes[0].size + assert all(bbox.size == size for bbox in bboxes) + + mode = bboxes[0].mode + assert all(bbox.mode == mode for bbox in bboxes) + + fields = set(bboxes[0].fields()) + assert all(set(bbox.fields()) == fields for bbox in bboxes) + + cat_boxes = BoxList( + _cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) + + for field in fields: + data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) + cat_boxes.add_field(field, data) + + return cat_boxes diff --git a/modelscope/models/cv/tinynas_detection/damo/structures/image_list.py b/modelscope/models/cv/tinynas_detection/damo/structures/image_list.py new file mode 100644 index 00000000..42aa7525 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/structures/image_list.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright © Alibaba, Inc. and its affiliates. + +from __future__ import division + +import torch + + +class ImageList(object): + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size, + and storing in a field the original sizes of each image + """ + + def __init__(self, tensors, image_sizes, pad_sizes): + """ + Arguments: + tensors (tensor) + image_sizes (list[tuple[int, int]]) + """ + self.tensors = tensors + self.image_sizes = image_sizes + self.pad_sizes = pad_sizes + + def to(self, *args, **kwargs): + cast_tensor = self.tensors.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes, self.pad_sizes) + + +def to_image_list(tensors, size_divisible=0, max_size=None): + """ + tensors can be an ImageList, a torch.Tensor or + an iterable of Tensors. It can't be a numpy array. 
+ When tensors is an iterable of Tensors, it pads + the Tensors with zeros so that they have the same + shape + """ + if isinstance(tensors, torch.Tensor) and size_divisible > 0: + tensors = [tensors] + + if isinstance(tensors, ImageList): + return tensors + elif isinstance(tensors, torch.Tensor): + # single tensor shape can be inferred + if tensors.dim() == 3: + tensors = tensors[None] + assert tensors.dim() == 4 + image_sizes = [tensor.shape[-2:] for tensor in tensors] + return ImageList(tensors, image_sizes, image_sizes) + elif isinstance(tensors, (tuple, list)): + if max_size is None: + max_size = tuple( + max(s) for s in zip(*[img.shape for img in tensors])) + # TODO Ideally, just remove this and let the model handle arbitrary + # input sizes + if size_divisible > 0: + import math + + stride = size_divisible + max_size = list(max_size) + max_size[1] = int(math.ceil(max_size[1] / stride) * stride) + max_size[2] = int(math.ceil(max_size[2] / stride) * stride) + max_size = tuple(max_size) + + batch_shape = (len(tensors), ) + max_size + batched_imgs = tensors[0].new(*batch_shape).zero_() # + 114 + for img, pad_img in zip(tensors, batched_imgs): + pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img) + + image_sizes = [im.shape[-2:] for im in tensors] + pad_sizes = [batched_imgs.shape[-2:] for im in batched_imgs] + + return ImageList(batched_imgs, image_sizes, pad_sizes) + else: + raise TypeError('Unsupported type for to_image_list: {}'.format( + type(tensors))) diff --git a/modelscope/models/cv/tinynas_detection/damo/utils/__init__.py b/modelscope/models/cv/tinynas_detection/damo/utils/__init__.py new file mode 100644 index 00000000..e93321ac --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright © Alibaba, Inc. and its affiliates. + +from .boxes import adjust_box_anns, postprocess +from .model_utils import ema_model, get_model_info +from .scheduler import cosine_scheduler diff --git a/modelscope/models/cv/tinynas_detection/damo/utils/boxes.py b/modelscope/models/cv/tinynas_detection/damo/utils/boxes.py new file mode 100644 index 00000000..b112b514 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/utils/boxes.py @@ -0,0 +1,327 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Megvii Inc. All rights reserved. +# Copyright (c) Alibaba, Inc. and its affiliates. + +import numpy as np +import torch +import torchvision + +from modelscope.models.cv.tinynas_detection.damo.structures.bounding_box import \ + BoxList + +__all__ = [ + 'filter_box', + 'postprocess', + 'bboxes_iou', + 'matrix_iou', + 'adjust_box_anns', + 'xyxy2xywh', + 'xyxy2cxcywh', + 'bbox_overlaps', +] + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): + """Calculate overlap between two sets of bboxes. + If ``is_aligned`` is ``False``, then calculate the overlaps between each + bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned + pair of bboxes1 and bboxes2. + Args: + bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty. + bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "iof" (intersection over + foreground). + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-6.
+ Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + Example: + >>> empty = torch.empty(0, 4) + >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes' last dimension is 4 + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( + bboxes1[..., 3] - bboxes1[..., 1]) + area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( + bboxes2[..., 3] - bboxes2[..., 1]) + + if is_aligned: + lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] + rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] + + wh = (rb - lt).clamp(min=0) # [B, rows, 2] + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2]) + enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:]) + else: + lt = torch.max(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) # [B, rows, cols, 2] + rb = torch.min(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] + + wh = (rb - lt).clamp(min=0) # [B, rows, cols, 2] + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + else: + union = area1[..., None] + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) + enclosed_rb = torch.max(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + # calculate gious + enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + iou_thr, + max_num=100, + score_factors=None): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered.
+ iou_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. + score_factors (Tensor): The factors multiplied to scores before + applying NMS + + Returns: + tuple: (bboxes, scores, labels), tensors of shape (k, 4), (k, ) and \ + (k, ). Labels are 0-based. + """ + num_classes = multi_scores.size(1) + # exclude background category + if multi_bboxes.shape[1] > 4: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, 4) + scores = multi_scores + # filter out boxes with low scores + valid_mask = scores > score_thr # 1000 * 80 bool + + # We use masked_select for ONNX exporting purpose, + # which is equivalent to bboxes = bboxes[valid_mask] + # (TODO): as ONNX does not support repeat now, + # we have to use this ugly code + # bboxes -> 1000, 4 + bboxes = torch.masked_select( + bboxes, + torch.stack((valid_mask, valid_mask, valid_mask, valid_mask), + -1)).view(-1, 4) # mask-> 1000*80*4, 80000*4 + if score_factors is not None: + scores = scores * score_factors[:, None] + scores = torch.masked_select(scores, valid_mask) + labels = valid_mask.nonzero(as_tuple=False)[:, 1] + + if bboxes.numel() == 0: + bboxes = multi_bboxes.new_zeros((0, 4)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + scores = multi_bboxes.new_zeros((0, )) + + return bboxes, scores, labels + + keep = torchvision.ops.batched_nms(bboxes, scores, labels, iou_thr) + + if max_num > 0: + keep = keep[:max_num] + + return bboxes[keep], scores[keep], labels[keep] + + +def filter_box(output, scale_range): + """ + output: (N, 5+class) shape + """ + min_scale, max_scale = scale_range + w = output[:, 2] - output[:, 0] + h = output[:, 3] - output[:, 1] + keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale) + return output[keep] + + +def filter_results(boxlist, num_classes, nms_thre): + boxes = boxlist.bbox + scores = boxlist.get_field('scores') + cls = boxlist.get_field('labels') + nms_out_index = torchvision.ops.batched_nms( + boxes, + scores, + cls, + nms_thre, + ) + boxlist = boxlist[nms_out_index] + + return boxlist + + +def postprocess(cls_scores, + bbox_preds, + num_classes, + conf_thre=0.7, + nms_thre=0.45, + imgs=None): + batch_size = bbox_preds.size(0) + output = [None for _ in range(batch_size)] + for i in range(batch_size): + # If none are remaining => process next image + if not bbox_preds[i].size(0): + continue + detections, scores, labels = multiclass_nms(bbox_preds[i], + cls_scores[i], conf_thre, + nms_thre, 500) + detections = torch.cat( + (detections, scores[:, None], scores[:, None], labels[:, None]), + dim=1) + + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + + # transfer to BoxList + for i in range(len(output)): + res = output[i] + if res is None or imgs is None: + boxlist = BoxList(torch.zeros(0, 4), (0, 0), mode='xyxy') + boxlist.add_field('objectness', 0) + boxlist.add_field('scores', 0) + boxlist.add_field('labels', -1) + + else: + img_h, img_w = imgs.image_sizes[i] + boxlist = BoxList(res[:, :4], (img_w, img_h), mode='xyxy') + boxlist.add_field('objectness', res[:, 4]) + boxlist.add_field('scores', res[:, 5]) + boxlist.add_field('labels', res[:, 6] + 1) + output[i] = boxlist + + return output + + +def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + if xyxy: + tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:,
:2]) + br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + else: + tl = torch.max( + (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), + ) + br = torch.min( + (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), + ) + + area_a = torch.prod(bboxes_a[:, 2:], 1) + area_b = torch.prod(bboxes_b[:, 2:], 1) + en = (tl < br).type(tl.type()).prod(dim=2) + area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) + return area_i / (area_a[:, None] + area_b - area_i) + + +def matrix_iou(a, b): + """ + Return IoU of a and b, numpy version for data augmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12) + + +def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): + bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max) + bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) + return bbox + + +def xyxy2xywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return bboxes + + +def xyxy2cxcywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5 + bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5 + return bboxes diff --git a/modelscope/models/cv/tinynas_detection/damo/utils/model_utils.py b/modelscope/models/cv/tinynas_detection/damo/utils/model_utils.py new file mode 100644 index 00000000..82f947b8 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/utils/model_utils.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +# Copyright (c) Megvii Inc. All rights reserved. +# Copyright © Alibaba, Inc. and its affiliates.
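The box helpers added above in damo/utils/boxes.py (bboxes_iou, matrix_iou, xyxy2cxcywh, ...) are plain tensor/NumPy utilities, so they can be sanity-checked in isolation. The sketch below is illustrative only: it assumes this patch is installed so that the new module path is importable, and the box coordinates are made up.

```python
# Minimal sketch of the pairwise IoU helpers added in damo/utils/boxes.py.
# Assumes this patch is applied; the box values below are illustrative.
import torch

from modelscope.models.cv.tinynas_detection.damo.utils.boxes import (
    bboxes_iou, xyxy2cxcywh)

# Boxes in xyxy format; the first box of boxes_b fully contains boxes_a,
# the second one is disjoint from it.
boxes_a = torch.tensor([[0., 0., 10., 10.]])
boxes_b = torch.tensor([[0., 0., 10., 20.],
                        [20., 20., 30., 30.]])

iou = bboxes_iou(boxes_a, boxes_b, xyxy=True)  # pairwise matrix, shape [1, 2]
print(iou)  # expected roughly [[0.5, 0.0]]

# xyxy2cxcywh rewrites boxes in place as (center_x, center_y, w, h)
print(xyxy2cxcywh(boxes_a.clone()))  # tensor([[5., 5., 10., 10.]])
```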
+ +import math +import time +from copy import deepcopy + +import torch +import torch.nn as nn +from thop import profile +from torch.nn.parallel import DistributedDataParallel as DDP + +__all__ = [ + 'fuse_conv_and_bn', + 'fuse_model', + 'get_model_info', + 'replace_module', + 'ema_model', +] + + +def ema_scheduler(x, ema_momentum): + return ema_momentum * (1 - math.exp(-x / 2000)) + + +class ema_model: + + def __init__(self, student, ema_momentum): + + self.model = deepcopy(student).eval() + self.ema_momentum = ema_momentum + for param in self.model.parameters(): + param.requires_grad_(False) + + def update(self, iters, student): + if isinstance(student, DDP): + student = student.module.state_dict() + else: + student = student.state_dict() + with torch.no_grad(): + momentum = ema_scheduler(iters, self.ema_momentum) + for name, param in self.model.state_dict().items(): + if param.dtype.is_floating_point: + param *= momentum + param += (1.0 - momentum) * student[name].detach() + + +def get_latency(model, inp, iters=500, warmup=2): + + start = time.time() + for i in range(iters): + out = model(inp) + torch.cuda.synchronize() + if i <= warmup: + start = time.time() + latency = (time.time() - start) / (iters - warmup) + + return out, latency + + +def get_model_info(model, tsize): + stride = 640 + model = model.eval() + backbone = model.backbone + neck = model.neck + head = model.head + h, w = tsize + img = torch.randn((1, 3, stride, stride), + device=next(model.parameters()).device) + + bf, bp = profile(deepcopy(backbone), inputs=(img, ), verbose=False) + bo, bl = get_latency(backbone, img, iters=10) + + nf, np = profile(deepcopy(neck), inputs=(bo, ), verbose=False) + no, nl = get_latency(neck, bo, iters=10) + + hf, hp = profile(deepcopy(head), inputs=(no, ), verbose=False) + ho, hl = get_latency(head, no, iters=10) + + _, total_latency = get_latency(model, img) + total_flops = 0 + info = '' + for name, flops, params, latency in zip(('backbone', 'neck', 'head'), + (bf, nf, hf), (bp, np, hp), + (bl, nl, hl)): + params /= 1e6 + flops /= 1e9 + flops *= tsize[0] * tsize[1] / stride / stride * 2 # Gflops + total_flops += flops + info += f"{name}'s params(M): {params:.2f}, " + \ + f'flops(G): {flops:.2f}, latency(ms): {latency*1000:.3f}\n' + info += f'total latency(ms): {total_latency*1000:.3f}, ' + \ + f'total flops(G): {total_flops:.2f}\n' + return info + + +def fuse_conv_and_bn(conv, bn): + # Fuse convolution and batchnorm layers + # https://tehnokv.com/posts/fusing-batchnorm-and-conv/ + fusedconv = ( + nn.Conv2d( + conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + groups=conv.groups, + bias=True, + ).requires_grad_(False).to(conv.weight.device)) + + # prepare filters + w_conv = conv.weight.clone().view(conv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) + + # prepare spatial bias + b_conv = ( + torch.zeros(conv.weight.size(0), device=conv.weight.device) + if conv.bias is None else conv.bias) + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div( + torch.sqrt(bn.running_var + bn.eps)) + fusedconv.bias.copy_( + torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fusedconv + + +def fuse_model(model): + from damo.base_models.core.ops import ConvBNAct + from damo.base_models.backbones.tinynas_res import ConvKXBN + + for m in model.modules(): + if type(m) is ConvBNAct and hasattr(m, 'bn'): + m.conv 
= fuse_conv_and_bn(m.conv, m.bn) # update conv + delattr(m, 'bn') # remove batchnorm + m.forward = m.fuseforward # update forward + elif type(m) is ConvKXBN and hasattr(m, 'bn1'): + m.conv1 = fuse_conv_and_bn(m.conv1, m.bn1) # update conv + delattr(m, 'bn1') # remove batchnorm + m.forward = m.fuseforward # update forward + + return model + + +def replace_module(module, + replaced_module_type, + new_module_type, + replace_func=None): + """ + Replace a given module type in a model with a new type. Mostly used for deployment. + + Args: + module (nn.Module): model to apply replace operation. + replaced_module_type (Type): module type to be replaced. + new_module_type (Type): module type to replace with. + replace_func (function): python function to describe replace logic. + Default value None. + + Returns: + model (nn.Module): module with the given type replaced. + """ + + def default_replace_func(replaced_module_type, new_module_type): + return new_module_type() + + if replace_func is None: + replace_func = default_replace_func + + model = module + if isinstance(module, replaced_module_type): + model = replace_func(replaced_module_type, new_module_type) + else: # recursively replace + for name, child in module.named_children(): + new_child = replace_module(child, replaced_module_type, + new_module_type) + if new_child is not child: # child is already replaced + model.add_module(name, new_child) + + return model diff --git a/modelscope/models/cv/tinynas_detection/damo/utils/scheduler.py b/modelscope/models/cv/tinynas_detection/damo/utils/scheduler.py new file mode 100644 index 00000000..0e042db5 --- /dev/null +++ b/modelscope/models/cv/tinynas_detection/damo/utils/scheduler.py @@ -0,0 +1,34 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import math + + +class cosine_scheduler: + + def __init__(self, + base_lr_per_img, + batch_size, + min_lr_ratio, + total_iters, + no_aug_iters, + warmup_iters, + warmup_start_lr=0): + + self.base_lr = base_lr_per_img * batch_size + self.final_lr = self.base_lr * min_lr_ratio + self.warmup_iters = warmup_iters + self.warmup_start_lr = warmup_start_lr + self.total_iters = total_iters + self.no_aug_iters = no_aug_iters + + def get_lr(self, iters): + + if iters < self.warmup_iters: + lr = (self.base_lr - self.warmup_start_lr) * pow( + iters / float(self.warmup_iters), 2) + self.warmup_start_lr + elif iters >= self.total_iters - self.no_aug_iters: + lr = self.final_lr + else: + lr = self.final_lr + 0.5 * (self.base_lr - self.final_lr) \ + * (1.0 + math.cos(math.pi * (iters - self.warmup_iters) + / (self.total_iters - self.warmup_iters - self.no_aug_iters))) + return lr diff --git a/modelscope/models/cv/tinynas_detection/detector.py b/modelscope/models/cv/tinynas_detection/detector.py index d7320aaa..94599dcc 100644 --- a/modelscope/models/cv/tinynas_detection/detector.py +++ b/modelscope/models/cv/tinynas_detection/detector.py @@ -4,19 +4,18 @@ import os.path as osp import pickle -import cv2 import torch import torch.nn as nn import torchvision -from modelscope.metainfo import Models from modelscope.models.base.base_torch_model import TorchModel -from modelscope.models.builder import MODELS -from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile, Tasks -from .backbone import build_backbone -from .head import build_head -from .neck import build_neck +from modelscope.models.cv.tinynas_detection.damo.base_models.backbones import \ + build_backbone +from modelscope.models.cv.tinynas_detection.damo.base_models.heads import \ + build_head +from
modelscope.models.cv.tinynas_detection.damo.base_models.necks import \ + build_neck +from modelscope.outputs.cv_outputs import DetectionOutput from .utils import parse_config @@ -48,13 +47,17 @@ class SingleStageDetector(TorchModel): self.backbone = build_backbone(self.cfg.model.backbone) self.neck = build_neck(self.cfg.model.neck) self.head = build_head(self.cfg.model.head) + self.head.nms = False self.apply(self.init_bn) self.load_pretrain_model(model_path) def load_pretrain_model(self, pretrain_model): - - state_dict = torch.load(pretrain_model, map_location='cpu')['model'] + ckpt = torch.load(pretrain_model, map_location='cpu') + if 'model' in ckpt: + state_dict = ckpt['model'] + elif 'state_dict' in ckpt: + state_dict = ckpt['state_dict'] new_state_dict = {} for k, v in state_dict.items(): k = k.replace('module.', '') @@ -67,41 +70,16 @@ class SingleStageDetector(TorchModel): m.eps = 1e-3 m.momentum = 0.03 - def inference(self, x): - + def forward(self, x): if self.training: - return self.forward_train(x) + pass else: - return self.forward_eval(x) - - def forward_train(self, x): - - pass - - def forward_eval(self, x): - - x = self.backbone(x) - x = self.neck(x) - prediction = self.head(x) - - return prediction - - def preprocess(self, image): - image = torch.from_numpy(image).type(torch.float32) - image = image.permute(2, 0, 1) - shape = image.shape # c, h, w - if self.size_divisible > 0: - import math - stride = self.size_divisible - shape = list(shape) - shape[1] = int(math.ceil(shape[1] / stride) * stride) - shape[2] = int(math.ceil(shape[2] / stride) * stride) - shape = tuple(shape) - pad_img = image.new(*shape).zero_() - pad_img[:, :image.shape[1], :image.shape[2]].copy_(image) - pad_img = pad_img.unsqueeze(0) - - return pad_img + x = self.backbone(x) + x = self.neck(x) + cls_scores, bbox_preds = self.head(x) + prediction = torch.cat( + [bbox_preds, cls_scores[..., 0:self.num_classes]], dim=-1) + return prediction def postprocess(self, preds): bboxes, scores, labels_idx = postprocess_gfocal( @@ -111,7 +89,11 @@ class SingleStageDetector(TorchModel): labels_idx = labels_idx.cpu().numpy() labels = [self.label_map[idx + 1][0]['name'] for idx in labels_idx] - return (bboxes, scores, labels) + return DetectionOutput( + boxes=bboxes, + scores=scores, + class_ids=labels, + ) def multiclass_nms(multi_bboxes, diff --git a/modelscope/models/cv/tinynas_detection/head/zero_head.py b/modelscope/models/cv/tinynas_detection/head/zero_head.py deleted file mode 100644 index 0e23ebc3..00000000 --- a/modelscope/models/cv/tinynas_detection/head/zero_head.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -# The DAMO-YOLO implementation is also open-sourced by the authors, and available -# at https://github.com/tinyvision/damo-yolo. -from functools import partial - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from modelscope.models.cv.tinynas_detection.core.ops import ConvBNAct - - -class Scale(nn.Module): - - def __init__(self, scale=1.0): - super(Scale, self).__init__() - self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) - - def forward(self, x): - return x * self.scale - - -def multi_apply(func, *args, **kwargs): - - pfunc = partial(func, **kwargs) if kwargs else func - map_results = map(pfunc, *args) - return tuple(map(list, zip(*map_results))) - - -def distance2bbox(points, distance, max_shape=None): - """Decode distance prediction to bounding box. 
- """ - x1 = points[..., 0] - distance[..., 0] - y1 = points[..., 1] - distance[..., 1] - x2 = points[..., 0] + distance[..., 2] - y2 = points[..., 1] + distance[..., 3] - if max_shape is not None: - x1 = x1.clamp(min=0, max=max_shape[1]) - y1 = y1.clamp(min=0, max=max_shape[0]) - x2 = x2.clamp(min=0, max=max_shape[1]) - y2 = y2.clamp(min=0, max=max_shape[0]) - return torch.stack([x1, y1, x2, y2], -1) - - -def bbox2distance(points, bbox, max_dis=None, eps=0.1): - """Decode bounding box based on distances. - """ - left = points[:, 0] - bbox[:, 0] - top = points[:, 1] - bbox[:, 1] - right = bbox[:, 2] - points[:, 0] - bottom = bbox[:, 3] - points[:, 1] - if max_dis is not None: - left = left.clamp(min=0, max=max_dis - eps) - top = top.clamp(min=0, max=max_dis - eps) - right = right.clamp(min=0, max=max_dis - eps) - bottom = bottom.clamp(min=0, max=max_dis - eps) - return torch.stack([left, top, right, bottom], -1) - - -class Integral(nn.Module): - """A fixed layer for calculating integral result from distribution. - """ - - def __init__(self, reg_max=16): - super(Integral, self).__init__() - self.reg_max = reg_max - self.register_buffer('project', - torch.linspace(0, self.reg_max, self.reg_max + 1)) - - def forward(self, x): - """Forward feature from the regression head to get integral result of - bounding box location. - """ - b, hw, _, _ = x.size() - x = x.reshape(b * hw * 4, self.reg_max + 1) - y = self.project.type_as(x).unsqueeze(1) - x = torch.matmul(x, y).reshape(b, hw, 4) - return x - - -class ZeroHead(nn.Module): - """Ref to Generalized Focal Loss V2: Learning Reliable Localization Quality - Estimation for Dense Object Detection. - """ - - def __init__( - self, - num_classes, - in_channels, - stacked_convs=4, # 4 - feat_channels=256, - reg_max=12, - strides=[8, 16, 32], - norm='gn', - act='relu', - nms_conf_thre=0.05, - nms_iou_thre=0.7, - nms=True, - **kwargs): - self.in_channels = in_channels - self.num_classes = num_classes - self.stacked_convs = stacked_convs - self.act = act - self.strides = strides - if stacked_convs == 0: - feat_channels = in_channels - if isinstance(feat_channels, list): - self.feat_channels = feat_channels - else: - self.feat_channels = [feat_channels] * len(self.strides) - # add 1 for keep consistance with former models - self.cls_out_channels = num_classes + 1 - self.reg_max = reg_max - - self.nms = nms - self.nms_conf_thre = nms_conf_thre - self.nms_iou_thre = nms_iou_thre - - self.feat_size = [torch.zeros(4) for _ in strides] - - super(ZeroHead, self).__init__() - self.integral = Integral(self.reg_max) - - self._init_layers() - - def _build_not_shared_convs(self, in_channel, feat_channels): - cls_convs = nn.ModuleList() - reg_convs = nn.ModuleList() - - for i in range(self.stacked_convs): - chn = feat_channels if i > 0 else in_channel - kernel_size = 3 if i > 0 else 1 - cls_convs.append( - ConvBNAct( - chn, - feat_channels, - kernel_size, - stride=1, - groups=1, - norm='bn', - act=self.act)) - reg_convs.append( - ConvBNAct( - chn, - feat_channels, - kernel_size, - stride=1, - groups=1, - norm='bn', - act=self.act)) - - return cls_convs, reg_convs - - def _init_layers(self): - """Initialize layers of the head.""" - self.cls_convs = nn.ModuleList() - self.reg_convs = nn.ModuleList() - - for i in range(len(self.strides)): - cls_convs, reg_convs = self._build_not_shared_convs( - self.in_channels[i], self.feat_channels[i]) - self.cls_convs.append(cls_convs) - self.reg_convs.append(reg_convs) - - self.gfl_cls = nn.ModuleList([ - nn.Conv2d( - 
self.feat_channels[i], self.cls_out_channels, 3, padding=1) - for i in range(len(self.strides)) - ]) - - self.gfl_reg = nn.ModuleList([ - nn.Conv2d( - self.feat_channels[i], 4 * (self.reg_max + 1), 3, padding=1) - for i in range(len(self.strides)) - ]) - - self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) - - def forward(self, xin, labels=None, imgs=None, aux_targets=None): - if self.training: - return NotImplementedError - else: - return self.forward_eval(xin=xin, labels=labels, imgs=imgs) - - def forward_eval(self, xin, labels=None, imgs=None): - - # prepare priors for label assignment and bbox decode - if self.feat_size[0] != xin[0].shape: - mlvl_priors_list = [ - self.get_single_level_center_priors( - xin[i].shape[0], - xin[i].shape[-2:], - stride, - dtype=torch.float32, - device=xin[0].device) - for i, stride in enumerate(self.strides) - ] - self.mlvl_priors = torch.cat(mlvl_priors_list, dim=1) - self.feat_size[0] = xin[0].shape - - # forward for bboxes and classification prediction - cls_scores, bbox_preds = multi_apply( - self.forward_single, - xin, - self.cls_convs, - self.reg_convs, - self.gfl_cls, - self.gfl_reg, - self.scales, - ) - cls_scores = torch.cat(cls_scores, dim=1)[:, :, :self.num_classes] - bbox_preds = torch.cat(bbox_preds, dim=1) - # batch bbox decode - bbox_preds = self.integral(bbox_preds) * self.mlvl_priors[..., 2, None] - bbox_preds = distance2bbox(self.mlvl_priors[..., :2], bbox_preds) - - res = torch.cat([bbox_preds, cls_scores[..., 0:self.num_classes]], - dim=-1) - return res - - def forward_single(self, x, cls_convs, reg_convs, gfl_cls, gfl_reg, scale): - """Forward feature of a single scale level. - - """ - cls_feat = x - reg_feat = x - - for cls_conv, reg_conv in zip(cls_convs, reg_convs): - cls_feat = cls_conv(cls_feat) - reg_feat = reg_conv(reg_feat) - - bbox_pred = scale(gfl_reg(reg_feat)).float() - N, C, H, W = bbox_pred.size() - if self.training: - bbox_before_softmax = bbox_pred.reshape(N, 4, self.reg_max + 1, H, - W) - bbox_before_softmax = bbox_before_softmax.flatten( - start_dim=3).permute(0, 3, 1, 2) - bbox_pred = F.softmax( - bbox_pred.reshape(N, 4, self.reg_max + 1, H, W), dim=2) - - cls_score = gfl_cls(cls_feat).sigmoid() - - cls_score = cls_score.flatten(start_dim=2).permute( - 0, 2, 1) # N, h*w, self.num_classes+1 - bbox_pred = bbox_pred.flatten(start_dim=3).permute( - 0, 3, 1, 2) # N, h*w, 4, self.reg_max+1 - if self.training: - return cls_score, bbox_pred, bbox_before_softmax - else: - return cls_score, bbox_pred - - def get_single_level_center_priors(self, batch_size, featmap_size, stride, - dtype, device): - - h, w = featmap_size - x_range = (torch.arange(0, int(w), dtype=dtype, - device=device)) * stride - y_range = (torch.arange(0, int(h), dtype=dtype, - device=device)) * stride - - x = x_range.repeat(h, 1) - y = y_range.unsqueeze(-1).repeat(1, w) - - y = y.flatten() - x = x.flatten() - strides = x.new_full((x.shape[0], ), stride) - priors = torch.stack([x, y, strides, strides], dim=-1) - - return priors.unsqueeze(0).repeat(batch_size, 1, 1) - - def sample(self, assign_result, gt_bboxes): - pos_inds = torch.nonzero( - assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() - neg_inds = torch.nonzero( - assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() - pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 - - if gt_bboxes.numel() == 0: - # hack for index error case - assert pos_assigned_gt_inds.numel() == 0 - pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) - else: - if 
len(gt_bboxes.shape) < 2: - gt_bboxes = gt_bboxes.view(-1, 4) - pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] - - return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds diff --git a/modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py b/modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py index 181c3095..3243e4db 100644 --- a/modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py +++ b/modelscope/models/cv/tinynas_detection/tinynas_damoyolo.py @@ -6,6 +6,9 @@ from modelscope.utils.constant import Tasks from .detector import SingleStageDetector +@MODELS.register_module( + Tasks.domain_specific_object_detection, + module_name=Models.tinynas_damoyolo) @MODELS.register_module( Tasks.image_object_detection, module_name=Models.tinynas_damoyolo) class DamoYolo(SingleStageDetector): diff --git a/modelscope/models/cv/video_depth_estimation/__init__.py b/modelscope/models/cv/video_depth_estimation/__init__.py new file mode 100644 index 00000000..b05e314a --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .dro_model import DROEstimation + +else: + _import_structure = { + 'dro_model': ['DROEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_depth_estimation/configs/__init__.py b/modelscope/models/cv/video_depth_estimation/configs/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/configs/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
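With the additional @MODELS.register_module decorator above, the same DamoYolo class now resolves for both Tasks.image_object_detection and Tasks.domain_specific_object_detection. A minimal sketch of how such a registration is typically consumed through the ModelScope pipeline API follows; the model id and image path are illustrative placeholders, not values taken from this patch.

```python
# Illustrative sketch only: the model id and image path are placeholders;
# any DAMO-YOLO checkpoint hosted on ModelScope would work the same way.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

detector = pipeline(
    Tasks.image_object_detection,  # domain_specific_object_detection also maps to DamoYolo now
    model='damo/cv_tinynas_object-detection_damoyolo')  # hypothetical model id

result = detector('path/to/test_image.jpg')  # placeholder image path
print(result)  # detection output carrying boxes, scores and class ids
```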
diff --git a/modelscope/models/cv/video_depth_estimation/configs/default_config.py b/modelscope/models/cv/video_depth_estimation/configs/default_config.py new file mode 100644 index 00000000..5634a2d2 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/configs/default_config.py @@ -0,0 +1,216 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +"""Default dro_sfm configuration parameters (overridable in configs/*.yaml) +""" + +import os + +from yacs.config import CfgNode as CN + +######################################################################################################################## +cfg = CN() +cfg.name = '' # Run name +cfg.debug = False # Debugging flag +######################################################################################################################## +# ARCH +######################################################################################################################## +cfg.arch = CN() +cfg.arch.seed = 42 # Random seed for Pytorch/Numpy initialization +cfg.arch.min_epochs = 1 # Minimum number of epochs +cfg.arch.max_epochs = 50 # Maximum number of epochs +######################################################################################################################## +# CHECKPOINT +######################################################################################################################## +cfg.checkpoint = CN() +cfg.checkpoint.filepath = './results/mdoel' # Checkpoint filepath to save data +cfg.checkpoint.save_top_k = 5 # Number of best models to save +cfg.checkpoint.monitor = 'abs_rel_pp_gt' # Metric to monitor for logging +cfg.checkpoint.monitor_index = 0 # Dataset index for the metric to monitor +cfg.checkpoint.mode = 'auto' # Automatically determine direction of improvement (increase or decrease) +cfg.checkpoint.s3_path = '' # s3 path for AWS model syncing +cfg.checkpoint.s3_frequency = 1 # How often to s3 sync +######################################################################################################################## +# SAVE +######################################################################################################################## +cfg.save = CN() +cfg.save.folder = './results' # Folder where data will be saved +cfg.save.depth = CN() +cfg.save.depth.rgb = True # Flag for saving rgb images +cfg.save.depth.viz = True # Flag for saving inverse depth map visualization +cfg.save.depth.npz = True # Flag for saving numpy depth maps +cfg.save.depth.png = True # Flag for saving png depth maps +######################################################################################################################## +# WANDB +######################################################################################################################## +cfg.wandb = CN() +cfg.wandb.dry_run = True # Wandb dry-run (not logging) +cfg.wandb.name = '' # Wandb run name +cfg.wandb.project = os.environ.get('WANDB_PROJECT', '') # Wandb project +cfg.wandb.entity = os.environ.get('WANDB_ENTITY', '') # Wandb entity +cfg.wandb.tags = [] # Wandb tags +cfg.wandb.dir = '' # Wandb save folder +######################################################################################################################## +# MODEL +######################################################################################################################## +cfg.model = CN() +cfg.model.name = '' # Training model 
+cfg.model.checkpoint_path = '' # Checkpoint path for model saving +######################################################################################################################## +# MODEL.OPTIMIZER +######################################################################################################################## +cfg.model.optimizer = CN() +cfg.model.optimizer.name = 'Adam' # Optimizer name +cfg.model.optimizer.depth = CN() +cfg.model.optimizer.depth.lr = 0.0002 # Depth learning rate +cfg.model.optimizer.depth.weight_decay = 0.0 # Dept weight decay +cfg.model.optimizer.pose = CN() +cfg.model.optimizer.pose.lr = 0.0002 # Pose learning rate +cfg.model.optimizer.pose.weight_decay = 0.0 # Pose weight decay +cfg.model.optimizer.momentum = 0.9 +######################################################################################################################## +# MODEL.SCHEDULER +######################################################################################################################## +cfg.model.scheduler = CN() +cfg.model.scheduler.name = 'StepLR' # Scheduler name +cfg.model.scheduler.step_size = 10 # Scheduler step size +cfg.model.scheduler.gamma = 0.5 # Scheduler gamma value +cfg.model.scheduler.T_max = 20 # Scheduler maximum number of iterations +cfg.model.scheduler.eta_min = 1e-7 +cfg.model.scheduler.milestones = [10, 15, 20, 25, 30, 35, 40, 45] +######################################################################################################################## +# MODEL.PARAMS +######################################################################################################################## +cfg.model.params = CN() +cfg.model.params.crop = '' # Which crop should be used during evaluation +cfg.model.params.min_depth = 0.0 # Minimum depth value to evaluate +cfg.model.params.max_depth = 80.0 # Maximum depth value to evaluate +######################################################################################################################## +# MODEL.LOSS +######################################################################################################################## +cfg.model.loss = CN() +# +cfg.model.loss.num_scales = 4 # Number of inverse depth scales to use +cfg.model.loss.progressive_scaling = 0.0 # Training percentage to decay number of scales +cfg.model.loss.flip_lr_prob = 0.5 # Probablity of horizontal flippping +cfg.model.loss.rotation_mode = 'euler' # Rotation mode +cfg.model.loss.upsample_depth_maps = True # Resize depth maps to highest resolution +# +cfg.model.loss.ssim_loss_weight = 0.85 # SSIM loss weight +cfg.model.loss.occ_reg_weight = 0.1 # Occlusion regularizer loss weight +cfg.model.loss.smooth_loss_weight = 0.001 # Smoothness loss weight +cfg.model.loss.C1 = 1e-4 # SSIM parameter +cfg.model.loss.C2 = 9e-4 # SSIM parameter +cfg.model.loss.photometric_reduce_op = 'min' # Method for photometric loss reducing +cfg.model.loss.disp_norm = True # Inverse depth normalization +cfg.model.loss.clip_loss = 0.0 # Clip loss threshold variance +cfg.model.loss.padding_mode = 'zeros' # Photometric loss padding mode +cfg.model.loss.automask_loss = True # Automasking to remove static pixels +# +cfg.model.loss.velocity_loss_weight = 0.1 # Velocity supervision loss weight +# +cfg.model.loss.supervised_method = 'sparse-l1' # Method for depth supervision +cfg.model.loss.supervised_num_scales = 4 # Number of scales for supervised learning +cfg.model.loss.supervised_loss_weight = 0.9 # Supervised loss weight 
+######################################################################################################################## +# MODEL.DEPTH_NET +######################################################################################################################## +cfg.model.depth_net = CN() +cfg.model.depth_net.name = '' # Depth network name +cfg.model.depth_net.checkpoint_path = '' # Depth checkpoint filepath +cfg.model.depth_net.version = '' # Depth network version +cfg.model.depth_net.dropout = 0.0 # Depth network dropout +######################################################################################################################## +# MODEL.POSE_NET +######################################################################################################################## +cfg.model.pose_net = CN() +cfg.model.pose_net.name = '' # Pose network name +cfg.model.pose_net.checkpoint_path = '' # Pose checkpoint filepath +cfg.model.pose_net.version = '' # Pose network version +cfg.model.pose_net.dropout = 0.0 # Pose network dropout +######################################################################################################################## +# MODEL.perpcep_net +######################################################################################################################## +cfg.model.percep_net = CN() +cfg.model.percep_net.name = '' # percep_net network name +cfg.model.percep_net.checkpoint_path = '' # percep_net checkpoint filepath +cfg.model.percep_net.version = '' # percep_net network version +cfg.model.percep_net.dropout = 0.0 # percep_net network dropout +######################################################################################################################## +# DATASETS +######################################################################################################################## +cfg.datasets = CN() +######################################################################################################################## +# DATASETS.AUGMENTATION +######################################################################################################################## +cfg.datasets.augmentation = CN() +cfg.datasets.augmentation.image_shape = (192, 640) # Image shape +cfg.datasets.augmentation.jittering = (0.2, 0.2, 0.2, 0.05 + ) # Color jittering values +######################################################################################################################## +# DATASETS.TRAIN +######################################################################################################################## +cfg.datasets.train = CN() +cfg.datasets.train.batch_size = 8 # Training batch size +cfg.datasets.train.num_workers = 16 # Training number of workers +cfg.datasets.train.back_context = 1 # Training backward context +cfg.datasets.train.forward_context = 1 # Training forward context +cfg.datasets.train.dataset = [] # Training dataset +cfg.datasets.train.path = [] # Training data path +cfg.datasets.train.split = [] # Training split +cfg.datasets.train.depth_type = [''] # Training depth type +cfg.datasets.train.cameras = [ + [] +] # Training cameras (double list, one for each dataset) +cfg.datasets.train.repeat = [ + 1 +] # Number of times training dataset is repeated per epoch +cfg.datasets.train.num_logs = 5 # Number of training images to log +cfg.datasets.train.strides = (1, ) # stride +######################################################################################################################## +# DATASETS.VALIDATION 
+######################################################################################################################## +cfg.datasets.validation = CN() +cfg.datasets.validation.batch_size = 1 # Validation batch size +cfg.datasets.validation.num_workers = 8 # Validation number of workers +cfg.datasets.validation.back_context = 0 # Validation backward context +cfg.datasets.validation.forward_context = 0 # Validation forward contxt +cfg.datasets.validation.dataset = [] # Validation dataset +cfg.datasets.validation.path = [] # Validation data path +cfg.datasets.validation.split = [] # Validation split +cfg.datasets.validation.depth_type = [''] # Validation depth type +cfg.datasets.validation.cameras = [ + [] +] # Validation cameras (double list, one for each dataset) +cfg.datasets.validation.num_logs = 5 # Number of validation images to log +cfg.datasets.validation.strides = (1, ) # stride +######################################################################################################################## +# DATASETS.TEST +######################################################################################################################## +cfg.datasets.test = CN() +cfg.datasets.test.batch_size = 1 # Test batch size +cfg.datasets.test.num_workers = 8 # Test number of workers +cfg.datasets.test.back_context = 0 # Test backward context +cfg.datasets.test.forward_context = 0 # Test forward context +cfg.datasets.test.dataset = [] # Test dataset +cfg.datasets.test.path = [] # Test data path +cfg.datasets.test.split = [] # Test split +cfg.datasets.test.depth_type = [''] # Test depth type +cfg.datasets.test.cameras = [ + [] +] # Test cameras (double list, one for each dataset) +cfg.datasets.test.num_logs = 5 # Number of test images to log +cfg.datasets.test.strides = (1, ) # stride +######################################################################################################################## +# THESE SHOULD NOT BE CHANGED +######################################################################################################################## +cfg.config = '' # Run configuration file +cfg.default = '' # Run default configuration file +cfg.wandb.url = '' # Wandb URL +cfg.checkpoint.s3_url = '' # s3 URL +cfg.save.pretrained = '' # Pretrained checkpoint +cfg.prepared = False # Prepared flag +######################################################################################################################## + + +def get_cfg_defaults(): + return cfg.clone() diff --git a/modelscope/models/cv/video_depth_estimation/dro_model.py b/modelscope/models/cv/video_depth_estimation/dro_model.py new file mode 100644 index 00000000..761b4b71 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/dro_model.py @@ -0,0 +1,189 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
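default_config.py above builds the full dro_sfm configuration tree as a yacs CfgNode, with get_cfg_defaults() handing callers a clone they can override. A minimal sketch using only standard yacs methods (merge_from_list, freeze) is shown below; the override values and the network name are assumptions for illustration, not defaults shipped with this patch.

```python
# Illustrative sketch: override a couple of the yacs defaults defined above.
# Only standard yacs CfgNode methods are used; the values are assumptions.
from modelscope.models.cv.video_depth_estimation.configs.default_config import \
    get_cfg_defaults

cfg = get_cfg_defaults()                 # clone of the default cfg tree
cfg.merge_from_list([
    'datasets.augmentation.image_shape', (384, 640),  # assumed resize target
    'model.depth_net.name', 'DepthResNet',            # hypothetical depth net name
])
cfg.freeze()                             # make the config read-only before use
print(cfg.datasets.augmentation.image_shape)
```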
+import os +import os.path as osp +from glob import glob + +import cv2 +import numpy as np +import torch +import tqdm + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.video_depth_estimation.models.model_wrapper import \ + ModelWrapper +from modelscope.models.cv.video_depth_estimation.utils.augmentations import ( + resize_image, to_tensor) +from modelscope.models.cv.video_depth_estimation.utils.config import \ + parse_test_file +from modelscope.models.cv.video_depth_estimation.utils.depth import ( + inv2depth, viz_inv_depth, write_depth) +from modelscope.models.cv.video_depth_estimation.utils.image import ( + get_intrinsics, load_image, parse_video) +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.video_depth_estimation, + module_name=Models.dro_resnet18_depth_estimation) +class DROEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + + # Parse arguments + config, state_dict = parse_test_file(model_path) + + # If no image shape is provided, use the checkpoint one + self.image_shape = config.datasets.augmentation.image_shape + print(f'== input image shape:{self.image_shape}') + + # Initialize model wrapper from checkpoint arguments + self.model_wrapper = ModelWrapper(config, load_datasets=False) + # Restore monodepth_model state + self.model_wrapper.load_state_dict(state_dict) + + # Send model to GPU if available + if torch.cuda.is_available(): + model_wrapper = self.model_wrapper.to('cuda') + else: + raise RuntimeError('cuda is not available') + + # Set to eval mode + model_wrapper.eval() + + def forward(self, Inputs): + return self.model_wrapper(Inputs) + + def postprocess(self, Inputs): + return Inputs + + def inference(self, data): + + print('processing video input:.........') + input_type = 'video' + sample_rate = 1 + data_type = 'indoor' + + assert osp.splitext(data['video_path'])[1] in [ + '.mp4', '.avi', '.mov', '.mpeg', '.flv', '.wmv' + ] + input_video_images = os.path.join('tmp/input_video_images') + parse_video(data['video_path'], input_video_images, sample_rate) + # update input + input = input_video_images + + files = [] + for ext in ['png', 'jpg', 'bmp']: + files.extend(glob((os.path.join(input, '*.{}'.format(ext))))) + + if input_type == 'folder': + print('processing folder input:...........') + print(f'folder total frames num: {len(files)}') + files = files[::sample_rate] + + files.sort() + print('Found total {} files'.format(len(files))) + assert len(files) > 2 + + # Process each file + list_of_files = list(zip(files[:-2], files[1:-1], files[2:])) + + depth_list = [] + pose_list = [] + vis_depth_list = [] + depth_upsample_list = [] + vis_depth_upsample_list = [] + + print(f'*********************data_type:{data_type}') + print('inference start.....................') + for fn1, fn2, fn3 in tqdm.tqdm(list_of_files): + depth, vis_depth, depth_upsample, vis_depth_upsample, pose21, pose23, intr, rgb = self.infer_and_save_pose( + [fn1, fn3], fn2, self.model_wrapper, self.image_shape, + data_type) + pose_list.append((pose21, pose23)) + depth_list.append(depth) + vis_depth_list.append(vis_depth.astype(np.uint8)) + depth_upsample_list.append(depth_upsample) + 
vis_depth_upsample_list.append(vis_depth_upsample.astype(np.uint8)) + + return { + 'depths': depth_list, + 'depths_color': vis_depth_upsample_list, + 'poses': pose_list + } + + @torch.no_grad() + def infer_and_save_pose(self, input_file_refs, input_file, model_wrapper, + image_shape, data_type): + """ + Process a single input file to produce and save visualization + + Parameters + ---------- + input_file_refs : list(str) + Reference image file paths + input_file : str + Image file for pose estimation + model_wrapper : nn.Module + Model wrapper used for inference + image_shape : Image shape + Input image shape + half: bool + use half precision (fp16) + save: str + Save format (npz or png) + """ + + image_raw_wh = load_image(input_file).size + + # Load image + def process_image(filename): + image = load_image(filename) + # Resize and to tensor + intr = get_intrinsics(image.size, image_shape, data_type) # (3, 3) + image = resize_image(image, image_shape) + image = to_tensor(image).unsqueeze(0) + intr = torch.from_numpy(intr).unsqueeze(0) # (1, 3, 3) + # Send image to GPU if available + if torch.cuda.is_available(): + image = image.to('cuda') + intr = intr.to('cuda') + return image, intr + + image_ref = [ + process_image(input_file_ref)[0] + for input_file_ref in input_file_refs + ] + image, intrinsics = process_image(input_file) + + batch = { + 'rgb': image, + 'rgb_context': image_ref, + 'intrinsics': intrinsics + } + + output = self.forward(batch) + inv_depth = output['inv_depths'][0] # (1, 1, h, w) + depth = inv2depth(inv_depth)[0, 0].detach().cpu().numpy() # (h, w) + + pose21 = output['poses'][0].mat[0].detach().cpu().numpy( + ) # (4, 4) # TODO check: targe -> ref[0] + pose23 = output['poses'][1].mat[0].detach().cpu().numpy( + ) # (4, 4) # TODO check: targe -> ref[0] + + vis_depth = viz_inv_depth(inv_depth[0]) * 255 + + vis_depth_upsample = cv2.resize( + vis_depth, image_raw_wh, interpolation=cv2.INTER_LINEAR) + depth_upsample = cv2.resize( + depth, image_raw_wh, interpolation=cv2.INTER_NEAREST) + + return depth, vis_depth, depth_upsample, vis_depth_upsample, pose21, pose23, intrinsics[ + 0].detach().cpu().numpy(), image[0].permute( + 1, 2, 0).detach().cpu().numpy() * 255 diff --git a/modelscope/models/cv/video_depth_estimation/geometry/__init__.py b/modelscope/models/cv/video_depth_estimation/geometry/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/geometry/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/video_depth_estimation/geometry/camera.py b/modelscope/models/cv/video_depth_estimation/geometry/camera.py new file mode 100644 index 00000000..426a02e9 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/geometry/camera.py @@ -0,0 +1,191 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from functools import lru_cache + +import torch +import torch.nn as nn + +from modelscope.models.cv.video_depth_estimation.geometry.camera_utils import \ + scale_intrinsics +from modelscope.models.cv.video_depth_estimation.geometry.pose import Pose +from modelscope.models.cv.video_depth_estimation.utils.image import image_grid + + +class Camera(nn.Module): + """ + Differentiable camera class implementing reconstruction and projection + functions for a pinhole model. 
+ """ + + def __init__(self, K, Tcw=None): + """ + Initializes the Camera class + + Parameters + ---------- + K : torch.Tensor [3,3] + Camera intrinsics + Tcw : Pose + Camera -> World pose transformation + """ + super().__init__() + self.K = K + self.Tcw = Pose.identity(len(K)) if Tcw is None else Tcw + + def __len__(self): + """Batch size of the camera intrinsics""" + return len(self.K) + + def to(self, *args, **kwargs): + """Moves object to a specific device""" + self.K = self.K.to(*args, **kwargs) + self.Tcw = self.Tcw.to(*args, **kwargs) + return self + + @property + def fx(self): + """Focal length in x""" + return self.K[:, 0, 0] + + @property + def fy(self): + """Focal length in y""" + return self.K[:, 1, 1] + + @property + def cx(self): + """Principal point in x""" + return self.K[:, 0, 2] + + @property + def cy(self): + """Principal point in y""" + return self.K[:, 1, 2] + + @property + @lru_cache() + def Twc(self): + """World -> Camera pose transformation (inverse of Tcw)""" + return self.Tcw.inverse() + + @property + @lru_cache() + def Kinv(self): + """Inverse intrinsics (for lifting)""" + Kinv = self.K.clone() + Kinv[:, 0, 0] = 1. / self.fx + Kinv[:, 1, 1] = 1. / self.fy + Kinv[:, 0, 2] = -1. * self.cx / self.fx + Kinv[:, 1, 2] = -1. * self.cy / self.fy + return Kinv + + def scaled(self, x_scale, y_scale=None): + """ + Returns a scaled version of the camera (changing intrinsics) + + Parameters + ---------- + x_scale : float + Resize scale in x + y_scale : float + Resize scale in y. If None, use the same as x_scale + + Returns + ------- + camera : Camera + Scaled version of the current cmaera + """ + # If single value is provided, use for both dimensions + if y_scale is None: + y_scale = x_scale + # If no scaling is necessary, return same camera + if x_scale == 1. and y_scale == 1.: + return self + # Scale intrinsics and return new camera with same Pose + K = scale_intrinsics(self.K.clone(), x_scale, y_scale) + return Camera(K, Tcw=self.Tcw) + + def reconstruct(self, depth, frame='w'): + """ + Reconstructs pixel-wise 3D points from a depth map. 
+ + Parameters + ---------- + depth : torch.Tensor [B,1,H,W] + Depth map for the camera + frame : 'w' + Reference frame: 'c' for camera and 'w' for world + + Returns + ------- + points : torch.tensor [B,3,H,W] + Pixel-wise 3D points + """ + B, C, H, W = depth.shape + assert C == 1 + + # Create flat index grid + grid = image_grid( + B, H, W, depth.dtype, depth.device, normalized=False) # [B,3,H,W] + flat_grid = grid.view(B, 3, -1) # [B,3,HW] + + # Estimate the outward rays in the camera frame + xnorm = (self.Kinv.bmm(flat_grid)).view(B, 3, H, W) + # Scale rays to metric depth + Xc = xnorm * depth + + # If in camera frame of reference + if frame == 'c': + return Xc + # If in world frame of reference + elif frame == 'w': + return self.Twc @ Xc + # If none of the above + else: + raise ValueError('Unknown reference frame {}'.format(frame)) + + def project(self, X, frame='w', normalize=True): + """ + Projects 3D points onto the image plane + + Parameters + ---------- + X : torch.Tensor [B,3,H,W] + 3D points to be projected + frame : 'w' + Reference frame: 'c' for camera and 'w' for world + + Returns + ------- + points : torch.Tensor [B,H,W,2] + 2D projected points that are within the image boundaries + """ + B, C, H, W = X.shape + assert C == 3 + + # Project 3D points onto the camera image plane + if frame == 'c': + Xc = self.K.bmm(X.view(B, 3, -1)) + elif frame == 'w': + Xc = self.K.bmm((self.Tcw @ X).view(B, 3, -1)) + else: + raise ValueError('Unknown reference frame {}'.format(frame)) + + # Normalize points + X = Xc[:, 0] + Y = Xc[:, 1] + Z = Xc[:, 2].clamp(min=1e-5) + if normalize: + Xnorm = 2 * (X / Z) / (W - 1) - 1. # (-1, 1) + Ynorm = 2 * (Y / Z) / (H - 1) - 1. + else: + Xnorm = X / Z + Ynorm = Y / Z + + # Clamp out-of-bounds pixels + # Xmask = ((Xnorm > 1) + (Xnorm < -1)).detach() + # Xnorm[Xmask] = 2. + # Ymask = ((Ynorm > 1) + (Ynorm < -1)).detach() + # Ynorm[Ymask] = 2. + + # Return pixel coordinates + return torch.stack([Xnorm, Ynorm], dim=-1).view(B, H, W, 2) diff --git a/modelscope/models/cv/video_depth_estimation/geometry/camera_utils.py b/modelscope/models/cv/video_depth_estimation/geometry/camera_utils.py new file mode 100644 index 00000000..08c9d454 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/geometry/camera_utils.py @@ -0,0 +1,71 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn.functional as funct + +######################################################################################################################## + + +def construct_K(fx, fy, cx, cy, dtype=torch.float, device=None): + """Construct a [3,3] camera intrinsics from pinhole parameters""" + return torch.tensor([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], + dtype=dtype, + device=device) + + +def scale_intrinsics(K, x_scale, y_scale): + """Scale intrinsics given x_scale and y_scale factors""" + K[..., 0, 0] *= x_scale + K[..., 1, 1] *= y_scale + K[..., 0, 2] = (K[..., 0, 2] + 0.5) * x_scale - 0.5 + K[..., 1, 2] = (K[..., 1, 2] + 0.5) * y_scale - 0.5 + return K + + +######################################################################################################################## + + +def view_synthesis(ref_image, + depth, + ref_cam, + cam, + mode='bilinear', + padding_mode='zeros'): + """ + Synthesize an image from another plus a depth map. 
+ + Parameters + ---------- + ref_image : torch.Tensor [B,3,H,W] + Reference image to be warped + depth : torch.Tensor [B,1,H,W] + Depth map from the original image + ref_cam : Camera + Camera class for the reference image + cam : Camera + Camera class for the original image + mode : str + Interpolation mode + padding_mode : str + Padding mode for interpolation + + Returns + ------- + ref_warped : torch.Tensor [B,3,H,W] + Warped reference image in the original frame of reference + """ + assert depth.size(1) == 1 + # Reconstruct world points from target_camera + world_points = cam.reconstruct(depth, frame='w') + # Project world points onto reference camera + ref_coords = ref_cam.project(world_points, frame='w') + + # View-synthesis given the projected reference points + return funct.grid_sample( + ref_image, + ref_coords, + mode=mode, + padding_mode=padding_mode, + align_corners=True) + + +######################################################################################################################## diff --git a/modelscope/models/cv/video_depth_estimation/geometry/pose.py b/modelscope/models/cv/video_depth_estimation/geometry/pose.py new file mode 100644 index 00000000..ccb73782 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/geometry/pose.py @@ -0,0 +1,107 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch + +from modelscope.models.cv.video_depth_estimation.geometry.pose_utils import ( + invert_pose, pose_vec2mat) + +######################################################################################################################## + + +class Pose: + """ + Pose class, that encapsulates a [4,4] transformation matrix + for a specific reference frame + """ + + def __init__(self, mat): + """ + Initializes a Pose object. 
+ + Parameters + ---------- + mat : torch.Tensor [B,4,4] + Transformation matrix + """ + assert tuple(mat.shape[-2:]) == (4, 4) + if mat.dim() == 2: + mat = mat.unsqueeze(0) + assert mat.dim() == 3 + self.mat = mat + + def __len__(self): + """Batch size of the transformation matrix""" + return len(self.mat) + +######################################################################################################################## + + @classmethod + def identity(cls, N=1, device=None, dtype=torch.float): + """Initializes as a [4,4] identity matrix""" + return cls(torch.eye(4, device=device, dtype=dtype).repeat([N, 1, 1])) + + @classmethod + def from_vec(cls, vec, mode): + """Initializes from a [B,6] batch vector""" + mat = pose_vec2mat(vec, mode) # [B,3,4] + pose = torch.eye( + 4, device=vec.device, dtype=vec.dtype).repeat([len(vec), 1, 1]) + pose[:, :3, :3] = mat[:, :3, :3] + pose[:, :3, -1] = mat[:, :3, -1] + return cls(pose) + +######################################################################################################################## + + @property + def shape(self): + """Returns the transformation matrix shape""" + return self.mat.shape + + def item(self): + """Returns the transformation matrix""" + return self.mat + + def repeat(self, *args, **kwargs): + """Repeats the transformation matrix multiple times""" + self.mat = self.mat.repeat(*args, **kwargs) + return self + + def inverse(self): + """Returns a new Pose that is the inverse of this one""" + return Pose(invert_pose(self.mat)) + + def to(self, *args, **kwargs): + """Moves object to a specific device""" + self.mat = self.mat.to(*args, **kwargs) + return self + +######################################################################################################################## + + def transform_pose(self, pose): + """Creates a new pose object that compounds this and another one (self * pose)""" + assert tuple(pose.shape[-2:]) == (4, 4) + return Pose(self.mat.bmm(pose.item())) + + def transform_points(self, points): + """Transforms 3D points using this object""" + assert points.shape[1] == 3 + B, _, H, W = points.shape + out = self.mat[:, :3, :3].bmm(points.view(B, 3, -1)) + \ + self.mat[:, :3, -1].unsqueeze(-1) + return out.view(B, 3, H, W) + + def __matmul__(self, other): + """Transforms the input (Pose or 3D points) using this object""" + if isinstance(other, Pose): + return self.transform_pose(other) + elif isinstance(other, torch.Tensor): + if other.shape[1] == 3 and other.dim() > 2: + assert other.dim() == 3 or other.dim() == 4 + return self.transform_points(other) + else: + raise ValueError('Unknown tensor dimensions {}'.format( + other.shape)) + else: + raise NotImplementedError() + + +######################################################################################################################## diff --git a/modelscope/models/cv/video_depth_estimation/geometry/pose_utils.py b/modelscope/models/cv/video_depth_estimation/geometry/pose_utils.py new file mode 100644 index 00000000..d58105db --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/geometry/pose_utils.py @@ -0,0 +1,115 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
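A quick sanity check of the Pose algebra above (a sketch that assumes this PR's modules are importable): composing a pose with its inverse yields the identity transformation, since `invert_pose` transposes the rotation and rotates the negated translation.

```python
import torch

from modelscope.models.cv.video_depth_estimation.geometry.pose import Pose

# Toy 6-DoF vector in the (tx, ty, tz, rx, ry, rz) layout expected by pose_vec2mat;
# the values are illustrative only.
vec = torch.tensor([[0.1, -0.2, 0.3, 0.05, 0.02, -0.01]])
pose = Pose.from_vec(vec, 'euler')

composed = pose @ pose.inverse()        # Pose @ Pose composes transformations
print(torch.allclose(composed.mat, torch.eye(4).unsqueeze(0), atol=1e-6))  # True
```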
+import numpy as np +import torch +from torch._C import dtype + + +def mat2euler(mat): + euler = torch.ones(mat.shape[0], 3, dtype=mat.dtype, device=mat.device) + cy_thresh = 1e-6 + # try: + # cy_thresh = np.finfo(mat.dtype).eps * 4 + # except ValueError: + # cy_thresh = np.finfo(np.float).eps * 4.0 + # print("cy_thresh", cy_thresh) + r11, r12, r13, r21, r22, r23, _, _, r33 = mat[:, 0, 0], mat[:, 0, 1], mat[:, 0, 2], \ + mat[:, 1, 0], mat[:, 1, 1], mat[:, 1, 2], \ + mat[:, 2, 0], mat[:, 2, 1], mat[:, 2, 2] + # cy: sqrt((cos(y)*cos(z))**2 + (cos(x)*cos(y))**2) + cy = torch.sqrt(r33 * r33 + r23 * r23) + + mask = cy > cy_thresh + + if torch.sum(mask) > 1: + euler[mask, 0] = torch.atan2(-r23, r33)[mask] + euler[mask, 1] = torch.atan2(r13, cy)[mask] + euler[mask, 2] = torch.atan2(-r12, r11)[mask] + + mask = cy <= cy_thresh + if torch.sum(mask) > 1: + print('mat2euler!!!!!!') + euler[mask, 0] = 0.0 + euler[mask, 1] = torch.atan2(r13, cy) # atan2(sin(y), cy) + euler[mask, 2] = torch.atan2(r21, r22) + + return euler + + +######################################################################################################################## + + +def euler2mat(angle): + """Convert euler angles to rotation matrix""" + B = angle.size(0) + x, y, z = angle[:, 0], angle[:, 1], angle[:, 2] + + cosz = torch.cos(z) + sinz = torch.sin(z) + + zeros = z.detach() * 0 + ones = zeros.detach() + 1 + zmat = torch.stack( + [cosz, -sinz, zeros, sinz, cosz, zeros, zeros, zeros, ones], + dim=1).view(B, 3, 3) + + cosy = torch.cos(y) + siny = torch.sin(y) + + ymat = torch.stack( + [cosy, zeros, siny, zeros, ones, zeros, -siny, zeros, cosy], + dim=1).view(B, 3, 3) + + cosx = torch.cos(x) + sinx = torch.sin(x) + + xmat = torch.stack( + [ones, zeros, zeros, zeros, cosx, -sinx, zeros, sinx, cosx], + dim=1).view(B, 3, 3) + + rot_mat = xmat.bmm(ymat).bmm(zmat) + return rot_mat + + +######################################################################################################################## + + +def pose_vec2mat(vec, mode='euler'): + """Convert Euler parameters to transformation matrix.""" + if mode is None: + return vec + trans, rot = vec[:, :3].unsqueeze(-1), vec[:, 3:] + if mode == 'euler': + rot_mat = euler2mat(rot) + elif mode == 'axis_angle': + from modelscope.models.cv.video_depth_estimation.geometry.pose_trans import axis_angle_to_matrix + rot_mat = axis_angle_to_matrix(rot) + else: + raise ValueError('Rotation mode not supported {}'.format(mode)) + mat = torch.cat([rot_mat, trans], dim=2) # [B,3,4] + return mat + + +######################################################################################################################## + + +def invert_pose(T): + """Inverts a [B,4,4] torch.tensor pose""" + Tinv = torch.eye(4, device=T.device, dtype=T.dtype).repeat([len(T), 1, 1]) + Tinv[:, :3, :3] = torch.transpose(T[:, :3, :3], -2, -1) + Tinv[:, :3, -1] = torch.bmm(-1. 
* Tinv[:, :3, :3], + T[:, :3, -1].unsqueeze(-1)).squeeze(-1) + return Tinv + + +######################################################################################################################## + + +def invert_pose_numpy(T): + """Inverts a [4,4] np.array pose""" + Tinv = np.copy(T) + R, t = Tinv[:3, :3], Tinv[:3, 3] + Tinv[:3, :3], Tinv[:3, 3] = R.T, -np.matmul(R.T, t) + return Tinv + + +######################################################################################################################## diff --git a/modelscope/models/cv/video_depth_estimation/models/__init__.py b/modelscope/models/cv/video_depth_estimation/models/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/models/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/video_depth_estimation/models/model_checkpoint.py b/modelscope/models/cv/video_depth_estimation/models/model_checkpoint.py new file mode 100644 index 00000000..761990ff --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/models/model_checkpoint.py @@ -0,0 +1,144 @@ +# The implementation is adopted from Pytorch-Lightning +# https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/callbacks/model_checkpoint.py + +import os +import re + +import numpy as np +import torch + + +def save_code(filepath): + """Save code in the models folder""" + os.system('tar cfz {}/code.tar.gz *'.format(filepath)) + + +class ModelCheckpoint: + + def __init__(self, + filepath=None, + monitor='val_loss', + save_top_k=1, + mode='auto', + period=1, + s3_path='', + s3_frequency=5): + super().__init__() + # If save_top_k is zero, save all models + if save_top_k == 0: + save_top_k = 1e6 + # Create checkpoint folder + self.dirpath, self.filename = os.path.split(filepath) + print(self.dirpath, self.filename, filepath) + os.makedirs(self.dirpath, exist_ok=True) + # Store arguments + self.monitor = monitor + self.save_top_k = save_top_k + self.period = period + self.epoch_last_check = None + self.best_k_models = {} + self.kth_best_model = '' + self.best = 0 + # Monitoring modes + torch_inf = torch.tensor(np.Inf) + mode_dict = { + 'min': (torch_inf, 'min'), + 'max': (-torch_inf, 'max'), + 'auto': (-torch_inf, 'max') if 'acc' in self.monitor + or 'a1' in self.monitor or self.monitor.startswith('fmeasure') else + (torch_inf, 'min'), + } + self.kth_value, self.mode = mode_dict[mode] + + self.s3_path = s3_path + self.s3_frequency = s3_frequency + self.s3_enabled = (s3_path != '') and (s3_frequency > 0) + self.save_code = True + + @staticmethod + def _del_model(filepath): + if os.path.isfile(filepath): + os.remove(filepath) + + def _save_model(self, filepath, model): + # Create folder, save model and sync to s3 + os.makedirs(os.path.dirname(filepath), exist_ok=True) + torch.save( + { + 'config': model.config, + 'epoch': model.current_epoch, + 'state_dict': model.state_dict(), + 'optimizer': model.optimizer.state_dict(), + 'scheduler': model.scheduler.state_dict(), + }, filepath) + + def check_monitor_top_k(self, current): + # If we don't have enough models + if len(self.best_k_models) < self.save_top_k: + return True + # Convert to torch if necessary + if not isinstance(current, torch.Tensor): + current = torch.tensor(current) + # Get monitoring operation + monitor_op = { + 'min': torch.lt, + 'max': torch.gt, + }[self.mode] + # Compare and return + return monitor_op(current, self.best_k_models[self.kth_best_model]) + + 
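To make the monitoring logic above concrete, here is a tiny sketch of the decision `check_monitor_top_k` makes: in 'min' mode (e.g. when tracking a loss-like metric) a new value only displaces the current k-th best checkpoint if it is strictly lower. The metric values are hypothetical.

```python
import torch

monitor_op = {'min': torch.lt, 'max': torch.gt}['min']   # monitoring a loss-like metric
kth_best = torch.tensor(0.25)                            # worst value among the kept top-k
print(monitor_op(torch.tensor(0.21), kth_best))          # tensor(True)  -> checkpoint is saved
print(monitor_op(torch.tensor(0.30), kth_best))          # tensor(False) -> checkpoint is skipped
```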
def format_checkpoint_name(self, epoch, metrics): + metrics['epoch'] = epoch + filename = self.filename + for tmp in re.findall(r'(\{.*?)[:\}]', self.filename): + name = tmp[1:] + filename = filename.replace(tmp, name + '={' + name) + if name not in metrics: + metrics[name] = 0 + filename = filename.format(**metrics) + return os.path.join(self.dirpath, '{}.ckpt'.format(filename)) + + def check_and_save(self, model, metrics): + # Check saving interval + epoch = model.current_epoch + if self.epoch_last_check is not None and \ + (epoch - self.epoch_last_check) < self.period: + return + self.epoch_last_check = epoch + # Prepare filepath + filepath = self.format_checkpoint_name(epoch, metrics) + while os.path.isfile(filepath): + filepath = self.format_checkpoint_name(epoch, metrics) + # Check if saving or not + if self.save_top_k != -1: + current = metrics.get(self.monitor) + assert current, 'Checkpoint metric is not available' + if self.check_monitor_top_k(current): + self._do_check_save(filepath, model, current) + else: + self._save_model(filepath, model) + + def _do_check_save(self, filepath, model, current): + # List of models to delete + del_list = [] + if len(self.best_k_models) == self.save_top_k and self.save_top_k > 0: + delpath = self.kth_best_model + self.best_k_models.pop(self.kth_best_model) + del_list.append(delpath) + # Monitor current models + self.best_k_models[filepath] = current + if len(self.best_k_models) == self.save_top_k: + # Monitor dict has reached k elements + _op = max if self.mode == 'min' else min + self.kth_best_model = _op( + self.best_k_models, key=self.best_k_models.get) + self.kth_value = self.best_k_models[self.kth_best_model] + # Determine best model + _op = min if self.mode == 'min' else max + self.best = _op(self.best_k_models.values()) + # Delete old models + for cur_path in del_list: + if cur_path != filepath: + self._del_model(cur_path) + # Save model + self._save_model(filepath, model) diff --git a/modelscope/models/cv/video_depth_estimation/models/model_utils.py b/modelscope/models/cv/video_depth_estimation/models/model_utils.py new file mode 100644 index 00000000..3c839a51 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/models/model_utils.py @@ -0,0 +1,70 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +from modelscope.models.cv.video_depth_estimation.utils.types import (is_list, + is_numpy, + is_tensor) + + +def merge_outputs(*outputs): + """ + Merges model outputs for logging + + Parameters + ---------- + outputs : tuple of dict + Outputs to be merged + + Returns + ------- + output : dict + Dictionary with a "metrics" key containing a dictionary with various metrics and + all other keys that are not "loss" (it is handled differently). 
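The template expansion performed by `format_checkpoint_name` can be reproduced standalone. The sketch below uses a hypothetical filename template together with the `abs_rel` metric from this wrapper's metric keys, and shows how `{epoch:02d}` becomes `epoch=07` in the final checkpoint name (in the class, metrics missing from the dict default to 0).

```python
import re

filename = 'model-{epoch:02d}-{abs_rel:.3f}'      # hypothetical template
metrics = {'epoch': 7, 'abs_rel': 0.052}

for tmp in re.findall(r'(\{.*?)[:\}]', filename):
    name = tmp[1:]
    filename = filename.replace(tmp, name + '={' + name)
print(filename.format(**metrics) + '.ckpt')       # model-epoch=07-abs_rel=0.052.ckpt
```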
+ """ + ignore = ['loss'] # Keys to ignore + combine = ['metrics'] # Keys to combine + merge = {key: {} for key in combine} + for output in outputs: + # Iterate over all keys + for key, val in output.items(): + # Combine these keys + if key in combine: + for sub_key, sub_val in output[key].items(): + assert sub_key not in merge[key].keys(), \ + 'Combining duplicated key {} to {}'.format(sub_key, key) + merge[key][sub_key] = sub_val + # Ignore these keys + elif key not in ignore: + assert key not in merge.keys(), \ + 'Adding duplicated key {}'.format(key) + merge[key] = val + return merge + + +def stack_batch(batch): + """ + Stack multi-camera batches (B,N,C,H,W becomes BN,C,H,W) + + Parameters + ---------- + batch : dict + Batch + + Returns + ------- + batch : dict + Stacked batch + """ + # If there is multi-camera information + if len(batch['rgb'].shape) == 5: + assert batch['rgb'].shape[ + 0] == 1, 'Only batch size 1 is supported for multi-cameras' + # Loop over all keys + for key in batch.keys(): + # If list, stack every item + if is_list(batch[key]): + if is_tensor(batch[key][0]) or is_numpy(batch[key][0]): + batch[key] = [sample[0] for sample in batch[key]] + # Else, stack single item + else: + batch[key] = batch[key][0] + return batch diff --git a/modelscope/models/cv/video_depth_estimation/models/model_wrapper.py b/modelscope/models/cv/video_depth_estimation/models/model_wrapper.py new file mode 100644 index 00000000..c274c48d --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/models/model_wrapper.py @@ -0,0 +1,319 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import importlib +import random +from collections import OrderedDict + +import numpy as np +import torch + +from modelscope.models.cv.video_depth_estimation.utils.load import ( + filter_args, load_class, load_class_args_create, load_network) +from modelscope.models.cv.video_depth_estimation.utils.misc import pcolor + + +class ModelWrapper(torch.nn.Module): + """ + Top-level torch.nn.Module wrapper around a SfmModel (pose+depth networks). + Designed to use models with high-level Trainer classes (cf. trainers/). + + Parameters + ---------- + config : CfgNode + Model configuration (cf. configs/default_config.py) + """ + + def __init__(self, config, resume=None, logger=None, load_datasets=True): + super().__init__() + + # Store configuration, checkpoint and logger + self.config = config + self.logger = logger + self.resume = resume + + # Set random seed + set_random_seed(config.arch.seed) + + # Task metrics + self.metrics_name = 'depth' + self.metrics_keys = ('abs_rel', 'sqr_rel', 'rmse', 'rmse_log', 'a1', + 'a2', 'a3', 'SILog', 'l1_inv', 'rot_ang', 't_ang', + 't_cm') + self.metrics_modes = ('', '_pp', '_gt', '_pp_gt') + + # Model, optimizers, schedulers and datasets are None for now + self.model = self.optimizer = self.scheduler = None + self.train_dataset = self.validation_dataset = self.test_dataset = None + self.current_epoch = 0 + + # Prepare model + self.prepare_model(resume) + + # Preparations done + self.config.prepared = True + + def prepare_model(self, resume=None): + """Prepare self.model (incl. 
loading previous state)""" + print0(pcolor('### Preparing Model', 'green')) + self.model = setup_model(self.config.model, self.config.prepared) + # Resume model if available + if resume: + print0( + pcolor( + '### Resuming from {}'.format(resume['file']), + 'magenta', + attrs=['bold'])) + self.model = load_network(self.model, resume['state_dict'], + 'model') + if 'epoch' in resume: + self.current_epoch = resume['epoch'] + + @property + def depth_net(self): + """Returns depth network.""" + return self.model.depth_net + + @property + def pose_net(self): + """Returns pose network.""" + return self.model.pose_net + + @property + def percep_net(self): + """Returns perceptual network.""" + return self.model.percep_net + + @property + def logs(self): + """Returns various logs for tracking.""" + params = OrderedDict() + for param in self.optimizer.param_groups: + params['{}_learning_rate'.format( + param['name'].lower())] = param['lr'] + params['progress'] = self.progress + return { + **params, + **self.model.logs, + } + + @property + def progress(self): + """Returns training progress (current epoch / max. number of epochs)""" + return self.current_epoch / self.config.arch.max_epochs + + def configure_optimizers(self): + """Configure depth and pose optimizers and the corresponding scheduler.""" + + params = [] + # Load optimizer + optimizer = getattr(torch.optim, self.config.model.optimizer.name) + # Depth optimizer + if self.depth_net is not None: + params.append({ + 'name': + 'Depth', + 'params': + self.depth_net.parameters(), + **filter_args(optimizer, self.config.model.optimizer.depth) + }) + # Pose optimizer + if self.pose_net is not None: + params.append({ + 'name': + 'Pose', + 'params': [ + param for param in self.pose_net.parameters() + if param.requires_grad + ], + **filter_args(optimizer, self.config.model.optimizer.pose) + }) + # Create optimizer with parameters + optimizer = optimizer(params) + + # Load and initialize scheduler + scheduler = getattr(torch.optim.lr_scheduler, + self.config.model.scheduler.name) + scheduler = scheduler( + optimizer, **filter_args(scheduler, self.config.model.scheduler)) + + # Create class variables so we can use it internally + self.optimizer = optimizer + self.scheduler = scheduler + + # Return optimizer and scheduler + return optimizer, scheduler + + def forward(self, *args, **kwargs): + """Runs the model and returns the output.""" + assert self.model is not None, 'Model not defined' + return self.model(*args, **kwargs) + + def depth(self, *args, **kwargs): + """Runs the pose network and returns the output.""" + assert self.depth_net is not None, 'Depth network not defined' + return self.depth_net(*args, **kwargs) + + def pose(self, *args, **kwargs): + """Runs the depth network and returns the output.""" + assert self.pose_net is not None, 'Pose network not defined' + return self.pose_net(*args, **kwargs) + + def percep(self, *args, **kwargs): + """Runs the depth network and returns the output.""" + assert self.percep_net is not None, 'Perceptual network not defined' + return self.percep_net(*args, **kwargs) + + +def set_random_seed(seed): + if seed >= 0: + np.random.seed(seed) + random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def setup_depth_net(config, prepared, **kwargs): + """ + Create a depth network + + Parameters + ---------- + config : CfgNode + Network configuration + prepared : bool + True if the network has been prepared before + kwargs : dict + Extra parameters for the network + + Returns + ------- + 
depth_net : nn.Module + Create depth network + """ + print0(pcolor('DepthNet: %s' % config.name, 'yellow')) + if config.name == 'DepthPoseNet': + model_class = getattr( + importlib.import_module( + 'modelscope.models.cv.video_depth_estimation.networks.depth_pose.depth_pose_net' + ), 'DepthPoseNet') + depth_net = model_class(**{**config, **kwargs}) + if not prepared and config.checkpoint_path != '': + depth_net = load_network(depth_net, config.checkpoint_path, + ['depth_net', 'disp_network']) + return depth_net + + +def setup_pose_net(config, prepared, **kwargs): + """ + Create a pose network + + Parameters + ---------- + config : CfgNode + Network configuration + prepared : bool + True if the network has been prepared before + kwargs : dict + Extra parameters for the network + + Returns + ------- + pose_net : nn.Module + Created pose network + """ + print0(pcolor('PoseNet: %s' % config.name, 'yellow')) + pose_net = load_class_args_create( + config.name, + paths=[ + 'modelscope.models.cv.video_depth_estimation.networks.pose', + ], + args={ + **config, + **kwargs + }, + ) + if not prepared and config.checkpoint_path != '': + pose_net = load_network(pose_net, config.checkpoint_path, + ['pose_net', 'pose_network']) + return pose_net + + +def setup_percep_net(config, prepared, **kwargs): + """ + Create a perceputal network + + Parameters + ---------- + config : CfgNode + Network configuration + prepared : bool + True if the network has been prepared before + kwargs : dict + Extra parameters for the network + + Returns + ------- + depth_net : nn.Module + Create depth network + """ + print0(pcolor('PercepNet: %s' % config.name, 'yellow')) + percep_net = load_class_args_create( + config.name, + paths=[ + 'modelscope.models.cv.video_depth_estimation.networks.layers', + ], + args={ + **config, + **kwargs + }, + ) + return percep_net + + +def setup_model(config, prepared, **kwargs): + """ + Create a model + + Parameters + ---------- + config : CfgNode + Model configuration (cf. 
configs/default_config.py) + prepared : bool + True if the model has been prepared before + kwargs : dict + Extra parameters for the model + + Returns + ------- + model : nn.Module + Created model + """ + print0(pcolor('Model: %s' % config.name, 'yellow')) + config.loss.min_depth = config.params.min_depth + config.loss.max_depth = config.params.max_depth + if config.name == 'SupModelMF': + model_class = getattr( + importlib.import_module( + 'modelscope.models.cv.video_depth_estimation.models.sup_model_mf' + ), 'SupModelMF') + model = model_class(**{**config.loss, **kwargs}) + # Add depth network if required + if model.network_requirements['depth_net']: + config.depth_net.max_depth = config.params.max_depth + config.depth_net.min_depth = config.params.min_depth + model.add_depth_net(setup_depth_net(config.depth_net, prepared)) + # Add pose network if required + if model.network_requirements['pose_net']: + model.add_pose_net(setup_pose_net(config.pose_net, prepared)) + # Add percep_net if required + if model.network_requirements['percep_net']: + model.add_percep_net(setup_percep_net(config.percep_net, prepared)) + # If a checkpoint is provided, load pretrained model + if not prepared and config.checkpoint_path != '': + model = load_network(model, config.checkpoint_path, 'model') + # Return model + return model + + +def print0(string='\n'): + print(string) diff --git a/modelscope/models/cv/video_depth_estimation/models/sfm_model_mf.py b/modelscope/models/cv/video_depth_estimation/models/sfm_model_mf.py new file mode 100644 index 00000000..bab11196 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/models/sfm_model_mf.py @@ -0,0 +1,200 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import random + +import torch.nn as nn + +from modelscope.models.cv.video_depth_estimation.geometry.pose import Pose +from modelscope.models.cv.video_depth_estimation.utils.image import \ + flip_lr as flip_lr_img +from modelscope.models.cv.video_depth_estimation.utils.image import ( + flip_lr_intr, flip_mf_model, interpolate_scales) +from modelscope.models.cv.video_depth_estimation.utils.misc import make_list + + +class SfmModelMF(nn.Module): + """ + Model class encapsulating a pose and depth networks. 
+ + Parameters + ---------- + depth_net : nn.Module + Depth network to be used + pose_net : nn.Module + Pose network to be used + rotation_mode : str + Rotation mode for the pose network + flip_lr_prob : float + Probability of flipping when using the depth network + upsample_depth_maps : bool + True if depth map scales are upsampled to highest resolution + kwargs : dict + Extra parameters + """ + + def __init__(self, + depth_net=None, + pose_net=None, + rotation_mode='euler', + flip_lr_prob=0.0, + upsample_depth_maps=False, + min_depth=0.1, + max_depth=100, + **kwargs): + super().__init__() + self.depth_net = depth_net + self.pose_net = pose_net + self.rotation_mode = rotation_mode + self.flip_lr_prob = flip_lr_prob + self.upsample_depth_maps = upsample_depth_maps + self.min_depth = min_depth + self.max_depth = max_depth + self._logs = {} + self._losses = {} + self._network_requirements = { + 'depth_net': True, # Depth network required + 'pose_net': False, # Pose network required + 'percep_net': False, # Pose network required + } + self._train_requirements = { + 'gt_depth': False, # No ground-truth depth required + 'gt_pose': False, # No ground-truth pose required + } + + @property + def logs(self): + """Return logs.""" + return self._logs + + @property + def losses(self): + """Return metrics.""" + return self._losses + + def add_loss(self, key, val): + """Add a new loss to the dictionary and detaches it.""" + self._losses[key] = val.detach() + + @property + def network_requirements(self): + """ + Networks required to run the model + + Returns + ------- + requirements : dict + depth_net : bool + Whether a depth network is required by the model + pose_net : bool + Whether a depth network is required by the model + """ + return self._network_requirements + + @property + def train_requirements(self): + """ + Information required by the model at training stage + + Returns + ------- + requirements : dict + gt_depth : bool + Whether ground truth depth is required by the model at training time + gt_pose : bool + Whether ground truth pose is required by the model at training time + """ + return self._train_requirements + + def add_depth_net(self, depth_net): + """Add a depth network to the model""" + self.depth_net = depth_net + + def add_pose_net(self, pose_net): + """Add a pose network to the model""" + self.pose_net = pose_net + + def compute_inv_depths(self, image, ref_imgs, intrinsics): + """Computes inverse depth maps from single images""" + # Randomly flip and estimate inverse depth maps + flip_lr = random.random( + ) < self.flip_lr_prob if self.training else False + if flip_lr: + intrinsics = flip_lr_intr(intrinsics, width=image.shape[3]) + inv_depths_with_poses = flip_mf_model(self.depth_net, image, ref_imgs, + intrinsics, flip_lr) + inv_depths, poses = inv_depths_with_poses + inv_depths = make_list(inv_depths) + if flip_lr: + inv_depths = [flip_lr_img(inv_d) for inv_d in inv_depths] + # If upsampling depth maps + if self.upsample_depth_maps: + inv_depths = interpolate_scales( + inv_depths, mode='nearest', align_corners=None) + # Return inverse depth maps + return inv_depths, poses + + def compute_poses(self, image, contexts, intrinsics, depth): + """Compute poses from image and a sequence of context images""" + pose_vec = self.pose_net(image, contexts, intrinsics, depth) + if pose_vec is None: + return None + if pose_vec.shape[2] == 6: + return [ + Pose.from_vec(pose_vec[:, i], self.rotation_mode) + for i in range(pose_vec.shape[1]) + ] + else: + return [Pose(pose_vec[:, i]) for i in 
range(pose_vec.shape[1])] + + def forward(self, batch, return_logs=False): + """ + Processes a batch. + + Parameters + ---------- + batch : dict + Input batch + return_logs : bool + True if logs are stored + + Returns + ------- + output : dict + Dictionary containing predicted inverse depth maps and poses + """ + # Generate inverse depth predictions + inv_depths, pose_vec = self.compute_inv_depths(batch['rgb'], + batch['rgb_context'], + batch['intrinsics']) + # # Generate pose predictions if available + # pose = None + # if 'rgb_context' in batch and self.pose_net is not None: + # pose = self.compute_poses(batch['rgb'], + # batch['rgb_context'], batch["intrinsics"], inv2depth(inv_depths[0])) + # Return output dictionary + if pose_vec.shape[2] == 6: + poses = [ + Pose.from_vec(pose_vec[:, i], self.rotation_mode) + for i in range(pose_vec.shape[1]) + ] + elif (pose_vec.shape[2]) == 4 and (pose_vec.shape[3] == 4): + poses = [Pose(pose_vec[:, i]) for i in range(pose_vec.shape[1])] + else: + # pose_vec shape: (b, n_view, n_iter, 6) + poses = [] + for i in range(pose_vec.shape[1]): + poses_view = [] + for j in range(pose_vec.shape[2]): + poses_view.append( + Pose.from_vec(pose_vec[:, i, j], self.rotation_mode)) + poses.append( + poses_view + ) # ([pose_view1, pose_view2, ....]) each view has n_iter pose + + # print(poses[0][-1].shape, len(poses), len(poses[0]), len(inv_depths), inv_depths[0].shape) + # print(poses[0][-1].mat[0], inv2depth(inv_depths)[-1][0, 0, 12, 40]) + # print("gt", batch["pose_context"][0][0]) + return { + 'inv_depths': inv_depths, + 'poses': poses, + } diff --git a/modelscope/models/cv/video_depth_estimation/models/sup_model_mf.py b/modelscope/models/cv/video_depth_estimation/models/sup_model_mf.py new file mode 100644 index 00000000..ba013dcb --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/models/sup_model_mf.py @@ -0,0 +1,134 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +from modelscope.models.cv.video_depth_estimation.models.model_utils import \ + merge_outputs +from modelscope.models.cv.video_depth_estimation.models.sfm_model_mf import \ + SfmModelMF +from modelscope.models.cv.video_depth_estimation.utils.depth import depth2inv + + +class SupModelMF(SfmModelMF): + """ + Model that inherits a depth and pose network from SfmModel and + includes the photometric loss for self-supervised training. + + Parameters + ---------- + kwargs : dict + Extra parameters + """ + + def __init__(self, **kwargs): + # Initializes SfmModel + super().__init__(**kwargs) + # Initializes the photometric loss + + self._network_requirements = { + 'depth_net': True, # Depth network required + 'pose_net': False, # Pose network required + 'percep_net': False, # Pose network required + } + + self._train_requirements = { + 'gt_depth': True, # No ground-truth depth required + 'gt_pose': True, # No ground-truth pose required + } + + # self._photometric_loss = MultiViewPhotometricLoss(**kwargs) + # self._loss = SupervisedDepthPoseLoss(**kwargs) + + @property + def logs(self): + """Return logs.""" + return {**super().logs, **self._photometric_loss.logs} + + def supervised_loss(self, + image, + ref_images, + inv_depths, + gt_depth, + gt_poses, + poses, + intrinsics, + return_logs=False, + progress=0.0): + """ + Calculates the self-supervised photometric loss. 
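The `(B, n_view, n_iter, 6)` pose layout handled in `SfmModelMF.forward` above is easy to miss; the following sketch (assuming this PR's modules are importable, with a toy all-zero tensor, which maps to the identity pose under the 'euler' parameterization) shows how it unpacks into one list of per-iteration poses per context view.

```python
import torch

from modelscope.models.cv.video_depth_estimation.geometry.pose import Pose

# Toy pose tensor: B=1 sample, 2 context views, 3 refinement iterations.
pose_vec = torch.zeros(1, 2, 3, 6)

poses = [[Pose.from_vec(pose_vec[:, i, j], 'euler')
          for j in range(pose_vec.shape[2])]
         for i in range(pose_vec.shape[1])]
print(len(poses), len(poses[0]), poses[0][0].shape)   # 2 3 torch.Size([1, 4, 4])
```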
+ + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Original image + ref_images : list of torch.Tensor [B,3,H,W] + Reference images from context + inv_depths : torch.Tensor [B,1,H,W] + Predicted inverse depth maps from the original image + poses : list of Pose + List containing predicted poses between original and context images + intrinsics : torch.Tensor [B,3,3] + Camera intrinsics + return_logs : bool + True if logs are stored + progress : + Training progress percentage + + Returns + ------- + output : dict + Dictionary containing a "loss" scalar a "metrics" dictionary + """ + return self._loss( + image, + ref_images, + inv_depths, + depth2inv(gt_depth), + gt_poses, + intrinsics, + intrinsics, + poses, + return_logs=return_logs, + progress=progress) + + def forward(self, batch, return_logs=False, progress=0.0): + """ + Processes a batch. + + Parameters + ---------- + batch : dict + Input batch + return_logs : bool + True if logs are stored + progress : + Training progress percentage + + Returns + ------- + output : dict + Dictionary containing a "loss" scalar and different metrics and predictions + for logging and downstream usage. + """ + # Calculate predicted depth and pose output + output = super().forward(batch, return_logs=return_logs) + if not self.training: + # If not training, no need for self-supervised loss + return output + else: + if output['poses'] is None: + return None + # Otherwise, calculate self-supervised loss + self_sup_output = self.supervised_loss( + batch['rgb_original'], + batch['rgb_context_original'], + output['inv_depths'], + batch['depth'], + batch['pose_context'], + output['poses'], + batch['intrinsics'], + return_logs=return_logs, + progress=progress) + # Return loss and metrics + return { + 'loss': self_sup_output['loss'], + **merge_outputs(output, self_sup_output), + } diff --git a/modelscope/models/cv/video_depth_estimation/networks/__init__.py b/modelscope/models/cv/video_depth_estimation/networks/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/video_depth_estimation/networks/depth_pose/__init__.py b/modelscope/models/cv/video_depth_estimation/networks/depth_pose/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/depth_pose/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
diff --git a/modelscope/models/cv/video_depth_estimation/networks/depth_pose/depth_pose_net.py b/modelscope/models/cv/video_depth_estimation/networks/depth_pose/depth_pose_net.py new file mode 100644 index 00000000..c1542ddc --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/depth_pose/depth_pose_net.py @@ -0,0 +1,266 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.video_depth_estimation.geometry.camera import ( + Camera, Pose) +from modelscope.models.cv.video_depth_estimation.networks.layers.resnet.layers import \ + disp_to_depth +from modelscope.models.cv.video_depth_estimation.networks.optim.extractor import \ + ResNetEncoder +from modelscope.models.cv.video_depth_estimation.networks.optim.update import ( + BasicUpdateBlockDepth, BasicUpdateBlockPose, DepthHead, PoseHead, + UpMaskNet) +from modelscope.models.cv.video_depth_estimation.utils.depth import inv2depth + + +class DepthPoseNet(nn.Module): + + def __init__(self, version=None, min_depth=0.1, max_depth=100, **kwargs): + super().__init__() + self.min_depth = min_depth + self.max_depth = max_depth + assert 'it' in version + self.iters = int(version.split('-')[0].split('it')[1]) + self.is_high = 'h' in version + self.out_normalize = 'out' in version + # get seq len in one stage. default: 4. + self.seq_len = 4 + for str in version.split('-'): + if 'seq' in str: + self.seq_len = int(str.split('seq')[1]) + # update iters + self.iters = self.iters // self.seq_len + # intermediate supervision + self.inter_sup = 'inter' in version + + print( + f'=======iters:{self.iters}, sub_seq_len:{self.seq_len}, inter_sup: {self.inter_sup}, ' + f'is_high:{self.is_high}, out_norm:{self.out_normalize}, ' + f'max_depth:{self.max_depth} min_depth:{self.min_depth}========') + + if self.out_normalize: + self.scale_inv_depth = partial( + disp_to_depth, + min_depth=self.min_depth, + max_depth=self.max_depth) + else: + self.scale_inv_depth = lambda x: (x, None) # identity + + # feature network, context network, and update block + self.foutput_dim = 128 + self.feat_ratio = 8 + self.fnet = ResNetEncoder( + out_chs=self.foutput_dim, stride=self.feat_ratio) + + self.depth_head = DepthHead( + input_dim=self.foutput_dim, + hidden_dim=self.foutput_dim, + scale=False) + self.pose_head = PoseHead( + input_dim=self.foutput_dim * 2, hidden_dim=self.foutput_dim) + self.upmask_net = UpMaskNet( + hidden_dim=self.foutput_dim, ratio=self.feat_ratio) + + self.hdim = 128 if self.is_high else 64 + self.cdim = 32 + + self.update_block_depth = BasicUpdateBlockDepth( + hidden_dim=self.hdim, + cost_dim=self.foutput_dim, + ratio=self.feat_ratio, + context_dim=self.cdim) + self.update_block_pose = BasicUpdateBlockPose( + hidden_dim=self.hdim, + cost_dim=self.foutput_dim, + context_dim=self.cdim) + + self.cnet = ResNetEncoder( + out_chs=self.foutput_dim, stride=self.feat_ratio) + self.cnet_depth = ResNetEncoder( + out_chs=self.hdim + self.cdim, + stride=self.feat_ratio, + num_input_images=1) + self.cnet_pose = ResNetEncoder( + out_chs=self.hdim + self.cdim, + stride=self.feat_ratio, + num_input_images=2) + + def upsample_depth(self, depth, mask, ratio=8): + """ Upsample depth field [H/ratio, W/ratio, 2] -> [H, W, 2] using convex combination """ + N, _, H, W = depth.shape + mask = mask.view(N, 1, 9, ratio, ratio, H, W) + 
mask = torch.softmax(mask, dim=2) + + up_flow = F.unfold(depth, [3, 3], padding=1) + up_flow = up_flow.view(N, 1, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, 1, ratio * H, ratio * W) + + def get_cost_each(self, pose, fmap, fmap_ref, depth, K, ref_K, + scale_factor): + """ + depth: (b, 1, h, w) + fmap, fmap_ref: (b, c, h, w) + """ + pose = Pose.from_vec(pose, 'euler') + + device = depth.device + cam = Camera(K=K.float()).scaled(scale_factor).to( + device) # tcw = Identity + ref_cam = Camera( + K=ref_K.float(), Tcw=pose).scaled(scale_factor).to(device) + + # Reconstruct world points from target_camera + world_points = cam.reconstruct(depth, frame='w') + # Project world points onto reference camera + ref_coords = ref_cam.project( + world_points, frame='w', normalize=True) # (b, h, w,2) + + fmap_warped = F.grid_sample( + fmap_ref, + ref_coords, + mode='bilinear', + padding_mode='zeros', + align_corners=True) # (b, c, h, w) + + cost = (fmap - fmap_warped)**2 + + return cost + + def depth_cost_calc(self, inv_depth, fmap, fmaps_ref, pose_list, K, ref_K, + scale_factor): + cost_list = [] + for pose, fmap_r in zip(pose_list, fmaps_ref): + cost = self.get_cost_each(pose, fmap, fmap_r, inv2depth(inv_depth), + K, ref_K, scale_factor) + cost_list.append(cost) # (b, c,h, w) + # cost = torch.stack(cost_list, dim=1).min(dim=1)[0] + cost = torch.stack(cost_list, dim=1).mean(dim=1) + return cost + + def forward(self, target_image, ref_imgs, intrinsics): + """ Estimate inv depth and poses """ + # run the feature network + fmaps = self.fnet(torch.cat([target_image] + ref_imgs, dim=0)) + fmaps = torch.split( + fmaps, [target_image.shape[0]] * (1 + len(ref_imgs)), dim=0) + fmap1, fmaps_ref = fmaps[0], fmaps[1:] + assert target_image.shape[2] / fmap1.shape[2] == self.feat_ratio + + # initial pose + pose_list_init = [] + for fmap_ref in fmaps_ref: + pose_list_init.append( + self.pose_head(torch.cat([fmap1, fmap_ref], dim=1))) + + # initial depth + inv_depth_init = self.depth_head(fmap1, act_fn=F.sigmoid) + up_mask = self.upmask_net(fmap1) + inv_depth_up_init = self.upsample_depth( + inv_depth_init, up_mask, ratio=self.feat_ratio) + + inv_depth_predictions = [self.scale_inv_depth(inv_depth_up_init)[0]] + pose_predictions = [[pose.clone() for pose in pose_list_init]] + + # run the context network for optimization + if self.iters > 0: + cnet_depth = self.cnet_depth(target_image) + hidden_d, inp_d = torch.split( + cnet_depth, [self.hdim, self.cdim], dim=1) + hidden_d = torch.tanh(hidden_d) + inp_d = torch.relu(inp_d) + + img_pairs = [] + for ref_img in ref_imgs: + img_pairs.append(torch.cat([target_image, ref_img], dim=1)) + cnet_pose_list = self.cnet_pose(img_pairs) + hidden_p_list, inp_p_list = [], [] + for cnet_pose in cnet_pose_list: + hidden_p, inp_p = torch.split( + cnet_pose, [self.hdim, self.cdim], dim=1) + hidden_p_list.append(torch.tanh(hidden_p)) + inp_p_list.append(torch.relu(inp_p)) + + # optimization start................. 
+ pose_list = pose_list_init + inv_depth = inv_depth_init + inv_depth_up = None + for itr in range(self.iters): + inv_depth = inv_depth.detach() + pose_list = [pose.detach() for pose in pose_list] + + # calc cost + pose_cost_func_list = [] + for fmap_ref in fmaps_ref: + pose_cost_func_list.append( + partial( + self.get_cost_each, + fmap=fmap1, + fmap_ref=fmap_ref, + depth=inv2depth(self.scale_inv_depth(inv_depth)[0]), + K=intrinsics, + ref_K=intrinsics, + scale_factor=1.0 / self.feat_ratio)) + + depth_cost_func = partial( + self.depth_cost_calc, + fmap=fmap1, + fmaps_ref=fmaps_ref, + pose_list=pose_list, + K=intrinsics, + ref_K=intrinsics, + scale_factor=1.0 / self.feat_ratio) + + # ######## update depth ########## + hidden_d, up_mask_seqs, inv_depth_seqs = self.update_block_depth( + hidden_d, + depth_cost_func, + inv_depth, + inp_d, + seq_len=self.seq_len, + scale_func=self.scale_inv_depth) + + if not self.inter_sup: + up_mask_seqs, inv_depth_seqs = [up_mask_seqs[-1] + ], [inv_depth_seqs[-1]] + # upsample predictions + for up_mask_i, inv_depth_i in zip(up_mask_seqs, inv_depth_seqs): + inv_depth_up = self.upsample_depth( + inv_depth_i, up_mask_i, ratio=self.feat_ratio) + inv_depth_predictions.append( + self.scale_inv_depth(inv_depth_up)[0]) + inv_depth = inv_depth_seqs[-1] + + # ######## update pose ########### + pose_list_seqs = [None] * len(pose_list) + for i, (pose, + hidden_p) in enumerate(zip(pose_list, hidden_p_list)): + hidden_p, pose_seqs = self.update_block_pose( + hidden_p, + pose_cost_func_list[i], + pose, + inp_p_list[i], + seq_len=self.seq_len) + hidden_p_list[i] = hidden_p + if not self.inter_sup: + pose_seqs = [pose_seqs[-1]] + pose_list_seqs[i] = pose_seqs + + for pose_list_i in zip(*pose_list_seqs): + pose_predictions.append([pose.clone() for pose in pose_list_i]) + + pose_list = list(zip(*pose_list_seqs))[-1] + + if not self.training: + return inv_depth_predictions[-1], \ + torch.stack(pose_predictions[-1], dim=1).view(target_image.shape[0], len(ref_imgs), 6) # (b, n, 6) + + return inv_depth_predictions, \ + torch.stack([torch.stack(poses_ref, dim=1) for poses_ref in pose_predictions], dim=2) # (b, n, iters, 6) diff --git a/modelscope/models/cv/video_depth_estimation/networks/layers/__init__.py b/modelscope/models/cv/video_depth_estimation/networks/layers/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/layers/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/__init__.py b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
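The low-resolution inverse-depth maps above are upsampled with RAFT-style convex upsampling (`upsample_depth`). The standalone sketch below mirrors that operation and checks one of its defining properties: because each output pixel is a softmax-weighted (convex) combination of a 3x3 neighbourhood, a constant map stays constant away from the zero-padded border.

```python
import torch
import torch.nn.functional as F


def convex_upsample(depth, mask, ratio=8):
    """Mirror of DepthPoseNet.upsample_depth (RAFT-style convex upsampling)."""
    N, _, H, W = depth.shape
    weights = torch.softmax(mask.view(N, 1, 9, ratio, ratio, H, W), dim=2)
    patches = F.unfold(depth, [3, 3], padding=1).view(N, 1, 9, 1, 1, H, W)
    up = torch.sum(weights * patches, dim=2)       # convex combination per output pixel
    up = up.permute(0, 1, 4, 2, 5, 3)              # -> (N, 1, H, ratio, W, ratio)
    return up.reshape(N, 1, ratio * H, ratio * W)


d = torch.full((1, 1, 6, 8), 2.0)                  # constant low-resolution depth
m = torch.randn(1, 9 * 8 * 8, 6, 8)                # random (untrained) upsampling mask
up = convex_upsample(d, m)
print(up.shape)                                    # torch.Size([1, 1, 48, 64])
print(torch.allclose(up[..., 8:-8, 8:-8], torch.tensor(2.0)))  # True for interior pixels
```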
diff --git a/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/depth_decoder.py b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/depth_decoder.py new file mode 100644 index 00000000..05cbf833 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/depth_decoder.py @@ -0,0 +1,265 @@ +# Adapted from monodepth2 +# https://github.com/nianticlabs/monodepth2/blob/master/networks/depth_decoder.py + +from __future__ import absolute_import, division, print_function +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn + +from .layers import Conv3x3, ConvBlock, upsample + + +class DepthDecoder(nn.Module): + + def __init__(self, + num_ch_enc, + scales=range(4), + num_output_channels=1, + use_skips=True): + super(DepthDecoder, self).__init__() + + self.num_output_channels = num_output_channels + self.use_skips = use_skips + self.upsample_mode = 'nearest' + self.scales = scales + + self.num_ch_enc = num_ch_enc + self.num_ch_dec = np.array([16, 32, 64, 128, 256]) + + # decoder + self.convs = OrderedDict() + for i in range(4, -1, -1): + # upconv_0 + num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + + 1] + num_ch_out = self.num_ch_dec[i] + self.convs[('upconv', i, 0)] = ConvBlock(num_ch_in, num_ch_out) + + # upconv_1 + num_ch_in = self.num_ch_dec[i] + if self.use_skips and i > 0: + num_ch_in += self.num_ch_enc[i - 1] + num_ch_out = self.num_ch_dec[i] + self.convs[('upconv', i, 1)] = ConvBlock(num_ch_in, num_ch_out) + + for s in self.scales: + self.convs[('dispconv', s)] = Conv3x3(self.num_ch_dec[s], + self.num_output_channels) + + self.decoder = nn.ModuleList(list(self.convs.values())) + self.sigmoid = nn.Sigmoid() + + def forward(self, input_features): + self.outputs = {} + + # decoder + x = input_features[-1] + for i in range(4, -1, -1): + x = self.convs[('upconv', i, 0)](x) + x = [upsample(x)] + if self.use_skips and i > 0: + x += [input_features[i - 1]] + x = torch.cat(x, 1) + x = self.convs[('upconv', i, 1)](x) + self.outputs[('feat', i)] = x + if i in self.scales: + self.outputs[('disp', + i)] = self.sigmoid(self.convs[('dispconv', + i)](x)) + + return self.outputs + + +class DepthDecoderShare(nn.Module): + + def __init__(self, + num_ch_enc, + scales=range(4), + num_output_channels=1, + stride=8, + use_skips=True, + num_ch_dec=[16, 32, 64, 128, 256]): + super(DepthDecoderShare, self).__init__() + + self.num_output_channels = num_output_channels + self.use_skips = use_skips + self.upsample_mode = 'nearest' + self.scales = scales + + self.num_ch_enc = num_ch_enc + # self.num_ch_dec = np.array([16, 32, 64, 128, 256]) #(s1:16, s2:32, s4:64, s8:128, s16:256) + self.num_ch_dec = num_ch_dec + + self.stride = stride + # (4:s16, 3:s8, 2:s4, 1:s2, 0:s1) + if self.stride == 8: + self.scale_idx = 3 + elif self.stride == 4: + self.scale_idx = 2 + else: + raise NotImplementedError + + # decoder + self.convs = OrderedDict() + for i in range(4, self.scale_idx - 1, -1): + # upconv_0 + num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + + 1] + num_ch_out = self.num_ch_dec[i] + self.convs[('upconv', i, 0)] = ConvBlock(num_ch_in, num_ch_out) + + # upconv_1 + num_ch_in = self.num_ch_dec[i] + if self.use_skips and i > 0: + num_ch_in += self.num_ch_enc[i - 1] + num_ch_out = self.num_ch_dec[i] + self.convs[('upconv', i, 1)] = ConvBlock(num_ch_in, num_ch_out) + + self.decoder = nn.ModuleList(list(self.convs.values())) + + def forward(self, input_features): + self.outputs = {} + + # 
decoder + x = input_features[-1] + self.outputs[('feat', -1)] = x + for i in range(4, self.scale_idx - 1, -1): + x = self.convs[('upconv', i, 0)](x) + x = [upsample(x)] + if self.use_skips and i > 0: + x += [input_features[i - 1]] + x = torch.cat(x, 1) + x = self.convs[('upconv', i, 1)](x) + self.outputs[('feat', i)] = x + + return self.outputs + + +class DepthDecoderShareFeat(nn.Module): + + def __init__(self, + num_ch_enc, + scales=range(4), + num_output_channels=1, + stride=8, + use_skips=True, + num_ch_dec=[16, 32, 64, 128, 256]): + super().__init__() + + self.num_output_channels = num_output_channels + self.use_skips = use_skips + self.upsample_mode = 'nearest' + self.scales = scales + + self.num_ch_enc = num_ch_enc + # self.num_ch_dec = np.array([16, 32, 64, 128, 256]) #(s1:16, s2:32, s4:64, s8:128, s16:256) + self.num_ch_dec = num_ch_dec + + self.stride = stride + # (4:s16, 3:s8, 2:s4, 1:s2, 0:s1) + if self.stride == 8: + self.scale_idx = 3 + elif self.stride == 4: + self.scale_idx = 2 + else: + raise NotImplementedError + + # decoder + self.convs = OrderedDict() + for i in range(4, self.scale_idx - 1, -1): + # upconv_0 + num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + + 1] + num_ch_out = self.num_ch_dec[i] + self.convs[('upconv', i, 0)] = nn.Conv2d(num_ch_in, num_ch_out, 3, + 1, 1) + + # upconv_1 + num_ch_in = self.num_ch_dec[i] + if self.use_skips and i > 0: + num_ch_in += self.num_ch_enc[i - 1] + num_ch_out = self.num_ch_dec[i] + self.convs[('upconv', i, 1)] = nn.Conv2d(num_ch_in, num_ch_out, 3, + 1, 1) + + self.decoder = nn.ModuleList(list(self.convs.values())) + + def forward(self, input_features): + self.outputs = {} + + # decoder + x = input_features[-1] + self.outputs[('feat', -1)] = x + for i in range(4, self.scale_idx - 1, -1): + x = self.convs[('upconv', i, 0)](x) + x = [upsample(x)] + if self.use_skips and i > 0: + x += [input_features[i - 1]] + x = torch.cat(x, 1) + x = self.convs[('upconv', i, 1)](x) + self.outputs[('feat', i)] = x + + return self.outputs + + +class UnetDecoder(nn.Module): + + def __init__(self, + num_ch_enc, + num_output_channels=1, + stride=8, + out_chs=128, + use_skips=True): + super(UnetDecoder, self).__init__() + + self.num_output_channels = num_output_channels + self.use_skips = use_skips + self.upsample_mode = 'nearest' + + self.num_ch_enc = num_ch_enc + self.num_ch_dec = np.array([16, 32, 64, 128, 256]) + + self.stride = stride + # (4:s16, 3:s8, 2:s4, 1:s2, 0:s1) + if self.stride == 8: + self.scale_idx = 3 + elif self.stride == 4: + self.scale_idx = 2 + else: + raise NotImplementedError + # decoder + self.convs = OrderedDict() + for i in range(4, self.scale_idx - 1, -1): + # upconv_0 + num_ch_in = self.num_ch_enc[-1] if i == 4 else self.num_ch_dec[i + + 1] + num_ch_out = self.num_ch_dec[i] + self.convs[('upconv', i, 0)] = ConvBlock(num_ch_in, num_ch_out) + + # upconv_1 + num_ch_in = self.num_ch_dec[i] + if self.use_skips and i > 0: + num_ch_in += self.num_ch_enc[i - 1] + num_ch_out = self.num_ch_dec[i] if i != self.scale_idx else out_chs + + self.convs[('upconv', i, 1)] = ConvBlock(num_ch_in, num_ch_out) + + self.decoder = nn.ModuleList(list(self.convs.values())) + + def forward(self, input_features): + self.outputs = {} + + # decoder + x = input_features[-1] + for i in range(4, self.scale_idx - 1, -1): + x = self.convs[('upconv', i, 0)](x) + x = [upsample(x)] + if self.use_skips and i > 0: + x += [input_features[i - 1]] + x = torch.cat(x, 1) + x = self.convs[('upconv', i, 1)](x) + self.outputs[('feat', i)] = x + + return 
self.outputs diff --git a/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/layers.py b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/layers.py new file mode 100644 index 00000000..22acb067 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/layers.py @@ -0,0 +1,60 @@ +# Adapted from monodepth2 +# https://github.com/nianticlabs/monodepth2/blob/master/layers.py + +from __future__ import absolute_import, division, print_function + +import torch.nn as nn +import torch.nn.functional as F + + +def disp_to_depth(disp, min_depth, max_depth): + """Convert network's sigmoid output into depth prediction + The formula for this conversion is given in the 'additional considerations' + section of the paper. + """ + min_disp = 1 / max_depth + max_disp = 1 / min_depth + scaled_disp = min_disp + (max_disp - min_disp) * disp + depth = 1 / scaled_disp + return scaled_disp, depth + + +class ConvBlock(nn.Module): + """Layer to perform a convolution followed by ELU + """ + + def __init__(self, in_channels, out_channels): + super(ConvBlock, self).__init__() + + self.conv = Conv3x3(in_channels, out_channels) + self.nonlin = nn.ELU(inplace=True) + + def forward(self, x): + out = self.conv(x) + out = self.nonlin(out) + return out + + +class Conv3x3(nn.Module): + """Layer to pad and convolve input + """ + + def __init__(self, in_channels, out_channels, use_refl=True): + super(Conv3x3, self).__init__() + + if use_refl: + self.pad = nn.ReflectionPad2d(1) + else: + self.pad = nn.ZeroPad2d(1) + self.conv = nn.Conv2d(int(in_channels), int(out_channels), 3) + + def forward(self, x): + out = self.pad(x) + out = self.conv(out) + return out + + +def upsample(x): + """Upsample input tensor by a factor of 2 + """ + return F.interpolate(x, scale_factor=2, mode='nearest') diff --git a/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/pose_decoder.py b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/pose_decoder.py new file mode 100644 index 00000000..0c2bbf82 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/pose_decoder.py @@ -0,0 +1,59 @@ +# Adapted from monodepth2 +# https://github.com/nianticlabs/monodepth2/blob/master/networks/pose_decoder.py + +from __future__ import absolute_import, division, print_function +from collections import OrderedDict + +import torch +import torch.nn as nn + + +class PoseDecoder(nn.Module): + + def __init__(self, + num_ch_enc, + num_input_features, + num_frames_to_predict_for=None, + stride=1): + super(PoseDecoder, self).__init__() + + self.num_ch_enc = num_ch_enc + self.num_input_features = num_input_features + + if num_frames_to_predict_for is None: + num_frames_to_predict_for = num_input_features - 1 + self.num_frames_to_predict_for = num_frames_to_predict_for + + self.convs = OrderedDict() + self.convs[('squeeze')] = nn.Conv2d(self.num_ch_enc[-1], 256, 1) + self.convs[('pose', 0)] = nn.Conv2d(num_input_features * 256, 256, 3, + stride, 1) + self.convs[('pose', 1)] = nn.Conv2d(256, 256, 3, stride, 1) + self.convs[('pose', 2)] = nn.Conv2d(256, 6 * num_frames_to_predict_for, + 1) + + self.relu = nn.ReLU() + + self.net = nn.ModuleList(list(self.convs.values())) + + def forward(self, input_features): + last_features = [f[-1] for f in input_features] + + cat_features = [ + self.relu(self.convs['squeeze'](f)) for f in last_features + ] + cat_features = torch.cat(cat_features, 1) + + out = cat_features + for i in range(3): + out = self.convs[('pose', 
i)](out) + if i != 2: + out = self.relu(out) + out = out.mean(3).mean(2) + + out = 0.01 * out.view(-1, self.num_frames_to_predict_for, 1, 6) + + axisangle = out[..., :3] + translation = out[..., 3:] + + return axisangle, translation diff --git a/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/resnet_encoder.py b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/resnet_encoder.py new file mode 100644 index 00000000..b81ca17f --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/layers/resnet/resnet_encoder.py @@ -0,0 +1,111 @@ +# Adapted from monodepth2 +# https://github.com/nianticlabs/monodepth2/blob/master/networks/resnet_encoder.py + +from __future__ import absolute_import, division, print_function + +import numpy as np +import torch +import torch.nn as nn +import torch.utils.model_zoo as model_zoo +import torchvision.models as models + + +class ResNetMultiImageInput(models.ResNet): + """Constructs a resnet model with varying number of input images. + Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py + """ + + def __init__(self, block, layers, num_classes=1000, num_input_images=1): + super(ResNetMultiImageInput, self).__init__(block, layers) + self.inplanes = 64 + self.conv1 = nn.Conv2d( + num_input_images * 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + +def resnet_multiimage_input(num_layers, pretrained=False, num_input_images=1): + """Constructs a ResNet model. + Args: + num_layers (int): Number of resnet layers. 
Must be 18 or 50 + pretrained (bool): If True, returns a model pre-trained on ImageNet + num_input_images (int): Number of frames stacked as input + """ + assert num_layers in [18, 50], 'Can only run with 18 or 50 layer resnet' + blocks = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] + block_type = { + 18: models.resnet.BasicBlock, + 50: models.resnet.Bottleneck + }[num_layers] + model = ResNetMultiImageInput( + block_type, blocks, num_input_images=num_input_images) + + if pretrained: + loaded = model_zoo.load_url( + models.resnet.model_urls['resnet{}'.format(num_layers)]) + loaded['conv1.weight'] = torch.cat( + [loaded['conv1.weight']] * num_input_images, 1) / num_input_images + model.load_state_dict(loaded) + return model + + +class ResnetEncoder(nn.Module): + """Pytorch module for a resnet encoder + """ + + def __init__(self, num_layers, pretrained, num_input_images=1): + super(ResnetEncoder, self).__init__() + self.num_ch_enc = np.array([64, 64, 128, 256, 512]) + + resnets = { + 18: models.resnet18, + 34: models.resnet34, + 50: models.resnet50, + 101: models.resnet101, + 152: models.resnet152 + } + + if num_layers not in resnets: + raise ValueError( + '{} is not a valid number of resnet layers'.format(num_layers)) + + if num_input_images > 1: + self.encoder = resnet_multiimage_input(num_layers, pretrained, + num_input_images) + else: + self.encoder = resnets[num_layers](pretrained) + + if num_layers > 34: + self.num_ch_enc[1:] *= 4 + + def forward(self, input_image): + self.features = [] + x = (input_image - 0.45) / 0.225 + x = self.encoder.conv1(x) + x = self.encoder.bn1(x) + self.features.append(self.encoder.relu(x)) + self.features.append( + self.encoder.layer1(self.encoder.maxpool(self.features[-1]))) + self.features.append(self.encoder.layer2(self.features[-1])) + self.features.append(self.encoder.layer3(self.features[-1])) + self.features.append(self.encoder.layer4(self.features[-1])) + + return self.features diff --git a/modelscope/models/cv/video_depth_estimation/networks/optim/__init__.py b/modelscope/models/cv/video_depth_estimation/networks/optim/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/optim/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/video_depth_estimation/networks/optim/extractor.py b/modelscope/models/cv/video_depth_estimation/networks/optim/extractor.py new file mode 100644 index 00000000..abb0882b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/optim/extractor.py @@ -0,0 +1,429 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.model_zoo as model_zoo +import torchvision.models as models + + +class ResNetEncoder(models.ResNet): + """Constructs a resnet model with varying number of input images. 
+ Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py + """ + + def __init__(self, + num_layers=18, + num_input_images=1, + pretrained=False, + out_chs=32, + stride=8): + layers = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] + block = { + 18: models.resnet.BasicBlock, + 50: models.resnet.Bottleneck + }[num_layers] + self.upsample_mode = 'bilinear' + super(ResNetEncoder, self).__init__(block, layers) + + self.inplanes = 64 + self.conv1 = nn.Conv2d( + num_input_images * 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + + self.stride = stride + if stride == 8: + self.upconv1 = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv1_fusion = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.out_conv = nn.Conv2d(128, out_chs, 3, 1, padding=1) + + elif stride == 4: + self.upconv1 = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv1_fusion = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv2 = nn.Sequential( + nn.Conv2d(128, 64, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv2_fusion = nn.Sequential( + nn.Conv2d(128, 64, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.out_conv = nn.Conv2d(64, out_chs, 3, 1, padding=1) + + else: + raise NotImplementedError + + # self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + # del self.layer3 + del self.layer4 + del self.fc + del self.avgpool + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if pretrained: + loaded = model_zoo.load_url( + models.resnet.model_urls['resnet{}'.format(num_layers)]) + loaded['conv1.weight'] = torch.cat( + [loaded['conv1.weight']] * num_input_images, + 1) / num_input_images + loaded_flilter = { + k: v + for k, v in loaded.items() + if 'layer4' not in k and 'fc' not in k + } + try: + print('load pretrained model from:', + models.resnet.model_urls['resnet{}'.format(num_layers)]) + self.load_state_dict(loaded_flilter) + except Exception as e: + print(e) + self.load_state_dict(loaded_flilter, strict=False) + + def forward(self, x): + feats = {} + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + num = len(x) + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + feats['s4'] = x + x = self.layer2(x) + feats['s8'] = x + x = self.layer3(x) + + if self.stride == 8: + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv1(x) + x = self.upconv1_fusion(torch.cat([x, feats['s8']], dim=1)) + x = self.out_conv(x) + + elif self.stride == 4: + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv1(x) + x = self.upconv1_fusion(torch.cat([x, feats['s8']], dim=1)) + + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv2(x) + x = self.upconv2_fusion(torch.cat([x, 
feats['s4']], dim=1)) + + x = self.out_conv(x) + + if is_list: + x = torch.split(x, [batch_dim] * num, dim=0) + + return x + + +######################################################################################################################## +class ResNetEncoderV1(models.ResNet): + """Constructs a resnet model with varying number of input images. + Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py + """ + + def __init__(self, + num_layers=18, + num_input_images=1, + pretrained=True, + out_chs=32, + stride=8): + layers = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] + block = { + 18: models.resnet.BasicBlock, + 50: models.resnet.Bottleneck + }[num_layers] + self.upsample_mode = 'nearest' + super().__init__(block, layers) + + self.inplanes = 64 + self.conv1 = nn.Conv2d( + num_input_images * 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + + self.stride = stride + if stride == 8: + self.upconv1 = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv1_fusion = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.out_conv = nn.Conv2d(128, out_chs, 3, 1, padding=1) + + elif stride == 4: + self.upconv1 = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv1_fusion = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv2 = nn.Sequential( + nn.Conv2d(128, 64, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv2_fusion = nn.Sequential( + nn.Conv2d(128, 64, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.out_conv = nn.Conv2d(64, out_chs, 3, 1, padding=1) + + else: + raise NotImplementedError + + # self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + # del self.layer3 + del self.layer4 + del self.fc + del self.avgpool + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if pretrained: + loaded = model_zoo.load_url( + models.resnet.model_urls['resnet{}'.format(num_layers)]) + loaded['conv1.weight'] = torch.cat( + [loaded['conv1.weight']] * num_input_images, + 1) / num_input_images + loaded_flilter = { + k: v + for k, v in loaded.items() + if 'layer4' not in k and 'fc' not in k + } + try: + print('load pretrained model from:', + models.resnet.model_urls['resnet{}'.format(num_layers)]) + self.load_state_dict(loaded_flilter) + except Exception as e: + print(e) + self.load_state_dict(loaded_flilter, strict=False) + + def forward(self, x): + feats = {} + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + num = len(x) + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + feats['s4'] = x + x = self.layer2(x) + feats['s8'] = x + x = self.layer3(x) + + if self.stride == 8: + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv1(x) + x = self.upconv1_fusion(torch.cat([x, feats['s8']], dim=1)) 
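# Illustrative sketch (not part of the patch): how the ResNetEncoder feature
# extractor defined above in extractor.py might be exercised. The input size,
# batch size and out_chs are arbitrary; with stride=4 the features come out at
# 1/4 of the input resolution, and a list of frames is batched internally and
# split back into one tensor per frame on return.
import torch

def _demo_feature_extractor():
    enc = ResNetEncoder(num_layers=18, pretrained=False, out_chs=32, stride=4)
    frames = [torch.randn(2, 3, 192, 640) for _ in range(3)]   # three frames, batch of 2
    feats = enc(frames)                  # list in -> tuple of per-frame feature maps out
    assert len(feats) == 3
    assert feats[0].shape == (2, 32, 192 // 4, 640 // 4)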
+ x = self.out_conv(x) + + elif self.stride == 4: + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv1(x) + x = self.upconv1_fusion(torch.cat([x, feats['s8']], dim=1)) + + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv2(x) + x = self.upconv2_fusion(torch.cat([x, feats['s4']], dim=1)) + + x = self.out_conv(x) + + if is_list: + x = torch.split(x, [batch_dim] * num, dim=0) + + return x + + +######################################################################################################################## +class ResNetEncoderV2(models.ResNet): + """Constructs a resnet model with varying number of input images. + Adapted from https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py + """ + + def __init__(self, + num_layers=18, + num_input_images=1, + pretrained=True, + out_chs=32, + stride=8): + layers = {18: [2, 2, 2, 2], 50: [3, 4, 6, 3]}[num_layers] + block = { + 18: models.resnet.BasicBlock, + 50: models.resnet.Bottleneck + }[num_layers] + self.upsample_mode = 'bilinear' + super().__init__(block, layers) + + self.inplanes = 64 + self.conv1 = nn.Conv2d( + num_input_images * 3, + 64, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + # del self.layer4 + self.upconv0 = nn.Sequential( + nn.Conv2d(512, 256, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv0_fusion = nn.Sequential( + nn.Conv2d(512, 256, 3, 1, padding=1), nn.ReLU(inplace=True)) + + self.stride = stride + if stride == 8: + self.upconv1 = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv1_fusion = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.out_conv = nn.Conv2d(128, out_chs, 3, 1, padding=1) + + elif stride == 4: + self.upconv1 = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv1_fusion = nn.Sequential( + nn.Conv2d(256, 128, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv2 = nn.Sequential( + nn.Conv2d(128, 64, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.upconv2_fusion = nn.Sequential( + nn.Conv2d(128, 64, 3, 1, padding=1), nn.ReLU(inplace=True)) + self.out_conv = nn.Conv2d(64, out_chs, 3, 1, padding=1) + + else: + raise NotImplementedError + + del self.fc + del self.avgpool + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if pretrained: + loaded = model_zoo.load_url( + models.resnet.model_urls['resnet{}'.format(num_layers)]) + loaded['conv1.weight'] = torch.cat( + [loaded['conv1.weight']] * num_input_images, + 1) / num_input_images + loaded_flilter = { + k: v + for k, v in loaded.items() + if 'layer4' not in k and 'fc' not in k + } + try: + print('load pretrained model from:', + models.resnet.model_urls['resnet{}'.format(num_layers)]) + self.load_state_dict(loaded_flilter) + except Exception as e: + print(e) + self.load_state_dict(loaded_flilter, strict=False) + + def forward(self, x): + feats = {} + # if input is list, combine batch 
dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + num = len(x) + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + feats['s4'] = x + x = self.layer2(x) + feats['s8'] = x + x = self.layer3(x) + + feats['s16'] = x + x = self.layer4(x) + x = self.upconv0(x) + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv0_fusion(torch.cat([x, feats['s16']], dim=1)) + + if self.stride == 8: + x = self.upconv1(x) + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv1_fusion(torch.cat([x, feats['s8']], dim=1)) + x = self.out_conv(x) + + elif self.stride == 4: + x = self.upconv1(x) + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv1_fusion(torch.cat([x, feats['s8']], dim=1)) + + x = self.upconv2(x) + x = F.interpolate(x, scale_factor=2, mode=self.upsample_mode) + x = self.upconv2_fusion(torch.cat([x, feats['s4']], dim=1)) + + x = self.out_conv(x) + + if is_list: + x = torch.split(x, [batch_dim] * num, dim=0) + + return x + + +################################################################################ +class FeatBlock(nn.Module): + + def __init__(self, planes=128, out_dim=128): + super().__init__() + self.conv1 = nn.Conv2d(planes, planes, 3, padding=1) + self.conv2 = nn.Conv2d(planes, out_dim, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.relu(self.conv1(self.relu(x))) + x = self.conv2(x) + return x diff --git a/modelscope/models/cv/video_depth_estimation/networks/optim/update.py b/modelscope/models/cv/video_depth_estimation/networks/optim/update.py new file mode 100644 index 00000000..fc179f84 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/networks/optim/update.py @@ -0,0 +1,234 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DepthHead(nn.Module): + + def __init__(self, input_dim=256, hidden_dim=128, scale=False): + super(DepthHead, self).__init__() + self.scale = scale + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 1, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x_d, act_fn=F.tanh): + out = self.conv2(self.relu(self.conv1(x_d))) + return act_fn(out) + + +class PoseHead(nn.Module): + + def __init__(self, input_dim=256, hidden_dim=128): + super(PoseHead, self).__init__() + + self.conv1_pose = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2_pose = nn.Conv2d(hidden_dim, 6, 3, padding=1) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x_p): + out = self.conv2_pose(self.relu(self.conv1_pose(x_p))).mean(3).mean(2) + return torch.cat([out[:, :3], 0.01 * out[:, 3:]], dim=1) + + +class ConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convr = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convq = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + + h = (1 - z) * 
h + z * q + return h + + +class SepConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convr1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convq1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + + self.convz2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convr2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convq2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + + def forward(self, h, x): + # horizontal + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + return h + + +class ProjectionInputDepth(nn.Module): + + def __init__(self, cost_dim, hidden_dim, out_chs): + super().__init__() + self.out_chs = out_chs + self.convc1 = nn.Conv2d(cost_dim, hidden_dim, 1, padding=0) + self.convc2 = nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1) + + self.convd1 = nn.Conv2d(1, hidden_dim, 7, padding=3) + self.convd2 = nn.Conv2d(hidden_dim, 64, 3, padding=1) + + self.convd = nn.Conv2d(64 + hidden_dim, out_chs - 1, 3, padding=1) + + def forward(self, depth, cost): + cor = F.relu(self.convc1(cost)) + cor = F.relu(self.convc2(cor)) + + dfm = F.relu(self.convd1(depth)) + dfm = F.relu(self.convd2(dfm)) + cor_dfm = torch.cat([cor, dfm], dim=1) + + out_d = F.relu(self.convd(cor_dfm)) + + return torch.cat([out_d, depth], dim=1) + + +class ProjectionInputPose(nn.Module): + + def __init__(self, cost_dim, hidden_dim, out_chs): + super().__init__() + self.out_chs = out_chs + self.convc1 = nn.Conv2d(cost_dim, hidden_dim, 1, padding=0) + self.convc2 = nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1) + + self.convp1 = nn.Conv2d(6, hidden_dim, 7, padding=3) + self.convp2 = nn.Conv2d(hidden_dim, 64, 3, padding=1) + + self.convp = nn.Conv2d(64 + hidden_dim, out_chs - 6, 3, padding=1) + + def forward(self, pose, cost): + bs, _, h, w = cost.shape + cor = F.relu(self.convc1(cost)) + cor = F.relu(self.convc2(cor)) + + pfm = F.relu(self.convp1(pose.view(bs, 6, 1, 1).repeat(1, 1, h, w))) + pfm = F.relu(self.convp2(pfm)) + cor_pfm = torch.cat([cor, pfm], dim=1) + + out_p = F.relu(self.convp(cor_pfm)) + return torch.cat( + [out_p, pose.view(bs, 6, 1, 1).repeat(1, 1, h, w)], dim=1) + + +class UpMaskNet(nn.Module): + + def __init__(self, hidden_dim=128, ratio=8): + super(UpMaskNet, self).__init__() + self.mask = nn.Sequential( + nn.Conv2d(hidden_dim, hidden_dim * 2, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim * 2, ratio * ratio * 9, 1, padding=0)) + + def forward(self, feat): + # scale mask to balence gradients + mask = .25 * self.mask(feat) + return mask + + +class BasicUpdateBlockDepth(nn.Module): + + def __init__(self, hidden_dim=128, cost_dim=256, ratio=8, context_dim=64): + super(BasicUpdateBlockDepth, self).__init__() + + self.encoder = ProjectionInputDepth( + cost_dim=cost_dim, hidden_dim=hidden_dim, out_chs=hidden_dim) + self.depth_gru = SepConvGRU( + hidden_dim=hidden_dim, + input_dim=self.encoder.out_chs + context_dim) + self.depth_head = 
DepthHead( + hidden_dim, hidden_dim=hidden_dim, scale=False) + self.mask = nn.Sequential( + nn.Conv2d(hidden_dim, hidden_dim * 2, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim * 2, ratio * ratio * 9, 1, padding=0)) + + def forward(self, + net, + cost_func, + inv_depth, + context, + seq_len=4, + scale_func=None): + inv_depth_list = [] + mask_list = [] + for i in range(seq_len): + # TODO detach() + # inv_depth = inv_depth.detach() + input_features = self.encoder(inv_depth, + cost_func(scale_func(inv_depth)[0])) + inp_i = torch.cat([context, input_features], dim=1) + + net = self.depth_gru(net, inp_i) + delta_inv_depth = self.depth_head(net) + # scale mask to balence gradients + mask = .25 * self.mask(net) + + inv_depth = inv_depth + delta_inv_depth + inv_depth_list.append(inv_depth) + mask_list.append(mask) + + return net, mask_list, inv_depth_list + + +class BasicUpdateBlockPose(nn.Module): + + def __init__(self, hidden_dim=128, cost_dim=256, context_dim=64): + super(BasicUpdateBlockPose, self).__init__() + self.encoder = ProjectionInputPose( + cost_dim=cost_dim, hidden_dim=hidden_dim, out_chs=hidden_dim) + self.pose_gru = SepConvGRU( + hidden_dim=hidden_dim, + input_dim=self.encoder.out_chs + context_dim) + self.pose_head = PoseHead(hidden_dim, hidden_dim=hidden_dim) + + def forward(self, net, cost_func, pose, inp, seq_len=4): + pose_list = [] + for i in range(seq_len): + # TODO detach() + # pose = pose.detach() + input_features = self.encoder(pose, cost_func(pose)) + inp_i = torch.cat([inp, input_features], dim=1) + + net = self.pose_gru(net, inp_i) + delta_pose = self.pose_head(net) + + pose = pose + delta_pose + pose_list.append(pose) + + # scale mask to balence gradients + return net, pose_list diff --git a/modelscope/models/cv/video_depth_estimation/utils/__init__.py b/modelscope/models/cv/video_depth_estimation/utils/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/models/cv/video_depth_estimation/utils/augmentations.py b/modelscope/models/cv/video_depth_estimation/utils/augmentations.py new file mode 100644 index 00000000..5c7694b3 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/augmentations.py @@ -0,0 +1,245 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import random + +import cv2 +import numpy as np +import torchvision.transforms as transforms +from PIL import Image + +from modelscope.models.cv.video_depth_estimation.utils.misc import filter_dict + +######################################################################################################################## + + +def resize_image(image, shape, interpolation=Image.ANTIALIAS): + """ + Resizes input image. + + Parameters + ---------- + image : Image.PIL + Input image + shape : tuple [H,W] + Output shape + interpolation : int + Interpolation mode + + Returns + ------- + image : Image.PIL + Resized image + """ + transform = transforms.Resize(shape, interpolation=interpolation) + return transform(image) + + +def resize_depth(depth, shape): + """ + Resizes depth map. 
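# Illustrative sketch (not part of the patch): the resize_depth helper below resizes
# with cv2.INTER_NEAREST so the output only contains depth values that existed in the
# input (bilinear blending would invent depths across object boundaries), and
# cv2.resize expects dsize as (width, height), hence the shape[::-1] below.
import cv2
import numpy as np

def _demo_nearest_depth_resize():
    depth = np.zeros((480, 640), dtype=np.float32)
    depth[100:200, 100:200] = 5.0          # a single object at 5 m
    small = cv2.resize(depth, dsize=(320, 240), interpolation=cv2.INTER_NEAREST)
    # Only the original values survive: no interpolated in-between depths.
    assert set(np.unique(small).tolist()) <= {0.0, 5.0}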
+ + Parameters + ---------- + depth : np.array [h,w] + Depth map + shape : tuple (H,W) + Output shape + + Returns + ------- + depth : np.array [H,W] + Resized depth map + """ + depth = cv2.resize( + depth, dsize=shape[::-1], interpolation=cv2.INTER_NEAREST) + return np.expand_dims(depth, axis=2) + + +def resize_sample_image_and_intrinsics(sample, + shape, + image_interpolation=Image.ANTIALIAS): + """ + Resizes the image and intrinsics of a sample + + Parameters + ---------- + sample : dict + Dictionary with sample values + shape : tuple (H,W) + Output shape + image_interpolation : int + Interpolation mode + + Returns + ------- + sample : dict + Resized sample + """ + # Resize image and corresponding intrinsics + image_transform = transforms.Resize( + shape, interpolation=image_interpolation) + (orig_w, orig_h) = sample['rgb'].size + (out_h, out_w) = shape + # Scale intrinsics + for key in filter_dict(sample, ['intrinsics']): + intrinsics = np.copy(sample[key]) + intrinsics[0] *= out_w / orig_w + intrinsics[1] *= out_h / orig_h + sample[key] = intrinsics + # Scale images + for key in filter_dict(sample, [ + 'rgb', + 'rgb_original', + ]): + sample[key] = image_transform(sample[key]) + # Scale context images + for key in filter_dict(sample, [ + 'rgb_context', + 'rgb_context_original', + ]): + sample[key] = [image_transform(k) for k in sample[key]] + # Return resized sample + return sample + + +def resize_sample(sample, shape, image_interpolation=Image.ANTIALIAS): + """ + Resizes a sample, including image, intrinsics and depth maps. + + Parameters + ---------- + sample : dict + Dictionary with sample values + shape : tuple (H,W) + Output shape + image_interpolation : int + Interpolation mode + + Returns + ------- + sample : dict + Resized sample + """ + # Resize image and intrinsics + sample = resize_sample_image_and_intrinsics(sample, shape, + image_interpolation) + # Resize depth maps + for key in filter_dict(sample, [ + 'depth', + ]): + sample[key] = resize_depth(sample[key], shape) + # Resize depth contexts + for key in filter_dict(sample, [ + 'depth_context', + ]): + sample[key] = [resize_depth(k, shape) for k in sample[key]] + # Return resized sample + return sample + + +######################################################################################################################## + + +def to_tensor(image, tensor_type='torch.FloatTensor'): + """Casts an image to a torch.Tensor""" + transform = transforms.ToTensor() + return transform(image).type(tensor_type) + + +def to_tensor_sample(sample, tensor_type='torch.FloatTensor'): + """ + Casts the keys of sample to tensors. + + Parameters + ---------- + sample : dict + Input sample + tensor_type : str + Type of tensor we are casting to + + Returns + ------- + sample : dict + Sample with keys cast as tensors + """ + transform = transforms.ToTensor() + # Convert single items + for key in filter_dict(sample, [ + 'rgb', + 'rgb_original', + 'depth', + ]): + sample[key] = transform(sample[key]).type(tensor_type) + # Convert lists + for key in filter_dict( + sample, ['rgb_context', 'rgb_context_original', 'depth_context']): + sample[key] = [transform(k).type(tensor_type) for k in sample[key]] + # Return converted sample + return sample + + +######################################################################################################################## + + +def duplicate_sample(sample): + """ + Duplicates sample images and contexts to preserve their unaugmented versions. 
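# Illustrative sketch (not part of the patch): the intrinsics rescaling performed by
# resize_sample_image_and_intrinsics above. Scaling row 0 of K by out_w/orig_w and
# row 1 by out_h/orig_h rescales (fx, cx) and (fy, cy) together with the image; the
# numbers are made up for the example.
import numpy as np

def _demo_scale_intrinsics():
    K = np.array([[721.5, 0.0, 609.6],
                  [0.0, 721.5, 172.9],
                  [0.0, 0.0, 1.0]])
    orig_w, orig_h = 1242, 375
    out_h, out_w = 192, 640
    K_scaled = K.copy()
    K_scaled[0] *= out_w / orig_w    # fx: 721.5 -> ~371.8, cx: 609.6 -> ~314.1
    K_scaled[1] *= out_h / orig_h    # fy: 721.5 -> ~369.4, cy: 172.9 -> ~88.5
    return K_scaled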
+ + Parameters + ---------- + sample : dict + Input sample + + Returns + ------- + sample : dict + Sample including [+"_original"] keys with copies of images and contexts. + """ + # Duplicate single items + for key in filter_dict(sample, ['rgb']): + sample['{}_original'.format(key)] = sample[key].copy() + # Duplicate lists + for key in filter_dict(sample, ['rgb_context']): + sample['{}_original'.format(key)] = [k.copy() for k in sample[key]] + # Return duplicated sample + return sample + + +def colorjitter_sample(sample, parameters, prob=1.0): + """ + Jitters input images as data augmentation. + + Parameters + ---------- + sample : dict + Input sample + parameters : tuple (brightness, contrast, saturation, hue) + Color jittering parameters + prob : float + Jittering probability + + Returns + ------- + sample : dict + Jittered sample + """ + if random.random() < prob: + # Prepare transformation + color_augmentation = transforms.ColorJitter() + brightness, contrast, saturation, hue = parameters + augment_image = color_augmentation.get_params( + brightness=[max(0, 1 - brightness), 1 + brightness], + contrast=[max(0, 1 - contrast), 1 + contrast], + saturation=[max(0, 1 - saturation), 1 + saturation], + hue=[-hue, hue]) + # Jitter single items + for key in filter_dict(sample, ['rgb']): + sample[key] = augment_image(sample[key]) + # Jitter lists + for key in filter_dict(sample, ['rgb_context']): + sample[key] = [augment_image(k) for k in sample[key]] + # Return jittered sample + return sample + + +######################################################################################################################## diff --git a/modelscope/models/cv/video_depth_estimation/utils/config.py b/modelscope/models/cv/video_depth_estimation/utils/config.py new file mode 100644 index 00000000..727af287 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/config.py @@ -0,0 +1,327 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import os +from datetime import datetime + +import torch +from yacs.config import CfgNode + +from modelscope.models.cv.video_depth_estimation.utils.horovod import on_rank_0 +from modelscope.models.cv.video_depth_estimation.utils.load import ( + backwards_state_dict, load_class) +from modelscope.models.cv.video_depth_estimation.utils.misc import make_list +from modelscope.models.cv.video_depth_estimation.utils.types import (is_cfg, + is_list) + + +def prep_dataset(config): + """ + Expand dataset configuration to match split length + + Parameters + ---------- + config : CfgNode + Dataset configuration + + Returns + ------- + config : CfgNode + Updated dataset configuration + """ + # If there is no dataset, do nothing + if len(config.path) == 0: + return config + # If cameras is not a double list, make it so + if not config.cameras or not is_list(config.cameras[0]): + config.cameras = [config.cameras] + # Get maximum length and expand other arguments to the same length + n = max(len(config.split), len(config.cameras), len(config.depth_type)) + config.dataset = make_list(config.dataset, n) + config.path = make_list(config.path, n) + config.split = make_list(config.split, n) + config.depth_type = make_list(config.depth_type, n) + config.cameras = make_list(config.cameras, n) + if 'repeat' in config: + config.repeat = make_list(config.repeat, n) + # Return updated configuration + return config + + +def set_name(config): + """ + Set run name based on 
available information + + Parameters + ---------- + config : CfgNode + Model configuration + + Returns + ------- + name : str + Updated run name + """ + # If there is a name already, do nothing + if config.name != '': + return config.name + else: + # Create a name based on available information + return '{}-{}-{}'.format( + os.path.basename(config.default), + os.path.splitext(os.path.basename(config.config))[0], + datetime.now().strftime('%Y.%m.%d-%Hh%Mm%Ss')) + + +@on_rank_0 +def prep_logger_and_checkpoint(model): + """ + Use logger and checkpoint information to update configuration + + Parameters + ---------- + model : nn.Module + Module to update + """ + # Change run name to be the wandb assigned name + if model.logger and not model.config.wandb.dry_run: + model.config.name = model.config.wandb.name = model.logger.run_name + model.config.wandb.url = model.logger.run_url + # If we are saving models we need to update the path + if model.config.checkpoint.filepath != '': + # Change checkpoint filepath + filepath = model.config.checkpoint.filepath.split('/') + filepath[-2] = model.config.name + model.config.checkpoint.filepath = '/'.join(filepath) + # Change callback dirpath + dirpath = os.path.join( + os.path.dirname(model.trainer.checkpoint.dirpath), + model.config.name) + model.trainer.checkpoint.dirpath = dirpath + os.makedirs(dirpath, exist_ok=True) + # Log updated configuration + model.logger.log_config(model.config) + + +def get_default_config(cfg_default): + """Get default configuration from file""" + config = load_class( + 'get_cfg_defaults', + paths=[cfg_default.replace('/', '.')], + concat=False)() + config.merge_from_list(['default', cfg_default]) + return config + + +def merge_cfg_file(config, cfg_file=None): + """Merge configuration file""" + if cfg_file is not None: + config.merge_from_file(cfg_file) + config.merge_from_list(['config', cfg_file]) + return config + + +def merge_cfgs(original, override): + """ + Updates CfgNode with information from another one + + Parameters + ---------- + original : CfgNode + Original configuration node + override : CfgNode + Another configuration node used for overriding + + Returns + ------- + updated : CfgNode + Updated configuration node + """ + for key, value in original.items(): + if key in override.keys(): + if is_cfg(value): # If it's a configuration node, recursion + original[key] = merge_cfgs(original[key], override[key]) + else: # Otherwise, simply update key + original[key] = override[key] + return original + + +def backwards_config(config): + """ + Add or update configuration for backwards compatibility + (no need for it right now, pretrained models are up-to-date with configuration files). 
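# Illustrative sketch (not part of the patch): how the recursive merge_cfgs above
# behaves on nested yacs nodes. The keys are invented for the example; nested keys
# present in the override replace the originals, everything else is kept.
from yacs.config import CfgNode

def _demo_merge_cfgs():
    original = CfgNode({'name': '', 'model': {'depth_net': 'resnet18', 'scales': 4}})
    override = CfgNode({'model': {'depth_net': 'resnet50'}})
    merged = merge_cfgs(original, override)
    # merged.model.depth_net == 'resnet50', merged.model.scales == 4, merged.name == ''
    return merged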
+ + Parameters + ---------- + config : CfgNode + Model configuration + + Returns + ------- + config : CfgNode + Updated model configuration + """ + # Return updated configuration + return config + + +def parse_train_config(cfg_default, cfg_file): + """ + Parse model configuration for training + + Parameters + ---------- + cfg_default : str + Default **.py** configuration file + cfg_file : str + Configuration **.yaml** file to override the default parameters + + Returns + ------- + config : CfgNode + Parsed model configuration + """ + # Loads default configuration + config = get_default_config(cfg_default) + # Merge configuration file + config = merge_cfg_file(config, cfg_file) + # Return prepared configuration + return prepare_train_config(config) + + +def prepare_train_config(config): + """ + Prepare model configuration for training + + Parameters + ---------- + config : CfgNode + Model configuration + + Returns + ------- + config : CfgNode + Prepared model configuration + """ + # If arguments have already been prepared, don't prepare + if config.prepared: + return config + + # Asserts + assert config.wandb.dry_run or config.wandb.entity != '', \ + 'You need a wandb entity' + assert config.wandb.dry_run or config.wandb.project != '', \ + 'You need a wandb project' + assert config.checkpoint.filepath == '' or \ + (config.checkpoint.monitor_index < len(config.datasets.validation.split)), \ + 'You need to monitor a valid dataset' + + # Prepare datasets + config.datasets.train = prep_dataset(config.datasets.train) + config.datasets.validation = prep_dataset(config.datasets.validation) + config.datasets.test = prep_dataset(config.datasets.test) + # Set name and checkpoint + config.name = set_name(config) + # Return configuration + return config + + +def parse_test_file(ckpt_file, cfg_file=None): + """ + Parse model configuration for testing + + Parameters + ---------- + ckpt_file : str + Checkpoint file, with pretrained model + cfg_file : + Configuration file, to update pretrained model configuration + + Returns + ------- + config : CfgNode + Parsed model configuration + state_dict : dict + Model state dict with pretrained weights + """ + assert ckpt_file.endswith('.ckpt') or ckpt_file.endswith('.pth.tar') or ckpt_file.endswith('.pt'), \ + 'You need to provide a .ckpt or .pth.tar file for checkpoint, not {}'.format(ckpt_file) + assert cfg_file is None or cfg_file.endswith('yaml'), \ + 'You need to provide a .yaml file for configuration, not {}'.format(cfg_file) + cfg_default = 'modelscope/models/cv/video_depth_estimation/configs/default_config' + return parse_test_config(ckpt_file, cfg_default, cfg_file) + + +def parse_test_config(ckpt_file, cfg_default, cfg_file): + """ + Parse model configuration for testing + + Parameters + ---------- + ckpt_file : str + Checkpoint file, with pretrained model + cfg_default : str + Default configuration file, with default values + cfg_file : str + Configuration file with updated information + + Returns + ------- + Returns + ------- + config : CfgNode + Parsed model configuration + state_dict : dict + Model state dict with pretrained weights + """ + if ckpt_file.endswith('.ckpt') or ckpt_file.endswith('.pt'): + # Load checkpoint + ckpt = torch.load(ckpt_file, map_location='cpu') + # Get base configuration + config_default = get_default_config(cfg_default) + # Extract configuration and model state + config_model, state_dict = ckpt['config'], ckpt['state_dict'] + # Override default configuration with model configuration + config = merge_cfgs(config_default, 
config_model) + # Update configuration for backwards compatibility + config = backwards_config(config) + # If another config file is provided, use it + config = merge_cfg_file(config, cfg_file) + # Backwards compatibility with older models + elif ckpt_file.endswith('.pth.tar'): + # Load model state and update it for backwards compatibility + state_dict = torch.load(ckpt_file, map_location='cpu')['state_dict'] + state_dict = backwards_state_dict(state_dict) + # Get default configuration + config = get_default_config(cfg_default) + # If config file is present, update configuration + config = merge_cfg_file(config, cfg_file) + else: + raise ValueError('Unknown checkpoint {}'.format(ckpt_file)) + # Set pretrained model name + config.save.pretrained = ckpt_file + # Return prepared configuration and model state + return prepare_test_config(config), state_dict + + +def prepare_test_config(config): + """ + Prepare model configuration for testing + + Parameters + ---------- + config : CfgNode + Model configuration + + Returns + ------- + config : CfgNode + Prepared model configuration + """ + # Remove train and validation datasets + config.datasets.train.path = config.datasets.validation.path = [] + config.datasets.test = prep_dataset(config.datasets.test) + # Don't save models or log to wandb + config.wandb.dry_run = True + config.checkpoint.filepath = '' + # Return updated configuration + return config diff --git a/modelscope/models/cv/video_depth_estimation/utils/depth.py b/modelscope/models/cv/video_depth_estimation/utils/depth.py new file mode 100644 index 00000000..e9f287e7 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/depth.py @@ -0,0 +1,450 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import numpy as np +import torch +import torchvision.transforms as transforms +from matplotlib.cm import get_cmap + +from modelscope.models.cv.video_depth_estimation.utils.image import ( + flip_lr, gradient_x, gradient_y, interpolate_image, load_image) +from modelscope.models.cv.video_depth_estimation.utils.types import (is_seq, + is_tensor) + + +def load_depth(file): + """ + Load a depth map from file + Parameters + ---------- + file : str + Depth map filename (.npz or .png) + + Returns + ------- + depth : np.array [H,W] + Depth map (invalid pixels are 0) + """ + if file.endswith('npz'): + return np.load(file)['depth'] + elif file.endswith('png'): + depth_png = np.array(load_image(file), dtype=int) + assert (np.max(depth_png) > 255), 'Wrong .png depth file' + return depth_png.astype(np.float) / 256. + else: + raise NotImplementedError('Depth extension not supported.') + + +def write_depth(filename, depth, intrinsics=None): + """ + Write a depth map to file, and optionally its corresponding intrinsics. 
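# Illustrative sketch (not part of the patch): the 16-bit PNG convention assumed by
# load_depth above (stored value = depth in metres * 256, 0 = invalid). The file path
# is made up; cv2 is used here just to produce and read back a 16-bit PNG.
import cv2
import numpy as np

def _demo_depth_png_roundtrip(path='/tmp/demo_depth.png'):
    depth = np.zeros((8, 8), dtype=np.float32)
    depth[2:6, 2:6] = 12.5                                  # 12.5 m
    cv2.imwrite(path, (depth * 256.0).astype(np.uint16))    # 16-bit PNG
    stored = cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32)
    recovered = stored / 256.0                              # same scaling as load_depth
    assert np.allclose(recovered[2:6, 2:6], 12.5)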
+ + Parameters + ---------- + filename : str + File where depth map will be saved (.npz or .png) + depth : np.array [H,W] + Depth map + intrinsics : np.array [3,3] + Optional camera intrinsics matrix + """ + # If depth is a tensor + if is_tensor(depth): + depth = depth.detach().squeeze().cpu() + # If intrinsics is a tensor + if is_tensor(intrinsics): + intrinsics = intrinsics.detach().cpu() + # If we are saving as a .npz + if filename.endswith('.npz'): + np.savez_compressed(filename, depth=depth, intrinsics=intrinsics) + # If we are saving as a .png + elif filename.endswith('.png'): + depth = transforms.ToPILImage()((depth * 256).int()) + depth.save(filename) + # Something is wrong + else: + raise NotImplementedError('Depth filename not valid.') + + +def viz_inv_depth(inv_depth, + normalizer=None, + percentile=95, + colormap='plasma', + filter_zeros=False): + """ + Converts an inverse depth map to a colormap for visualization. + + Parameters + ---------- + inv_depth : torch.Tensor [B,1,H,W] + Inverse depth map to be converted + normalizer : float + Value for inverse depth map normalization + percentile : float + Percentile value for automatic normalization + colormap : str + Colormap to be used + filter_zeros : bool + If True, do not consider zero values during normalization + + Returns + ------- + colormap : np.array [H,W,3] + Colormap generated from the inverse depth map + """ + # If a tensor is provided, convert to numpy + if is_tensor(inv_depth): + # Squeeze if depth channel exists + if len(inv_depth.shape) == 3: + inv_depth = inv_depth.squeeze(0) + inv_depth = inv_depth.detach().cpu().numpy() + cm = get_cmap(colormap) + if normalizer is None: + normalizer = np.percentile( + inv_depth[inv_depth > 0] if filter_zeros else inv_depth, + percentile) + inv_depth /= (normalizer + 1e-6) + return cm(np.clip(inv_depth, 0., 1.0))[:, :, :3] + + +def inv2depth(inv_depth): + """ + Invert an inverse depth map to produce a depth map + + Parameters + ---------- + inv_depth : torch.Tensor or list of torch.Tensor [B,1,H,W] + Inverse depth map + + Returns + ------- + depth : torch.Tensor or list of torch.Tensor [B,1,H,W] + Depth map + """ + if is_seq(inv_depth): + return [inv2depth(item) for item in inv_depth] + else: + depth = 1. / inv_depth.clamp(min=1e-6) + depth[inv_depth <= 0.] = 0. + return depth + + +def depth2inv(depth): + """ + Invert a depth map to produce an inverse depth map + + Parameters + ---------- + depth : torch.Tensor or list of torch.Tensor [B,1,H,W] + Depth map + + Returns + ------- + inv_depth : torch.Tensor or list of torch.Tensor [B,1,H,W] + Inverse depth map + + """ + if is_seq(depth): + return [depth2inv(item) for item in depth] + else: + inv_depth = 1. / depth.clamp(min=1e-6) + inv_depth[depth <= 0.] = 0. 
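# Illustrative sketch (not part of the patch): the depth2inv / inv2depth round trip
# defined here. Invalid pixels (depth <= 0) are kept at 0 in both directions instead
# of exploding to 1/1e-6.
import torch

def _demo_inv_depth_roundtrip():
    depth = torch.tensor([[[[0.0, 2.0], [4.0, 10.0]]]])   # [B=1,1,2,2], one invalid pixel
    inv = depth2inv(depth)                                 # [[0.0, 0.5], [0.25, 0.1]]
    back = inv2depth(inv)                                  # the invalid pixel stays 0
    assert torch.allclose(back, depth)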
+ return inv_depth + + +def inv_depths_normalize(inv_depths): + """ + Inverse depth normalization + + Parameters + ---------- + inv_depths : list of torch.Tensor [B,1,H,W] + Inverse depth maps + + Returns + ------- + norm_inv_depths : list of torch.Tensor [B,1,H,W] + Normalized inverse depth maps + """ + mean_inv_depths = [ + inv_depth.mean(2, True).mean(3, True) for inv_depth in inv_depths + ] + return [ + inv_depth / mean_inv_depth.clamp(min=1e-6) + for inv_depth, mean_inv_depth in zip(inv_depths, mean_inv_depths) + ] + + +def calc_smoothness(inv_depths, images, num_scales): + """ + Calculate smoothness values for inverse depths + + Parameters + ---------- + inv_depths : list of torch.Tensor [B,1,H,W] + Inverse depth maps + images : list of torch.Tensor [B,3,H,W] + Inverse depth maps + num_scales : int + Number of scales considered + + Returns + ------- + smoothness_x : list of torch.Tensor [B,1,H,W] + Smoothness values in direction x + smoothness_y : list of torch.Tensor [B,1,H,W] + Smoothness values in direction y + """ + inv_depths_norm = inv_depths_normalize(inv_depths) + inv_depth_gradients_x = [gradient_x(d) for d in inv_depths_norm] + inv_depth_gradients_y = [gradient_y(d) for d in inv_depths_norm] + + image_gradients_x = [gradient_x(image) for image in images] + image_gradients_y = [gradient_y(image) for image in images] + + weights_x = [ + torch.exp(-torch.mean(torch.abs(g), 1, keepdim=True)) + for g in image_gradients_x + ] + weights_y = [ + torch.exp(-torch.mean(torch.abs(g), 1, keepdim=True)) + for g in image_gradients_y + ] + + # Note: Fix gradient addition + smoothness_x = [ + inv_depth_gradients_x[i] * weights_x[i] for i in range(num_scales) + ] + smoothness_y = [ + inv_depth_gradients_y[i] * weights_y[i] for i in range(num_scales) + ] + return smoothness_x, smoothness_y + + +def fuse_inv_depth(inv_depth, inv_depth_hat, method='mean'): + """ + Fuse inverse depth and flipped inverse depth maps + + Parameters + ---------- + inv_depth : torch.Tensor [B,1,H,W] + Inverse depth map + inv_depth_hat : torch.Tensor [B,1,H,W] + Flipped inverse depth map produced from a flipped image + method : str + Method that will be used to fuse the inverse depth maps + + Returns + ------- + fused_inv_depth : torch.Tensor [B,1,H,W] + Fused inverse depth map + """ + if method == 'mean': + return 0.5 * (inv_depth + inv_depth_hat) + elif method == 'max': + return torch.max(inv_depth, inv_depth_hat) + elif method == 'min': + return torch.min(inv_depth, inv_depth_hat) + else: + raise ValueError('Unknown post-process method {}'.format(method)) + + +def post_process_inv_depth(inv_depth, inv_depth_flipped, method='mean'): + """ + Post-process an inverse and flipped inverse depth map + + Parameters + ---------- + inv_depth : torch.Tensor [B,1,H,W] + Inverse depth map + inv_depth_flipped : torch.Tensor [B,1,H,W] + Inverse depth map produced from a flipped image + method : str + Method that will be used to fuse the inverse depth maps + + Returns + ------- + inv_depth_pp : torch.Tensor [B,1,H,W] + Post-processed inverse depth map + """ + B, C, H, W = inv_depth.shape + inv_depth_hat = flip_lr(inv_depth_flipped) + inv_depth_fused = fuse_inv_depth(inv_depth, inv_depth_hat, method=method) + xs = torch.linspace( + 0., 1., W, device=inv_depth.device, + dtype=inv_depth.dtype).repeat(B, C, H, 1) + mask = 1.0 - torch.clamp(20. * (xs - 0.05), 0., 1.) 
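# Illustrative sketch (not part of the patch): the test-time flipping pattern that
# post_process_inv_depth above is written for. `depth_net` is a stand-in for any
# network returning an inverse depth map of shape [B,1,H,W].
def _demo_post_process(depth_net, image):
    inv_depth = depth_net(image)
    inv_depth_flipped = depth_net(flip_lr(image))           # predict on the mirrored image
    return post_process_inv_depth(
        inv_depth, inv_depth_flipped, method='mean')        # blend with the border ramp masks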
+ mask_hat = flip_lr(mask) + return mask_hat * inv_depth + mask * inv_depth_hat + \ + (1.0 - mask - mask_hat) * inv_depth_fused + + +def compute_depth_metrics(config, gt, pred, use_gt_scale=True): + """ + Compute depth metrics from predicted and ground-truth depth maps + + Parameters + ---------- + config : CfgNode + Metrics parameters + gt : torch.Tensor [B,1,H,W] + Ground-truth depth map + pred : torch.Tensor [B,1,H,W] + Predicted depth map + use_gt_scale : bool + True if ground-truth median-scaling is to be used + + Returns + ------- + metrics : torch.Tensor [7] + Depth metrics (abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3) + """ + crop = config.crop == 'garg' + + # Initialize variables + batch_size, _, gt_height, gt_width = gt.shape + SI = SILog = iabs_diff = irmse = abs_diff = abs_rel = sq_rel = rmse = rmse_log = a1 = a2 = a3 = 0.0 + # Interpolate predicted depth to ground-truth resolution + pred = interpolate_image( + pred, gt.shape, mode='bilinear', align_corners=True) + pred = pred.clamp(min=1e-6) + # If using crop + if config.crop == 'garg': + crop = True + crop_mask = torch.zeros(gt.shape[-2:]).byte().type_as(gt) + y1, y2 = int(0.40810811 * gt_height), int(0.99189189 * gt_height) + x1, x2 = int(0.03594771 * gt_width), int(0.96405229 * gt_width) + crop_mask[y1:y2, x1:x2] = 1 + if config.crop == 'eigen_nyu': + crop = True + crop_mask = torch.zeros(gt.shape[-2:]).byte().type_as(gt) + y1, y2 = 20, 459 + x1, x2 = 24, 615 + crop_mask[y1:y2, x1:x2] = 1 + # For each depth map + for pred_i, gt_i in zip(pred, gt): + gt_i, pred_i = torch.squeeze(gt_i), torch.squeeze(pred_i) + # Keep valid pixels (min/max depth and crop) + valid = (gt_i > config.min_depth) & (gt_i < config.max_depth) + valid = valid & crop_mask.bool() if crop else valid + # Stop if there are no remaining valid pixels + if valid.sum() == 0: + continue + # Keep only valid pixels + gt_i, pred_i = gt_i[valid], pred_i[valid] + # Ground-truth median scaling if needed + if use_gt_scale: + pred_i = pred_i * torch.median(gt_i / pred_i) + pred_i = torch.clamp(pred_i, config.min_depth, config.max_depth) + + # Clamp predicted depth values to min/max values + pred_i = pred_i.clamp(config.min_depth, config.max_depth) + + # Calculate depth metrics + + thresh = torch.max((gt_i / pred_i), (pred_i / gt_i)) + a1 += (thresh < 1.25).float().mean() + a2 += (thresh < 1.25**2).float().mean() + a3 += (thresh < 1.25**3).float().mean() + + diff_i = gt_i - pred_i + abs_diff += torch.mean(torch.abs(diff_i)) + abs_rel += torch.mean(torch.abs(diff_i) / gt_i) + sq_rel += torch.mean(diff_i**2 / gt_i) + rmse += torch.sqrt(torch.mean(diff_i**2)) + rmse_log += torch.sqrt( + torch.mean((torch.log(gt_i) - torch.log(pred_i))**2)) + iabs_diff += torch.mean(torch.abs(1.0 / pred_i - 1.0 / gt_i)) + irmse += torch.sqrt(torch.mean((1.0 / gt_i - 1.0 / pred_i)**2)) + d_i = gt_i.log() - pred_i.log() + SILog += ((d_i**2).mean() - (d_i.sum())**2 / len(d_i)**2)**0.5 + SI += ((diff_i**2).mean() - (diff_i.sum())**2 / len(diff_i)**2)**0.5 + # Return average values for each metric + return torch.tensor([ + metric / batch_size for metric in + [abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3, SILog, iabs_diff] + ]).type_as(gt) + + +def compute_depth_metrics_demon(config, gt, gt_pose, pred, use_gt_scale=True): + # Initialize variables + batch_size, _, gt_height, gt_width = gt.shape + SI = SILog = iabs_diff = irmse = abs_diff = abs_rel = sq_rel = rmse = rmse_log = a1 = a2 = a3 = 0.0 + # Interpolate predicted depth to ground-truth resolution + pred = interpolate_image( + pred, gt.shape, 
mode='bilinear', align_corners=True) + pred = pred.clamp(min=1e-6) + # For each depth map + for pred_i, gt_i, gt_pose_i in zip(pred, gt, gt_pose): + gt_i, pred_i = torch.squeeze(gt_i), torch.squeeze(pred_i) + # Keep valid pixels (min/max depth and crop) + valid = (gt_i > config.min_depth) & (gt_i < config.max_depth) + # Stop if there are no remaining valid pixels + if valid.sum() == 0: + continue + # Keep only valid pixels + gt_i, pred_i = gt_i[valid], pred_i[valid] + + # Ground-truth median scaling if needed + if use_gt_scale: + # gt normalization + translation_gt = gt_pose_i[:, :3, 3] + translation_norm = torch.sqrt(translation_gt[0].dot( + translation_gt[0])) + gt_i = gt_i / translation_norm + + pred_i = pred_i * torch.median(gt_i / pred_i) + # d1d1 = pred_i * pred_i + # d1d2 = pred_i * gt_i + # mask = d1d2 > 0 + # sum_d1d1 = d1d1[mask].sum() + # sum_d1d2 = d1d2[mask].sum() + # scale = sum_d1d2/sum_d1d1 + # pred_i = pred_i * scale + + # Calculate depth metrics + thresh = torch.max((gt_i / pred_i), (pred_i / gt_i)) + a1 += (thresh < 1.25).float().mean() + a2 += (thresh < 1.25**2).float().mean() + a3 += (thresh < 1.25**3).float().mean() + + diff_i = gt_i - pred_i + abs_diff += torch.mean(torch.abs(diff_i)) + abs_rel += torch.mean(torch.abs(diff_i) / gt_i) + sq_rel += torch.mean(diff_i**2 / gt_i) + rmse += torch.sqrt(torch.mean(diff_i**2)) + rmse_log += torch.sqrt( + torch.mean((torch.log(gt_i) - torch.log(pred_i))**2)) + iabs_diff += torch.mean(torch.abs(1.0 / pred_i - 1.0 / gt_i)) + irmse += torch.sqrt(torch.mean((1.0 / gt_i - 1.0 / pred_i)**2)) + d_i = gt_i.log() - pred_i.log() + SILog += ((d_i**2).mean() - (d_i.sum())**2 / len(d_i)**2)**0.5 + SI += ((diff_i**2).mean() - (diff_i.sum())**2 / len(diff_i)**2)**0.5 + # Return average values for each metric + return torch.tensor([ + metric / batch_size for metric in + [abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3, SILog, iabs_diff] + ]).type_as(gt) + + +def compute_pose_metrics(config, gt, pred): + pr = pred[0].mat.squeeze().detach().cpu().numpy() + gt = gt[0].squeeze().detach().cpu().numpy() + + # seperate rotations and translations + R1, t1 = gt[:3, :3], gt[:3, 3] + R2, t2 = pr[:3, :3], pr[:3, 3] + + costheta = (np.trace(np.dot(R1.T, R2)) - 1.0) / 2.0 + costheta = np.minimum(costheta, 1.0) + rdeg = np.arccos(costheta) * (180 / np.pi) + + t1mag = np.sqrt(np.dot(t1, t1)) + t2mag = np.sqrt(np.dot(t2, t2)) + costheta = np.dot(t1, t2) / (t1mag * t2mag) + tdeg = np.arccos(costheta) * (180 / np.pi) + + # fit scales to translations + a = np.dot(t1, t2) / np.dot(t2, t2) + tcm = 100 * np.sqrt(np.sum((t1 - a * t2)**2, axis=-1)) + + return torch.Tensor([rdeg, tdeg, tcm]) diff --git a/modelscope/models/cv/video_depth_estimation/utils/horovod.py b/modelscope/models/cv/video_depth_estimation/utils/horovod.py new file mode 100644 index 00000000..2c845c5a --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/horovod.py @@ -0,0 +1,61 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +try: + import horovod.torch as hvd + HAS_HOROVOD = True +except ImportError: + HAS_HOROVOD = False + + +def hvd_disable(): + global HAS_HOROVOD + HAS_HOROVOD = False + + +def hvd_init(): + if HAS_HOROVOD: + hvd.init() + return HAS_HOROVOD + + +def on_rank_0(func): + + def wrapper(*args, **kwargs): + if rank() == 0: + func(*args, **kwargs) + + return wrapper + + +def rank(): + return hvd.rank() if HAS_HOROVOD else 0 + + +def world_size(): + return 
hvd.size() if HAS_HOROVOD else 1 + + +@on_rank_0 +def print0(string='\n'): + print(string) + + +def reduce_value(value, average, name): + """ + Reduce the mean value of a tensor from all GPUs + + Parameters + ---------- + value : torch.Tensor + Value to be reduced + average : bool + Whether values will be averaged or not + name : str + Value name + + Returns + ------- + value : torch.Tensor + reduced value + """ + return hvd.allreduce(value, average=average, name=name) diff --git a/modelscope/models/cv/video_depth_estimation/utils/image.py b/modelscope/models/cv/video_depth_estimation/utils/image.py new file mode 100644 index 00000000..4d58de57 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/image.py @@ -0,0 +1,422 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import os +from functools import lru_cache + +import cv2 +import numpy as np +import torch +import torch.nn.functional as funct +from PIL import Image + +from modelscope.models.cv.video_depth_estimation.utils.misc import same_shape + + +def parse_video(video_file, save_root, sample_rate=10): + os.makedirs(save_root, exist_ok=True) + + cap = cv2.VideoCapture(video_file) + + # Check if camera opened successfully + if (cap.isOpened() is False): + print('Error opening video stream or file') + count = 0 + sample_count = 0 + + while cap.isOpened(): + ret, img = cap.read() + if ret: + if count % sample_rate == 0: + save_path = os.path.join(save_root, + f'{sample_count}'.zfill(6) + '.jpg') + cv2.imwrite(save_path, img) + sample_count += 1 + count += 1 + else: + break + print( + f'video total frames num: {count}, sampled frames num:{sample_count}') + + +def get_intrinsics(image_shape_raw, image_shape, data_type): + if data_type == 'kitti': + intr = np.array([ + 7.215376999999999725e+02, 0.000000000000000000e+00, + 6.095593000000000075e+02, 0.000000000000000000e+00, + 7.215376999999999725e+02, 1.728540000000000134e+02, + 0.000000000000000000e+00, 0.000000000000000000e+00, + 1.000000000000000000e+00 + ]).reshape(3, 3) + elif data_type == 'indoor': + intr = np.array([ + 1170.187988, 0.000000, 647.750000, 0.000000, 1170.187988, + 483.750000, 0.000000, 0.000000, 1.000000 + ]).reshape(3, 3) + else: + # print("fake intrinsics") + w, h = image_shape_raw + fx = w * 1.2 + fy = w * 1.2 + cx = w / 2.0 + cy = h / 2.0 + intr = np.array([[fx, 0., cx], [0., fy, cy], [0., 0., 1.]]) + + orig_w, orig_h = image_shape_raw + out_h, out_w = image_shape + + # Scale intrinsics + intr[0] *= out_w / orig_w + intr[1] *= out_h / orig_h + + return intr + + +def load_image(path): + """ + Read an image using PIL + + Parameters + ---------- + path : str + Path to the image + + Returns + ------- + image : PIL.Image + Loaded image + """ + return Image.open(path) + + +def write_image(filename, image): + """ + Write an image to file. 
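# Illustrative sketch (not part of the patch): when capture intrinsics are unknown,
# get_intrinsics above falls back to a rough pinhole guess (fx = fy = 1.2 * width,
# principal point at the image centre) and rescales it to the network input size.
# The resolutions here are arbitrary.
def _demo_fallback_intrinsics():
    K = get_intrinsics(image_shape_raw=(1920, 1080),   # (w, h) of the source frames
                       image_shape=(384, 640),         # (H, W) fed to the network
                       data_type='video')              # anything but 'kitti' / 'indoor'
    # fx: 1.2 * 1920 = 2304, rescaled by 640/1920 -> 768; cx: 960 -> 320, etc.
    return K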
+ + Parameters + ---------- + filename : str + File where image will be saved + image : np.array [H,W,3] + RGB image + """ + cv2.imwrite(filename, image[:, :, ::-1]) + + +def flip_lr(image): + """ + Flip image horizontally + + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Image to be flipped + + Returns + ------- + image_flipped : torch.Tensor [B,3,H,W] + Flipped image + """ + assert image.dim() == 4, 'You need to provide a [B,C,H,W] image to flip' + return torch.flip(image, [3]) + + +def flip_lr_intr(intr, width): + """ + Flip image horizontally + + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Image to be flipped + + Returns + ------- + image_flipped : torch.Tensor [B,3,H,W] + Flipped image + """ + assert intr.shape[1:] == (3, 3) + # trans = torch.eye(3, dtype=intr.dtype, device=intr.device) + # trans[0, 0] = -1 + # intr_trans = torch.matmul(trans.unsqueeze(0), intr) + intr[:, 0, 0] = -1 * intr[:, 0, 0] + intr[:, 0, 2] = width - intr[:, 0, 2] + return intr + + +def flip_model(model, image, flip): + """ + Flip input image and flip output inverse depth map + + Parameters + ---------- + model : nn.Module + Module to be used + image : torch.Tensor [B,3,H,W] + Input image + flip : bool + True if the flip is happening + + Returns + ------- + inv_depths : list of torch.Tensor [B,1,H,W] + List of predicted inverse depth maps + """ + if flip: + return [flip_lr(inv_depth) for inv_depth in model(flip_lr(image))] + else: + return model(image) + + +def flip_mf_model(model, image, ref_imgs, intrinsics, flip): + """ + Flip input image and flip output inverse depth map + + Parameters + ---------- + model : nn.Module + Module to be used + image : torch.Tensor [B,3,H,W] + Input image + flip : bool + True if the flip is happening + + Returns + ------- + inv_depths : list of torch.Tensor [B,1,H,W] + List of predicted inverse depth maps + """ + if flip: + if ref_imgs is not None: + return model( + flip_lr(image), [flip_lr(img) for img in ref_imgs], intrinsics) + else: + return model(flip_lr(image), None, intrinsics) + else: + return model(image, ref_imgs, intrinsics) + + +######################################################################################################################## + + +def gradient_x(image): + """ + Calculates the gradient of an image in the x dimension + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Input image + + Returns + ------- + gradient_x : torch.Tensor [B,3,H,W-1] + Gradient of image with respect to x + """ + return image[:, :, :, :-1] - image[:, :, :, 1:] + + +def gradient_y(image): + """ + Calculates the gradient of an image in the y dimension + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Input image + + Returns + ------- + gradient_y : torch.Tensor [B,3,H-1,W] + Gradient of image with respect to y + """ + return image[:, :, :-1, :] - image[:, :, 1:, :] + + +######################################################################################################################## + + +def interpolate_image(image, shape, mode='bilinear', align_corners=True): + """ + Interpolate an image to a different resolution + + Parameters + ---------- + image : torch.Tensor [B,?,h,w] + Image to be interpolated + shape : tuple (H, W) + Output shape + mode : str + Interpolation mode + align_corners : bool + True if corners will be aligned after interpolation + + Returns + ------- + image : torch.Tensor [B,?,H,W] + Interpolated image + """ + # Take last two dimensions as shape + if len(shape) > 2: + shape = shape[-2:] + # If the shapes 
are the same, do nothing + if same_shape(image.shape[-2:], shape): + return image + else: + # Interpolate image to match the shape + return funct.interpolate( + image, size=shape, mode=mode, align_corners=align_corners) + + +def interpolate_scales(images, + shape=None, + mode='bilinear', + align_corners=False): + """ + Interpolate list of images to the same shape + + Parameters + ---------- + images : list of torch.Tensor [B,?,?,?] + Images to be interpolated, with different resolutions + shape : tuple (H, W) + Output shape + mode : str + Interpolation mode + align_corners : bool + True if corners will be aligned after interpolation + + Returns + ------- + images : list of torch.Tensor [B,?,H,W] + Interpolated images, with the same resolution + """ + # If no shape is provided, interpolate to highest resolution + if shape is None: + shape = images[0].shape + # Take last two dimensions as shape + if len(shape) > 2: + shape = shape[-2:] + # Interpolate all images + return [ + funct.interpolate( + image, shape, mode=mode, align_corners=align_corners) + for image in images + ] + + +def match_scales(image, + targets, + num_scales, + mode='bilinear', + align_corners=True): + """ + Interpolate one image to produce a list of images with the same shape as targets + + Parameters + ---------- + image : torch.Tensor [B,?,h,w] + Input image + targets : list of torch.Tensor [B,?,?,?] + Tensors with the target resolutions + num_scales : int + Number of considered scales + mode : str + Interpolation mode + align_corners : bool + True if corners will be aligned after interpolation + + Returns + ------- + images : list of torch.Tensor [B,?,?,?] + List of images with the same resolutions as targets + """ + # For all scales + images = [] + image_shape = image.shape[-2:] + for i in range(num_scales): + target_shape = targets[i].shape + # If image shape is equal to target shape + if same_shape(image_shape, target_shape): + images.append(image) + else: + # Otherwise, interpolate + images.append( + interpolate_image( + image, + target_shape, + mode=mode, + align_corners=align_corners)) + # Return scaled images + return images + + +######################################################################################################################## + + +@lru_cache(maxsize=None) +def meshgrid(B, H, W, dtype, device, normalized=False): + """ + Create meshgrid with a specific resolution + + Parameters + ---------- + B : int + Batch size + H : int + Height size + W : int + Width size + dtype : torch.dtype + Meshgrid type + device : torch.device + Meshgrid device + normalized : bool + True if grid is normalized between -1 and 1 + + Returns + ------- + xs : torch.Tensor [B,1,W] + Meshgrid in dimension x + ys : torch.Tensor [B,H,1] + Meshgrid in dimension y + """ + if normalized: + xs = torch.linspace(-1, 1, W, device=device, dtype=dtype) + ys = torch.linspace(-1, 1, H, device=device, dtype=dtype) + else: + xs = torch.linspace(0, W - 1, W, device=device, dtype=dtype) + ys = torch.linspace(0, H - 1, H, device=device, dtype=dtype) + ys, xs = torch.meshgrid([ys, xs]) + return xs.repeat([B, 1, 1]), ys.repeat([B, 1, 1]) + + +@lru_cache(maxsize=None) +def image_grid(B, H, W, dtype, device, normalized=False): + """ + Create an image grid with a specific resolution + + Parameters + ---------- + B : int + Batch size + H : int + Height size + W : int + Width size + dtype : torch.dtype + Meshgrid type + device : torch.device + Meshgrid device + normalized : bool + True if grid is normalized between -1 and 1 + + Returns + ------- + 
grid : torch.Tensor [B,3,H,W] + Image grid containing a meshgrid in x, y and 1 + """ + xs, ys = meshgrid(B, H, W, dtype, device, normalized=normalized) + ones = torch.ones_like(xs) + grid = torch.stack([xs, ys, ones], dim=1) + return grid + + +######################################################################################################################## diff --git a/modelscope/models/cv/video_depth_estimation/utils/image_gt.py b/modelscope/models/cv/video_depth_estimation/utils/image_gt.py new file mode 100644 index 00000000..a751614b --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/image_gt.py @@ -0,0 +1,370 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +from functools import lru_cache + +import cv2 +import torch +import torch.nn.functional as funct +from PIL import Image + +from modelscope.models.cv.video_depth_estimation.utils.misc import same_shape + + +def load_image(path): + """ + Read an image using PIL + + Parameters + ---------- + path : str + Path to the image + + Returns + ------- + image : PIL.Image + Loaded image + """ + return Image.open(path) + + +def write_image(filename, image): + """ + Write an image to file. + + Parameters + ---------- + filename : str + File where image will be saved + image : np.array [H,W,3] + RGB image + """ + cv2.imwrite(filename, image[:, :, ::-1]) + + +def flip_lr(image): + """ + Flip image horizontally + + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Image to be flipped + + Returns + ------- + image_flipped : torch.Tensor [B,3,H,W] + Flipped image + """ + assert image.dim() == 4, 'You need to provide a [B,C,H,W] image to flip' + return torch.flip(image, [3]) + + +def flip_lr_intr(intr, width): + """ + Flip image horizontally + + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Image to be flipped + + Returns + ------- + image_flipped : torch.Tensor [B,3,H,W] + Flipped image + """ + assert intr.shape[1:] == (3, 3) + # trans = torch.eye(3, dtype=intr.dtype, device=intr.device) + # trans[0, 0] = -1 + # intr_trans = torch.matmul(trans.unsqueeze(0), intr) + intr[:, 0, 0] = -1 * intr[:, 0, 0] + intr[:, 0, 2] = width - intr[:, 0, 2] + return intr + + +def flip_model(model, image, flip): + """ + Flip input image and flip output inverse depth map + + Parameters + ---------- + model : nn.Module + Module to be used + image : torch.Tensor [B,3,H,W] + Input image + flip : bool + True if the flip is happening + + Returns + ------- + inv_depths : list of torch.Tensor [B,1,H,W] + List of predicted inverse depth maps + """ + if flip: + return [flip_lr(inv_depth) for inv_depth in model(flip_lr(image))] + else: + return model(image) + + +def flip_mf_model(model, + image, + ref_imgs, + intrinsics, + flip, + gt_depth=None, + gt_poses=None): + """ + Flip input image and flip output inverse depth map + + Parameters + ---------- + model : nn.Module + Module to be used + image : torch.Tensor [B,3,H,W] + Input image + flip : bool + True if the flip is happening + + Returns + ------- + inv_depths : list of torch.Tensor [B,1,H,W] + List of predicted inverse depth maps + """ + if flip: + if ref_imgs is not None: + return model( + flip_lr(image), [flip_lr(img) for img in ref_imgs], intrinsics, + None, flip_lr(gt_depth), gt_poses) + else: + return model( + flip_lr(image), None, intrinsics, None, flip_lr(gt_depth), + gt_poses) + else: + return model(image, ref_imgs, intrinsics, None, gt_depth, gt_poses) + + 
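+# Usage sketch (not part of the original PackNet-SfM code): test-time flip
+# augmentation typically runs the network on both the plain and the flipped
+# inputs and fuses the two predictions. `depth_net`, `rgb`, `ref_imgs`, `K`
+# and `gt_depth` are hypothetical names, and the sketch assumes the model
+# returns a list of inverse-depth maps, as the docstring above states:
+#
+#     inv_depths = flip_mf_model(depth_net, rgb, ref_imgs, K, flip=False)
+#     inv_depths_flip = flip_mf_model(depth_net, rgb, ref_imgs, K, flip=True,
+#                                     gt_depth=gt_depth)
+#     fused = [0.5 * (d + flip_lr(df))
+#              for d, df in zip(inv_depths, inv_depths_flip)]
+#
+# Unlike flip_model, flip_mf_model does not flip its outputs back, so the
+# caller has to undo the flip (flip_lr above). Also note that flip=True with
+# the default gt_depth=None would call flip_lr(None) and fail.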
+######################################################################################################################## + + +def gradient_x(image): + """ + Calculates the gradient of an image in the x dimension + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Input image + + Returns + ------- + gradient_x : torch.Tensor [B,3,H,W-1] + Gradient of image with respect to x + """ + return image[:, :, :, :-1] - image[:, :, :, 1:] + + +def gradient_y(image): + """ + Calculates the gradient of an image in the y dimension + Parameters + ---------- + image : torch.Tensor [B,3,H,W] + Input image + + Returns + ------- + gradient_y : torch.Tensor [B,3,H-1,W] + Gradient of image with respect to y + """ + return image[:, :, :-1, :] - image[:, :, 1:, :] + + +######################################################################################################################## + + +def interpolate_image(image, shape, mode='bilinear', align_corners=True): + """ + Interpolate an image to a different resolution + + Parameters + ---------- + image : torch.Tensor [B,?,h,w] + Image to be interpolated + shape : tuple (H, W) + Output shape + mode : str + Interpolation mode + align_corners : bool + True if corners will be aligned after interpolation + + Returns + ------- + image : torch.Tensor [B,?,H,W] + Interpolated image + """ + # Take last two dimensions as shape + if len(shape) > 2: + shape = shape[-2:] + # If the shapes are the same, do nothing + if same_shape(image.shape[-2:], shape): + return image + else: + # Interpolate image to match the shape + return funct.interpolate( + image, size=shape, mode=mode, align_corners=align_corners) + + +def interpolate_scales(images, + shape=None, + mode='bilinear', + align_corners=False): + """ + Interpolate list of images to the same shape + + Parameters + ---------- + images : list of torch.Tensor [B,?,?,?] + Images to be interpolated, with different resolutions + shape : tuple (H, W) + Output shape + mode : str + Interpolation mode + align_corners : bool + True if corners will be aligned after interpolation + + Returns + ------- + images : list of torch.Tensor [B,?,H,W] + Interpolated images, with the same resolution + """ + # If no shape is provided, interpolate to highest resolution + if shape is None: + shape = images[0].shape + # Take last two dimensions as shape + if len(shape) > 2: + shape = shape[-2:] + # Interpolate all images + return [ + funct.interpolate( + image, shape, mode=mode, align_corners=align_corners) + for image in images + ] + + +def match_scales(image, + targets, + num_scales, + mode='bilinear', + align_corners=True): + """ + Interpolate one image to produce a list of images with the same shape as targets + + Parameters + ---------- + image : torch.Tensor [B,?,h,w] + Input image + targets : list of torch.Tensor [B,?,?,?] + Tensors with the target resolutions + num_scales : int + Number of considered scales + mode : str + Interpolation mode + align_corners : bool + True if corners will be aligned after interpolation + + Returns + ------- + images : list of torch.Tensor [B,?,?,?] 
+ List of images with the same resolutions as targets + """ + # For all scales + images = [] + image_shape = image.shape[-2:] + for i in range(num_scales): + target_shape = targets[i].shape + # If image shape is equal to target shape + if same_shape(image_shape, target_shape): + images.append(image) + else: + # Otherwise, interpolate + images.append( + interpolate_image( + image, + target_shape, + mode=mode, + align_corners=align_corners)) + # Return scaled images + return images + + +######################################################################################################################## + + +@lru_cache(maxsize=None) +def meshgrid(B, H, W, dtype, device, normalized=False): + """ + Create meshgrid with a specific resolution + + Parameters + ---------- + B : int + Batch size + H : int + Height size + W : int + Width size + dtype : torch.dtype + Meshgrid type + device : torch.device + Meshgrid device + normalized : bool + True if grid is normalized between -1 and 1 + + Returns + ------- + xs : torch.Tensor [B,1,W] + Meshgrid in dimension x + ys : torch.Tensor [B,H,1] + Meshgrid in dimension y + """ + if normalized: + xs = torch.linspace(-1, 1, W, device=device, dtype=dtype) + ys = torch.linspace(-1, 1, H, device=device, dtype=dtype) + else: + xs = torch.linspace(0, W - 1, W, device=device, dtype=dtype) + ys = torch.linspace(0, H - 1, H, device=device, dtype=dtype) + ys, xs = torch.meshgrid([ys, xs]) + return xs.repeat([B, 1, 1]), ys.repeat([B, 1, 1]) + + +@lru_cache(maxsize=None) +def image_grid(B, H, W, dtype, device, normalized=False): + """ + Create an image grid with a specific resolution + + Parameters + ---------- + B : int + Batch size + H : int + Height size + W : int + Width size + dtype : torch.dtype + Meshgrid type + device : torch.device + Meshgrid device + normalized : bool + True if grid is normalized between -1 and 1 + + Returns + ------- + grid : torch.Tensor [B,3,H,W] + Image grid containing a meshgrid in x, y and 1 + """ + xs, ys = meshgrid(B, H, W, dtype, device, normalized=normalized) + ones = torch.ones_like(xs) + grid = torch.stack([xs, ys, ones], dim=1) + return grid + + +######################################################################################################################## diff --git a/modelscope/models/cv/video_depth_estimation/utils/load.py b/modelscope/models/cv/video_depth_estimation/utils/load.py new file mode 100644 index 00000000..8c2b326c --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/load.py @@ -0,0 +1,204 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import importlib +import logging +import os +import warnings +from collections import OrderedDict +from inspect import signature + +import torch + +from modelscope.models.cv.video_depth_estimation.utils.horovod import print0 +from modelscope.models.cv.video_depth_estimation.utils.misc import (make_list, + pcolor, + same_shape) +from modelscope.models.cv.video_depth_estimation.utils.types import is_str + + +def set_debug(debug): + """ + Enable or disable debug terminal logging + + Parameters + ---------- + debug : bool + Debugging flag (True to enable) + """ + # Disable logging if requested + if not debug: + os.environ['NCCL_DEBUG'] = '' + os.environ['WANDB_SILENT'] = 'false' + warnings.filterwarnings('ignore') + logging.disable(logging.CRITICAL) + + +def filter_args(func, keys): + """ + Filters a dictionary so it only contains keys that 
are arguments of a function + + Parameters + ---------- + func : Function + Function for which we are filtering the dictionary + keys : dict + Dictionary with keys we are filtering + + Returns + ------- + filtered : dict + Dictionary containing only keys that are arguments of func + """ + filtered = {} + sign = list(signature(func).parameters.keys()) + for k, v in {**keys}.items(): + if k in sign: + filtered[k] = v + return filtered + + +def filter_args_create(func, keys): + """ + Filters a dictionary so it only contains keys that are arguments of a function + and creates a function with those arguments + + Parameters + ---------- + func : Function + Function for which we are filtering the dictionary + keys : dict + Dictionary with keys we are filtering + + Returns + ------- + func : Function + Function with filtered keys as arguments + """ + return func(**filter_args(func, keys)) + + +def load_class(filename, paths, concat=True): + """ + Look for a file in different locations and return its method with the same name + Optionally, you can use concat to search in path.filename instead + + Parameters + ---------- + filename : str + Name of the file we are searching for + paths : str or list of str + Folders in which the file will be searched + concat : bool + Flag to concatenate filename to each path during the search + + Returns + ------- + method : Function + Loaded method + """ + # for each path in paths + for path in make_list(paths): + # Create full path + full_path = '{}.{}'.format(path, filename) if concat else path + if importlib.util.find_spec(full_path): + # Return method with same name as the file + return getattr(importlib.import_module(full_path), filename) + raise ValueError('Unknown class {}'.format(filename)) + + +def load_class_args_create(filename, paths, args={}, concat=True): + """Loads a class (filename) and returns an instance with filtered arguments (args)""" + class_type = load_class(filename, paths, concat) + return filter_args_create(class_type, args) + + +def load_network(network, path, prefixes=''): + """ + Loads a pretrained network + + Parameters + ---------- + network : nn.Module + Network that will receive the pretrained weights + path : str + File containing a 'state_dict' key with pretrained network weights + prefixes : str or list of str + Layer name prefixes to consider when loading the network + + Returns + ------- + network : nn.Module + Updated network with pretrained weights + """ + prefixes = make_list(prefixes) + # If path is a string + if is_str(path): + saved_state_dict = torch.load(path, map_location='cpu')['state_dict'] + if path.endswith('.pth.tar'): + saved_state_dict = backwards_state_dict(saved_state_dict) + # If state dict is already provided + else: + saved_state_dict = path + # Get network state dict + network_state_dict = network.state_dict() + + updated_state_dict = OrderedDict() + n, n_total = 0, len(network_state_dict.keys()) + for key, val in saved_state_dict.items(): + for prefix in prefixes: + prefix = prefix + '.' 
+ if prefix in key: + idx = key.find(prefix) + len(prefix) + key = key[idx:] + if key in network_state_dict.keys() and \ + same_shape(val.shape, network_state_dict[key].shape): + updated_state_dict[key] = val + n += 1 + try: + network.load_state_dict(updated_state_dict, strict=True) + except Exception as e: + print(e) + network.load_state_dict(updated_state_dict, strict=False) + base_color, attrs = 'cyan', ['bold', 'dark'] + color = 'green' if n == n_total else 'yellow' if n > 0 else 'red' + print0( + pcolor( + '=====###### Pretrained {} loaded:'.format(prefixes[0]), + base_color, + attrs=attrs) + + pcolor(' {}/{} '.format(n, n_total), color, attrs=attrs) + + pcolor('tensors', base_color, attrs=attrs)) + return network + + +def backwards_state_dict(state_dict): + """ + Modify the state dict of older models for backwards compatibility + + Parameters + ---------- + state_dict : dict + Model state dict with pretrained weights + + Returns + ------- + state_dict : dict + Updated model state dict with modified layer names + """ + # List of layer names to change + changes = (('model.model', 'model'), ('pose_network', 'pose_net'), + ('disp_network', 'depth_net')) + # Iterate over all keys and values + updated_state_dict = OrderedDict() + for key, val in state_dict.items(): + # Ad hoc changes due to version changes + key = '{}.{}'.format('model', key) + if 'disp_network' in key: + key = key.replace('conv3.0.weight', 'conv3.weight') + key = key.replace('conv3.0.bias', 'conv3.bias') + # Change layer names + for change in changes: + key = key.replace('{}.'.format(change[0]), '{}.'.format(change[1])) + updated_state_dict[key] = val + # Return updated state dict + return updated_state_dict diff --git a/modelscope/models/cv/video_depth_estimation/utils/misc.py b/modelscope/models/cv/video_depth_estimation/utils/misc.py new file mode 100644 index 00000000..b66a8c74 --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/misc.py @@ -0,0 +1,107 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +from termcolor import colored + +from modelscope.models.cv.video_depth_estimation.utils.types import is_list + +######################################################################################################################## + + +def filter_dict(dictionary, keywords): + """ + Returns only the keywords that are part of a dictionary + + Parameters + ---------- + dictionary : dict + Dictionary for filtering + keywords : list of str + Keywords that will be filtered + + Returns + ------- + keywords : list of str + List containing the keywords that are keys in dictionary + """ + return [key for key in keywords if key in dictionary] + + +######################################################################################################################## + + +def make_list(var, n=None): + """ + Wraps the input into a list, and optionally repeats it to be size n + + Parameters + ---------- + var : Any + Variable to be wrapped in a list + n : int + How much the wrapped variable will be repeated + + Returns + ------- + var_list : list + List generated from var + """ + var = var if is_list(var) else [var] + if n is None: + return var + else: + assert len(var) == 1 or len( + var) == n, 'Wrong list length for make_list' + return var * n if len(var) == 1 else var + + +######################################################################################################################## + + 
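+# Illustrative examples for filter_dict and make_list above (not in the
+# original source):
+#
+#     filter_dict({'a': 1, 'b': 2}, ['a', 'c'])  ->  ['a']
+#     make_list(3, n=4)                          ->  [3, 3, 3, 3]
+#     make_list([1, 2], n=2)                     ->  [1, 2]
+
+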
+def same_shape(shape1, shape2): + """ + Checks if two shapes are the same + + Parameters + ---------- + shape1 : tuple + First shape + shape2 : tuple + Second shape + + Returns + ------- + flag : bool + True if both shapes are the same (same length and dimensions) + """ + if len(shape1) != len(shape2): + return False + for i in range(len(shape1)): + if shape1[i] != shape2[i]: + return False + return True + + +######################################################################################################################## + + +def pcolor(string, color, on_color=None, attrs=None): + """ + Produces a colored string for printing + + Parameters + ---------- + string : str + String that will be colored + color : str + Color to use + on_color : str + Background color to use + attrs : list of str + Different attributes for the string + + Returns + ------- + string: str + Colored string + """ + return colored(string, color, on_color, attrs) diff --git a/modelscope/models/cv/video_depth_estimation/utils/types.py b/modelscope/models/cv/video_depth_estimation/utils/types.py new file mode 100644 index 00000000..d9cbd98a --- /dev/null +++ b/modelscope/models/cv/video_depth_estimation/utils/types.py @@ -0,0 +1,55 @@ +# Part of the implementation is borrowed and modified from PackNet-SfM, +# made publicly available under the MIT License at https://github.com/TRI-ML/packnet-sfm +import numpy as np +import torch +import yacs + +######################################################################################################################## + + +def is_numpy(data): + """Checks if data is a numpy array.""" + return isinstance(data, np.ndarray) + + +def is_tensor(data): + """Checks if data is a torch tensor.""" + return type(data) == torch.Tensor + + +def is_tuple(data): + """Checks if data is a tuple.""" + return isinstance(data, tuple) + + +def is_list(data): + """Checks if data is a list.""" + return isinstance(data, list) + + +def is_dict(data): + """Checks if data is a dictionary.""" + return isinstance(data, dict) + + +def is_str(data): + """Checks if data is a string.""" + return isinstance(data, str) + + +def is_int(data): + """Checks if data is an integer.""" + return isinstance(data, int) + + +def is_seq(data): + """Checks if data is a list or tuple.""" + return is_tuple(data) or is_list(data) + + +def is_cfg(data): + """Checks if data is a configuration node""" + return type(data) == yacs.config.CfgNode + + +######################################################################################################################## diff --git a/modelscope/models/cv/video_frame_interpolation/VFINet_arch.py b/modelscope/models/cv/video_frame_interpolation/VFINet_arch.py new file mode 100644 index 00000000..33486cf9 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/VFINet_arch.py @@ -0,0 +1,53 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import torch +import torch.nn as nn + +from modelscope.models.cv.video_frame_interpolation.flow_model.raft import RAFT +from modelscope.models.cv.video_frame_interpolation.interp_model.IFNet_swin import \ + IFNet +from modelscope.models.cv.video_frame_interpolation.interp_model.refinenet_arch import ( + InterpNet, InterpNetDs) + + +class VFINet(nn.Module): + + def __init__(self, args, Ds_flag=False): + super(VFINet, self).__init__() + self.flownet = RAFT(args) + self.internet = InterpNet() + if Ds_flag: + self.internet_Ds = InterpNetDs() + + def img_trans(self, img_tensor): # in format of RGB + img_tensor = img_tensor / 255.0 + mean = torch.Tensor([0.429, 0.431, 0.397]).view(1, 3, 1, + 1).type_as(img_tensor) + img_tensor -= mean + return img_tensor + + def add_mean(self, x): + mean = torch.Tensor([0.429, 0.431, 0.397]).view(1, 3, 1, 1).type_as(x) + return x + mean + + def forward(self, imgs, timestep=0.5): + self.flownet.eval() + self.internet.eval() + with torch.no_grad(): + img0 = imgs[:, :3] + img1 = imgs[:, 3:6] + img2 = imgs[:, 6:9] + img3 = imgs[:, 9:12] + + _, F10_up = self.flownet(img1, img0, iters=12, test_mode=True) + _, F12_up = self.flownet(img1, img2, iters=12, test_mode=True) + _, F21_up = self.flownet(img2, img1, iters=12, test_mode=True) + _, F23_up = self.flownet(img2, img3, iters=12, test_mode=True) + + img1 = self.img_trans(img1.clone()) + img2 = self.img_trans(img2.clone()) + + It_warp = self.internet( + img1, img2, F10_up, F12_up, F21_up, F23_up, timestep=timestep) + It_warp = self.add_mean(It_warp) + + return It_warp diff --git a/modelscope/models/cv/video_frame_interpolation/VFINet_for_video_frame_interpolation.py b/modelscope/models/cv/video_frame_interpolation/VFINet_for_video_frame_interpolation.py new file mode 100644 index 00000000..a7ea00e1 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/VFINet_for_video_frame_interpolation.py @@ -0,0 +1,98 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from copy import deepcopy +from typing import Any, Dict, Union + +import torch.cuda +import torch.nn.functional as F +from torch.nn.parallel import DataParallel, DistributedDataParallel + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +# from modelscope.models.cv.video_super_resolution.common import charbonnier_loss +from modelscope.models.cv.video_frame_interpolation.VFINet_arch import VFINet +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() +__all__ = ['VFINetForVideoFrameInterpolation'] + + +def convert(param): + return { + k.replace('module.', ''): v + for k, v in param.items() if 'module.' in k + } + + +@MODELS.register_module( + Tasks.video_frame_interpolation, + module_name=Models.video_frame_interpolation) +class VFINetForVideoFrameInterpolation(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the video frame-interpolation model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
+ + """ + super().__init__(model_dir, *args, **kwargs) + + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + + self.model_dir = model_dir + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + flownet_path = os.path.join(model_dir, 'raft-sintel.pt') + internet_path = os.path.join(model_dir, 'interpnet.pt') + + self.model = VFINet(self.config.model.network, Ds_flag=True) + self._load_pretrained(flownet_path, internet_path) + + def _load_pretrained(self, flownet_path, internet_path): + state_dict_flownet = torch.load( + flownet_path, map_location=self._device) + state_dict_internet = torch.load( + internet_path, map_location=self._device) + + self.model.flownet.load_state_dict( + convert(state_dict_flownet), strict=True) + self.model.internet.load_state_dict( + convert(state_dict_internet), strict=True) + self.model.internet_Ds.load_state_dict( + convert(state_dict_internet), strict=True) + logger.info('load model done.') + + def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]: + return {'output': self.model(input)} + + def _evaluate_postprocess(self, input: Tensor, + target: Tensor) -> Dict[str, list]: + preds = self.model(input) + del input + torch.cuda.empty_cache() + return {'pred': preds, 'target': target} + + def forward(self, inputs: Dict[str, + Tensor]) -> Dict[str, Union[list, Tensor]]: + """return the result by the model + + Args: + inputs (Tensor): the preprocessed data + + Returns: + Dict[str, Tensor]: results + """ + + if 'target' in inputs: + return self._evaluate_postprocess(**inputs) + else: + return self._inference_forward(**inputs) diff --git a/modelscope/models/cv/video_frame_interpolation/__init__.py b/modelscope/models/cv/video_frame_interpolation/__init__.py new file mode 100644 index 00000000..657a375a --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .VFINet_arch import VFINet + +else: + _import_structure = {'VFINet_arch': ['VFINet']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_frame_interpolation/flow_model/__init__.py b/modelscope/models/cv/video_frame_interpolation/flow_model/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_frame_interpolation/flow_model/corr.py b/modelscope/models/cv/video_frame_interpolation/flow_model/corr.py new file mode 100644 index 00000000..86009405 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/flow_model/corr.py @@ -0,0 +1,92 @@ +# The implementation is adopted from RAFT, +# made publicly available under the BSD-3-Clause license at https://github.com/princeton-vl/RAFT + +import torch +import torch.nn.functional as F + +from modelscope.models.cv.video_frame_interpolation.utils.utils import ( + bilinear_sampler, coords_grid) + + +class CorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + + # all pairs correlation + corr = CorrBlock.corr(fmap1, fmap2) + + batch, h1, w1, dim, h2, w2 = corr.shape + corr = corr.reshape(batch * h1 * w1, dim, h2, w2) + + self.corr_pyramid.append(corr) + for i in range(self.num_levels - 1): + corr = F.avg_pool2d(corr, 2, stride=2) + self.corr_pyramid.append(corr) + + def __call__(self, coords): + r = self.radius + coords = coords.permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + out_pyramid = [] + for i in range(self.num_levels): + corr = self.corr_pyramid[i] + dx = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + dy = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + delta = torch.stack(torch.meshgrid(dy, dx), axis=-1) + + centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) / 2**i + delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) + coords_lvl = centroid_lvl + delta_lvl + + corr = bilinear_sampler(corr, coords_lvl) + corr = corr.view(batch, h1, w1, -1) + out_pyramid.append(corr) + + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + @staticmethod + def corr(fmap1, fmap2): + batch, dim, ht, wd = fmap1.shape + fmap1 = fmap1.view(batch, dim, ht * wd) + fmap2 = fmap2.view(batch, dim, ht * wd) + + corr = torch.matmul(fmap1.transpose(1, 2), fmap2) + corr = corr.view(batch, ht, wd, 1, ht, wd) + return corr / torch.sqrt(torch.tensor(dim).float()) + + +class AlternateCorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + + self.pyramid = [(fmap1, fmap2)] + for i in range(self.num_levels): + fmap1 = F.avg_pool2d(fmap1, 2, stride=2) + fmap2 = F.avg_pool2d(fmap2, 2, stride=2) + self.pyramid.append((fmap1, fmap2)) + + def __call__(self, coords): + coords = coords.permute(0, 2, 3, 1) + B, H, W, _ = coords.shape + dim = self.pyramid[0][0].shape[1] + + corr_list = [] + for i in range(self.num_levels): + r = self.radius + fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous() + fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() + + coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() + corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) + 
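+            # NOTE: alt_cuda_corr is RAFT's optional compiled CUDA extension and is
+            # not imported in this file, so this alternate-correlation path (taken
+            # when args.alternate_corr is True) raises a NameError unless the
+            # extension is built and imported separately.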
corr_list.append(corr.squeeze(1)) + + corr = torch.stack(corr_list, dim=1) + corr = corr.reshape(B, -1, H, W) + return corr / torch.sqrt(torch.tensor(dim).float()) diff --git a/modelscope/models/cv/video_frame_interpolation/flow_model/extractor.py b/modelscope/models/cv/video_frame_interpolation/flow_model/extractor.py new file mode 100644 index 00000000..c0ebef47 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/flow_model/extractor.py @@ -0,0 +1,288 @@ +# The implementation is adopted from RAFT, +# made publicly available under the BSD-3-Clause license at https://github.com/princeton-vl/RAFT + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(ResidualBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, padding=1, stride=stride) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm3 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes) + self.norm2 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm3 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + if not stride == 1: + self.norm3 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm3) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BottleneckBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(BottleneckBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes // 4, kernel_size=1, padding=0) + self.conv2 = nn.Conv2d( + planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride) + self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm4 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes // 4) + self.norm2 = nn.BatchNorm2d(planes // 4) + self.norm3 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm4 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes // 4) + self.norm2 = nn.InstanceNorm2d(planes // 4) + self.norm3 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm4 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + 
self.norm3 = nn.Sequential() + if not stride == 1: + self.norm4 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm4) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + y = self.relu(self.norm3(self.conv3(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BasicEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(BasicEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=2) + self.layer3 = self._make_layer(128, stride=2) + + # output convolution + self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x + + +class SmallEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(SmallEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(32) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(32) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 32 + self.layer1 = self._make_layer(32, stride=1) + self.layer2 = self._make_layer(64, stride=2) + self.layer3 = self._make_layer(96, stride=2) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, 
nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = BottleneckBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x diff --git a/modelscope/models/cv/video_frame_interpolation/flow_model/raft.py b/modelscope/models/cv/video_frame_interpolation/flow_model/raft.py new file mode 100644 index 00000000..87b7a2ed --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/flow_model/raft.py @@ -0,0 +1,157 @@ +# The implementation is adopted from RAFT, +# made publicly available under the BSD-3-Clause license at https://github.com/princeton-vl/RAFT + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.video_frame_interpolation.flow_model.corr import ( + AlternateCorrBlock, CorrBlock) +from modelscope.models.cv.video_frame_interpolation.flow_model.extractor import ( + BasicEncoder, SmallEncoder) +from modelscope.models.cv.video_frame_interpolation.flow_model.update import ( + BasicUpdateBlock, SmallUpdateBlock) +from modelscope.models.cv.video_frame_interpolation.utils.utils import ( + bilinear_sampler, coords_grid, upflow8) + +autocast = torch.cuda.amp.autocast + + +class RAFT(nn.Module): + + def __init__(self, args): + super(RAFT, self).__init__() + self.args = args + + if args.small: + self.hidden_dim = hdim = 96 + self.context_dim = cdim = 64 + self.args.corr_levels = 4 + self.args.corr_radius = 3 + + else: + self.hidden_dim = hdim = 128 + self.context_dim = cdim = 128 + self.args.corr_levels = 4 + self.args.corr_radius = 4 + + if 'dropout' not in self.args: + self.args.dropout = 0 + + if 'alternate_corr' not in self.args: + self.args.alternate_corr = False + + # feature network, context network, and update block + if args.small: + self.fnet = SmallEncoder( + output_dim=128, norm_fn='instance', dropout=self.args.dropout) + self.cnet = SmallEncoder( + output_dim=hdim + cdim, + norm_fn='none', + dropout=self.args.dropout) + self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim) + + else: + self.fnet = BasicEncoder( + output_dim=256, norm_fn='instance', dropout=self.args.dropout) + self.cnet = BasicEncoder( + output_dim=hdim + cdim, + norm_fn='batch', + dropout=self.args.dropout) + self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, C, H, W = img.shape + coords0 = coords_grid(N, H // 8, W // 8, device=img.device) + coords1 = coords_grid(N, H // 8, W // 8, device=img.device) + + # optical flow computed as difference: flow = coords1 - coords0 + return coords0, coords1 + + def 
upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, _, H, W = flow.shape + mask = mask.view(N, 1, 9, 8, 8, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = F.unfold(8 * flow, [3, 3], padding=1) + up_flow = up_flow.view(N, 2, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, 2, 8 * H, 8 * W) + + def forward(self, + image1, + image2, + iters=12, + flow_init=None, + upsample=True, + test_mode=False): + """ Estimate optical flow between pair of frames """ + + image1 = 2 * (image1 / 255.0) - 1.0 + image2 = 2 * (image2 / 255.0) - 1.0 + + image1 = image1.contiguous() + image2 = image2.contiguous() + + hdim = self.hidden_dim + cdim = self.context_dim + + # run the feature network + with autocast(enabled=self.args.mixed_precision): + fmap1, fmap2 = self.fnet([image1, image2]) + + fmap1 = fmap1.float() + fmap2 = fmap2.float() + if self.args.alternate_corr: + corr_fn = AlternateCorrBlock( + fmap1, fmap2, radius=self.args.corr_radius) + else: + corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + + # run the context network + with autocast(enabled=self.args.mixed_precision): + cnet = self.cnet(image1) + net, inp = torch.split(cnet, [hdim, cdim], dim=1) + net = torch.tanh(net) + inp = torch.relu(inp) + + coords0, coords1 = self.initialize_flow(image1) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + + flow = coords1 - coords0 + with autocast(enabled=self.args.mixed_precision): + net, up_mask, delta_flow = self.update_block( + net, inp, corr, flow) + + # F(t+1) = F(t) + \Delta(t) + coords1 = coords1 + delta_flow + + # upsample predictions + if up_mask is None: + flow_up = upflow8(coords1 - coords0) + else: + flow_up = self.upsample_flow(coords1 - coords0, up_mask) + + flow_predictions.append(flow_up) + + if test_mode: + return coords1 - coords0, flow_up + + return flow_predictions diff --git a/modelscope/models/cv/video_frame_interpolation/flow_model/update.py b/modelscope/models/cv/video_frame_interpolation/flow_model/update.py new file mode 100644 index 00000000..29a20db1 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/flow_model/update.py @@ -0,0 +1,160 @@ +# The implementation is adopted from RAFT, +# made publicly available under the BSD-3-Clause license at https://github.com/princeton-vl/RAFT + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FlowHead(nn.Module): + + def __init__(self, input_dim=128, hidden_dim=256): + super(FlowHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + + +class ConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convr = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convq = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + + h = (1 - z) 
* h + z * q + return h + + +class SepConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convr1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convq1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + + self.convz2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convr2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convq2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + + def forward(self, h, x): + # horizontal + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + return h + + +class SmallMotionEncoder(nn.Module): + + def __init__(self, args): + super(SmallMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0) + self.convf1 = nn.Conv2d(2, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 32, 3, padding=1) + self.conv = nn.Conv2d(128, 80, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class BasicMotionEncoder(nn.Module): + + def __init__(self, args): + super(BasicMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0) + self.convc2 = nn.Conv2d(256, 192, 3, padding=1) + self.convf1 = nn.Conv2d(2, 128, 7, padding=3) + self.convf2 = nn.Conv2d(128, 64, 3, padding=1) + self.conv = nn.Conv2d(64 + 192, 128 - 2, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class SmallUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=96): + super(SmallUpdateBlock, self).__init__() + self.encoder = SmallMotionEncoder(args) + self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82 + 64) + self.flow_head = FlowHead(hidden_dim, hidden_dim=128) + + def forward(self, net, inp, corr, flow): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + return net, None, delta_flow + + +class BasicUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=128, input_dim=128): + super(BasicUpdateBlock, self).__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + self.gru = SepConvGRU( + hidden_dim=hidden_dim, input_dim=128 + hidden_dim) + self.flow_head = FlowHead(hidden_dim, hidden_dim=256) + + self.mask = nn.Sequential( + nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(256, 64 * 9, 1, padding=0)) + + def forward(self, net, inp, corr, flow, 
upsample=True): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + # scale mask to balence gradients + mask = .25 * self.mask(net) + return net, mask, delta_flow diff --git a/modelscope/models/cv/video_frame_interpolation/interp_model/IFNet_swin.py b/modelscope/models/cv/video_frame_interpolation/interp_model/IFNet_swin.py new file mode 100644 index 00000000..3e82bde2 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/interp_model/IFNet_swin.py @@ -0,0 +1,434 @@ +# Part of the implementation is borrowed and modified from RIFE, +# publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import trunc_normal_ + +from modelscope.models.cv.video_frame_interpolation.interp_model.transformer_layers import ( + RTFL, PatchEmbed, PatchUnEmbed) + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +backwarp_tenGrid = {} + + +def warp(tenInput, tenFlow): + k = (str(tenFlow.device), str(tenFlow.size())) + if k not in backwarp_tenGrid: + tenHorizontal = torch.linspace( + -1.0, 1.0, tenFlow.shape[3], device=device).view( + 1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, + tenFlow.shape[2], -1) + tenVertical = torch.linspace( + -1.0, 1.0, tenFlow.shape[2], + device=device).view(1, 1, tenFlow.shape[2], + 1).expand(tenFlow.shape[0], -1, -1, + tenFlow.shape[3]) + backwarp_tenGrid[k] = torch.cat([tenHorizontal, tenVertical], + 1).to(device) + + tmp1 = tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0) + tmp2 = tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0) + tenFlow = torch.cat([tmp1, tmp2], 1) + + g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1) + return torch.nn.functional.grid_sample( + input=tenInput, + grid=torch.clamp(g, -1, 1), + mode='bilinear', + padding_mode='border', + align_corners=True) + + +def conv_wo_act(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True), ) + + +def conv(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True), nn.PReLU(out_planes)) + + +def conv_bn(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=False), nn.BatchNorm2d(out_planes), nn.PReLU(out_planes)) + + +class TransModel(nn.Module): + + def __init__(self, + img_size=64, + patch_size=1, + embed_dim=64, + depths=[[3, 3]], + num_heads=[[2, 2]], + window_size=4, + mlp_ratio=2, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + use_checkpoint=False, + resi_connection='1conv', + use_crossattn=[[[False, False, False, False], + [True, True, True, True]]]): + super(TransModel, self).__init__() + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = embed_dim + self.mlp_ratio = mlp_ratio + + self.patch_embed = PatchEmbed( + 
img_size=img_size, + patch_size=patch_size, + in_chans=embed_dim, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # merge non-overlapping patches into image + self.patch_unembed = PatchUnEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=embed_dim, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr0 = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths[0])) + ] # stochastic depth decay rule + + self.layers0 = nn.ModuleList() + num_layers = len(depths[0]) + for i_layer in range(num_layers): + layer = RTFL( + dim=embed_dim, + input_resolution=(patches_resolution[0], + patches_resolution[1]), + depth=depths[0][i_layer], + num_heads=num_heads[0][i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr0[sum(depths[0][:i_layer]):sum(depths[0][:i_layer + + 1])], + norm_layer=norm_layer, + downsample=None, + use_checkpoint=use_checkpoint, + img_size=(img_size[0], img_size[1]), + patch_size=patch_size, + resi_connection=resi_connection, + use_crossattn=use_crossattn[0][i_layer]) + self.layers0.append(layer) + + self.norm = norm_layer(self.num_features) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + def forward_features(self, x, layers): + x_size = (x.shape[2], x.shape[3]) + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + if isinstance(layers, nn.ModuleList): + for layer in layers: + x = layer(x, x_size) + else: + x = layers(x, x_size) + + x = self.norm(x) # B L C + x = self.patch_unembed(x, x_size) + + return x + + def forward(self, x): + out = self.forward_features(x, self.layers0) + return out + + +class IFBlock(nn.Module): + + def __init__(self, in_planes, scale=1, c=64): + super(IFBlock, self).__init__() + self.scale = scale + self.conv0 = nn.Sequential( + conv(in_planes, c // 2, 3, 2, 1), + conv(c // 2, c, 3, 2, 1), + conv(c, c, 3, 1, 1), + ) + + self.trans = TransModel( + img_size=(128 // scale, 128 // scale), + patch_size=1, + embed_dim=c, + depths=[[3, 3]], + num_heads=[[2, 2]]) + + self.conv1 = nn.Sequential( + conv(c, c, 3, 1, 1), + conv(c, c, 3, 1, 1), + ) + + self.up = nn.ConvTranspose2d(c, 4, 4, 2, 1) + + self.conv2 = nn.Conv2d(4, 4, 3, 1, 1) + + def forward(self, x, flow0, flow1): + if self.scale != 1: + x = F.interpolate( + x, + scale_factor=1. / self.scale, + mode='bilinear', + align_corners=False) + flow0 = F.interpolate( + flow0, + scale_factor=1. / self.scale, + mode='bilinear', + align_corners=False) * (1. / self.scale) + flow1 = F.interpolate( + flow1, + scale_factor=1. 
/ self.scale, + mode='bilinear', + align_corners=False) * (1. / self.scale) + + x = torch.cat((x, flow0, flow1), 1) + + x = self.conv0(x) + x = self.trans(x) + x = self.conv1(x) + x + + # upsample 2.0 + x = self.up(x) + + # upsample 2.0 + x = self.conv2(x) + flow = F.interpolate( + x, scale_factor=2.0, mode='bilinear', align_corners=False) * 2.0 + + if self.scale != 1: + flow = F.interpolate( + flow, + scale_factor=self.scale, + mode='bilinear', + align_corners=False) * self.scale + + flow0 = flow[:, :2, :, :] + flow1 = flow[:, 2:, :, :] + + return flow0, flow1 + + +class IFBlock_wo_Swin(nn.Module): + + def __init__(self, in_planes, scale=1, c=64): + super(IFBlock_wo_Swin, self).__init__() + self.scale = scale + self.conv0 = nn.Sequential( + conv(in_planes, c // 2, 3, 2, 1), + conv(c // 2, c, 3, 2, 1), + ) + + self.convblock1 = nn.Sequential(conv(c, c), conv(c, c), conv(c, c)) + self.convblock2 = nn.Sequential(conv(c, c), conv(c, c), conv(c, c)) + + self.up = nn.ConvTranspose2d(c, 4, 4, 2, 1) + + self.conv2 = nn.Conv2d(4, 4, 3, 1, 1) + + def forward(self, x, flow0, flow1): + if self.scale != 1: + x = F.interpolate( + x, + scale_factor=1. / self.scale, + mode='bilinear', + align_corners=False) + flow0 = F.interpolate( + flow0, + scale_factor=1. / self.scale, + mode='bilinear', + align_corners=False) * (1. / self.scale) + flow1 = F.interpolate( + flow1, + scale_factor=1. / self.scale, + mode='bilinear', + align_corners=False) * (1. / self.scale) + + x = torch.cat((x, flow0, flow1), 1) + + x = self.conv0(x) + x = self.convblock1(x) + x + x = self.convblock2(x) + x + # upsample 2.0 + x = self.up(x) + + # upsample 2.0 + x = self.conv2(x) + flow = F.interpolate( + x, scale_factor=2.0, mode='bilinear', align_corners=False) * 2.0 + + if self.scale != 1: + flow = F.interpolate( + flow, + scale_factor=self.scale, + mode='bilinear', + align_corners=False) * self.scale + + flow0 = flow[:, :2, :, :] + flow1 = flow[:, 2:, :, :] + + return flow0, flow1 + + +class IFNet(nn.Module): + + def __init__(self): + super(IFNet, self).__init__() + self.block1 = IFBlock_wo_Swin(16, scale=4, c=128) + self.block2 = IFBlock(16, scale=2, c=64) + self.block3 = IFBlock(16, scale=1, c=32) + + # flow0: flow from img0 to img1 + # flow1: flow from img1 to img0 + def forward(self, img0, img1, flow0, flow1, sc_mode=2): + + if sc_mode == 0: + sc = 0.25 + elif sc_mode == 1: + sc = 0.5 + else: + sc = 1 + + if sc != 1: + img0_sc = F.interpolate( + img0, scale_factor=sc, mode='bilinear', align_corners=False) + img1_sc = F.interpolate( + img1, scale_factor=sc, mode='bilinear', align_corners=False) + flow0_sc = F.interpolate( + flow0, scale_factor=sc, mode='bilinear', + align_corners=False) * sc + flow1_sc = F.interpolate( + flow1, scale_factor=sc, mode='bilinear', + align_corners=False) * sc + else: + img0_sc = img0 + img1_sc = img1 + flow0_sc = flow0 + flow1_sc = flow1 + + warped_img0 = warp(img1_sc, flow0_sc) # -> img0 + warped_img1 = warp(img0_sc, flow1_sc) # -> img1 + flow0_1, flow1_1 = self.block1( + torch.cat((img0_sc, img1_sc, warped_img0, warped_img1), 1), + flow0_sc, flow1_sc) + F0_2 = (flow0_sc + flow0_1) + F1_2 = (flow1_sc + flow1_1) + + warped_img0 = warp(img1_sc, F0_2) # -> img0 + warped_img1 = warp(img0_sc, F1_2) # -> img1 + flow0_2, flow1_2 = self.block2( + torch.cat((img0_sc, img1_sc, warped_img0, warped_img1), 1), F0_2, + F1_2) + F0_3 = (F0_2 + flow0_2) + F1_3 = (F1_2 + flow1_2) + + warped_img0 = warp(img1_sc, F0_3) # -> img0 + warped_img1 = warp(img0_sc, F1_3) # -> img1 + flow0_3, flow1_3 = self.block3( + 
torch.cat((img0_sc, img1_sc, warped_img0, warped_img1), dim=1), + F0_3, F1_3) + flow_res_0 = flow0_1 + flow0_2 + flow0_3 + flow_res_1 = flow1_1 + flow1_2 + flow1_3 + + if sc != 1: + flow_res_0 = F.interpolate( + flow_res_0, + scale_factor=1 / sc, + mode='bilinear', + align_corners=False) / sc + flow_res_1 = F.interpolate( + flow_res_1, + scale_factor=1 / sc, + mode='bilinear', + align_corners=False) / sc + + F0_4 = flow0 + flow_res_0 + F1_4 = flow1 + flow_res_1 + + return F0_4, F1_4 diff --git a/modelscope/models/cv/video_frame_interpolation/interp_model/UNet.py b/modelscope/models/cv/video_frame_interpolation/interp_model/UNet.py new file mode 100644 index 00000000..34b5be19 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/interp_model/UNet.py @@ -0,0 +1,127 @@ +# Part of the implementation is borrowed and modified from QVI, publicly available at https://github.com/xuxy09/QVI + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class down(nn.Module): + + def __init__(self, inChannels, outChannels, filterSize): + super(down, self).__init__() + self.conv1 = nn.Conv2d( + inChannels, + outChannels, + filterSize, + stride=1, + padding=int((filterSize - 1) / 2)) + self.conv2 = nn.Conv2d( + outChannels, + outChannels, + filterSize, + stride=1, + padding=int((filterSize - 1) / 2)) + + def forward(self, x): + x = F.avg_pool2d(x, 2) + x = F.leaky_relu(self.conv1(x), negative_slope=0.1) + x = F.leaky_relu(self.conv2(x), negative_slope=0.1) + return x + + +class up(nn.Module): + + def __init__(self, inChannels, outChannels): + super(up, self).__init__() + self.conv1 = nn.Conv2d(inChannels, outChannels, 3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + 2 * outChannels, outChannels, 3, stride=1, padding=1) + + def forward(self, x, skpCn): + x = F.interpolate( + x, + size=[skpCn.size(2), skpCn.size(3)], + mode='bilinear', + align_corners=False) + x = F.leaky_relu(self.conv1(x), negative_slope=0.1) + x = F.leaky_relu( + self.conv2(torch.cat((x, skpCn), 1)), negative_slope=0.1) + return x + + +class Small_UNet(nn.Module): + + def __init__(self, inChannels, outChannels): + super(Small_UNet, self).__init__() + self.conv1 = nn.Conv2d(inChannels, 32, 7, stride=1, padding=3) + self.conv2 = nn.Conv2d(32, 32, 7, stride=1, padding=3) + self.down1 = down(32, 64, 5) + self.down2 = down(64, 128, 3) + self.down3 = down(128, 128, 3) + self.up1 = up(128, 128) + self.up2 = up(128, 64) + self.up3 = up(64, 32) + self.conv3 = nn.Conv2d(32, outChannels, 3, stride=1, padding=1) + + def forward(self, x): + x = F.leaky_relu(self.conv1(x), negative_slope=0.1) + s1 = F.leaky_relu(self.conv2(x), negative_slope=0.1) + s2 = self.down1(s1) + s3 = self.down2(s2) + x = self.down3(s3) + x = self.up1(x, s3) + x = self.up2(x, s2) + x1 = self.up3(x, s1) # feature + x = self.conv3(x1) # flow + return x, x1 + + +class Small_UNet_Ds(nn.Module): + + def __init__(self, inChannels, outChannels): + super(Small_UNet_Ds, self).__init__() + self.conv1_1 = nn.Conv2d(inChannels, 32, 5, stride=1, padding=2) + self.conv1_2 = nn.Conv2d(32, 32, 3, stride=1, padding=1) + self.conv2_1 = nn.Conv2d(32, 32, 3, stride=1, padding=1) + self.conv2_2 = nn.Conv2d(32, 32, 3, stride=1, padding=1) + + self.down1 = down(32, 64, 5) + self.down2 = down(64, 128, 3) + self.down3 = down(128, 128, 3) + self.up1 = up(128, 128) + self.up2 = up(128, 64) + self.up3 = up(64, 32) + self.conv3 = nn.Conv2d(32, 32, 3, stride=1, padding=1) + self.conv4 = nn.Conv2d(32, outChannels, 3, stride=1, padding=1) + + def forward(self, x): + + x0 = 
F.leaky_relu(self.conv1_1(x), negative_slope=0.1) + x0 = F.leaky_relu(self.conv1_2(x0), negative_slope=0.1) + + x = F.interpolate( + x0, + size=[x0.size(2) // 2, x0.size(3) // 2], + mode='bilinear', + align_corners=False) + + x = F.leaky_relu(self.conv2_1(x), negative_slope=0.1) + s1 = F.leaky_relu(self.conv2_2(x), negative_slope=0.1) + + s2 = self.down1(s1) + s3 = self.down2(s2) + x = self.down3(s3) + + x = self.up1(x, s3) + x = self.up2(x, s2) + x1 = self.up3(x, s1) + + x1 = F.interpolate( + x1, + size=[x0.size(2), x0.size(3)], + mode='bilinear', + align_corners=False) + + x1 = F.leaky_relu(self.conv3(x1), negative_slope=0.1) # feature + x = self.conv4(x1) # flow + return x, x1 diff --git a/modelscope/models/cv/video_frame_interpolation/interp_model/__init__.py b/modelscope/models/cv/video_frame_interpolation/interp_model/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_frame_interpolation/interp_model/flow_reversal.py b/modelscope/models/cv/video_frame_interpolation/interp_model/flow_reversal.py new file mode 100644 index 00000000..daac2ead --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/interp_model/flow_reversal.py @@ -0,0 +1,115 @@ +# The implementation is adopted from QVI, +# made publicly available at https://github.com/xuxy09/QVI + +# class WarpLayer warps image x based on optical flow flo. +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FlowReversal(nn.Module): + """docstring for WarpLayer""" + + def __init__(self, ): + super(FlowReversal, self).__init__() + + def forward(self, img, flo): + """ + -img: image (N, C, H, W) + -flo: optical flow (N, 2, H, W) + elements of flo is in [0, H] and [0, W] for dx, dy + + """ + + N, C, _, _ = img.size() + + # translate start-point optical flow to end-point optical flow + y = flo[:, 0:1:, :] + x = flo[:, 1:2, :, :] + + x = x.repeat(1, C, 1, 1) + y = y.repeat(1, C, 1, 1) + + x1 = torch.floor(x) + x2 = x1 + 1 + y1 = torch.floor(y) + y2 = y1 + 1 + + # firstly, get gaussian weights + w11, w12, w21, w22 = self.get_gaussian_weights(x, y, x1, x2, y1, y2) + + # secondly, sample each weighted corner + img11, o11 = self.sample_one(img, x1, y1, w11) + img12, o12 = self.sample_one(img, x1, y2, w12) + img21, o21 = self.sample_one(img, x2, y1, w21) + img22, o22 = self.sample_one(img, x2, y2, w22) + + imgw = img11 + img12 + img21 + img22 + o = o11 + o12 + o21 + o22 + + return imgw, o + + def get_gaussian_weights(self, x, y, x1, x2, y1, y2): + w11 = torch.exp(-((x - x1)**2 + (y - y1)**2)) + w12 = torch.exp(-((x - x1)**2 + (y - y2)**2)) + w21 = torch.exp(-((x - x2)**2 + (y - y1)**2)) + w22 = torch.exp(-((x - x2)**2 + (y - y2)**2)) + + return w11, w12, w21, w22 + + def sample_one(self, img, shiftx, shifty, weight): + """ + Input: + -img (N, C, H, W) + -shiftx, shifty (N, c, H, W) + """ + + N, C, H, W = img.size() + + # flatten all (all restored as Tensors) + flat_shiftx = shiftx.view(-1) + flat_shifty = shifty.view(-1) + flat_basex = torch.arange( + 0, H, requires_grad=False).view(-1, 1)[None, + None].cuda().long().repeat( + N, C, 1, W).view(-1) + flat_basey = torch.arange( + 0, W, requires_grad=False).view(1, -1)[None, + None].cuda().long().repeat( + N, C, H, 1).view(-1) + flat_weight = weight.view(-1) + flat_img = img.view(-1) + + idxn = torch.arange( + 0, N, + requires_grad=False).view(N, 1, 1, + 1).long().cuda().repeat(1, C, H, + W).view(-1) + idxc = torch.arange( + 0, C, + requires_grad=False).view(1, C, 1, + 1).long().cuda().repeat(N, 1, H, + W).view(-1) 
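+        # Combine the flat batch (idxn) and channel (idxc) indices built above
+        # with the shifted spatial indices (idxx, idxy) below into one flat
+        # index over the N*C*H*W buffer; put_(..., accumulate=True) then
+        # scatter-adds the weighted pixels (and the weights themselves) to the
+        # target positions, i.e. forward splatting, with out-of-range targets
+        # masked out first.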
+ idxx = flat_shiftx.long() + flat_basex + idxy = flat_shifty.long() + flat_basey + + mask = idxx.ge(0) & idxx.lt(H) & idxy.ge(0) & idxy.lt(W) + + ids = (idxn * C * H * W + idxc * H * W + idxx * W + idxy) + ids_mask = torch.masked_select(ids, mask).clone().cuda() + + img_warp = torch.zeros([ + N * C * H * W, + ]).cuda() + img_warp.put_( + ids_mask, + torch.masked_select(flat_img * flat_weight, mask), + accumulate=True) + + one_warp = torch.zeros([ + N * C * H * W, + ]).cuda() + one_warp.put_( + ids_mask, torch.masked_select(flat_weight, mask), accumulate=True) + + return img_warp.view(N, C, H, W), one_warp.view(N, C, H, W) diff --git a/modelscope/models/cv/video_frame_interpolation/interp_model/refinenet_arch.py b/modelscope/models/cv/video_frame_interpolation/interp_model/refinenet_arch.py new file mode 100644 index 00000000..1fa0136f --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/interp_model/refinenet_arch.py @@ -0,0 +1,488 @@ +# Part of the implementation is borrowed and modified from QVI, publicly available at https://github.com/xuxy09/QVI + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.video_frame_interpolation.interp_model.flow_reversal import \ + FlowReversal +from modelscope.models.cv.video_frame_interpolation.interp_model.IFNet_swin import \ + IFNet +from modelscope.models.cv.video_frame_interpolation.interp_model.UNet import \ + Small_UNet_Ds + + +class AcFusionLayer(nn.Module): + + def __init__(self, ): + super(AcFusionLayer, self).__init__() + + def forward(self, flo10, flo12, flo21, flo23, t=0.5): + return 0.5 * ((t + t**2) * flo12 - (t - t**2) * flo10), \ + 0.5 * (((1 - t) + (1 - t)**2) * flo21 - ((1 - t) - (1 - t)**2) * flo23) + # return 0.375 * flo12 - 0.125 * flo10, 0.375 * flo21 - 0.125 * flo23 + + +class Get_gradient(nn.Module): + + def __init__(self): + super(Get_gradient, self).__init__() + kernel_v = [[0, -1, 0], [0, 0, 0], [0, 1, 0]] + kernel_h = [[0, 0, 0], [-1, 0, 1], [0, 0, 0]] + kernel_h = torch.FloatTensor(kernel_h).unsqueeze(0).unsqueeze(0) + kernel_v = torch.FloatTensor(kernel_v).unsqueeze(0).unsqueeze(0) + self.weight_h = nn.Parameter(data=kernel_h, requires_grad=False) + self.weight_v = nn.Parameter(data=kernel_v, requires_grad=False) + + def forward(self, x): + x0 = x[:, 0] # R + x1 = x[:, 1] # G + x2 = x[:, 2] # B + x0_v = F.conv2d(x0.unsqueeze(1), self.weight_v, padding=1) + x0_h = F.conv2d(x0.unsqueeze(1), self.weight_h, padding=1) + + x1_v = F.conv2d(x1.unsqueeze(1), self.weight_v, padding=1) + x1_h = F.conv2d(x1.unsqueeze(1), self.weight_h, padding=1) + + x2_v = F.conv2d(x2.unsqueeze(1), self.weight_v, padding=1) + x2_h = F.conv2d(x2.unsqueeze(1), self.weight_h, padding=1) + + x0 = torch.sqrt(torch.pow(x0_v, 2) + torch.pow(x0_h, 2) + 1e-6) + x1 = torch.sqrt(torch.pow(x1_v, 2) + torch.pow(x1_h, 2) + 1e-6) + x2 = torch.sqrt(torch.pow(x2_v, 2) + torch.pow(x2_h, 2) + 1e-6) + + x = torch.cat([x0, x1, x2], dim=1) + return x + + +class LowPassFilter(nn.Module): + + def __init__(self): + super(LowPassFilter, self).__init__() + kernel_lpf = [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1]] + + kernel_lpf = torch.FloatTensor(kernel_lpf).unsqueeze(0).unsqueeze( + 0) / 49 + + self.weight_lpf = nn.Parameter(data=kernel_lpf, requires_grad=False) + + def forward(self, x): + x0 = x[:, 0] + x1 = x[:, 1] + y0 = F.conv2d(x0.unsqueeze(1), self.weight_lpf, 
padding=3) + y1 = F.conv2d(x1.unsqueeze(1), self.weight_lpf, padding=3) + + y = torch.cat([y0, y1], dim=1) + + return y + + +def backwarp(img, flow): + _, _, H, W = img.size() + + u = flow[:, 0, :, :] + v = flow[:, 1, :, :] + + gridX, gridY = np.meshgrid(np.arange(W), np.arange(H)) + + gridX = torch.tensor( + gridX, + requires_grad=False, + ).cuda() + gridY = torch.tensor( + gridY, + requires_grad=False, + ).cuda() + x = gridX.unsqueeze(0).expand_as(u).float() + u + y = gridY.unsqueeze(0).expand_as(v).float() + v + + x = 2 * (x / (W - 1) - 0.5) + y = 2 * (y / (H - 1) - 0.5) + + grid = torch.stack((x, y), dim=3) + + imgOut = torch.nn.functional.grid_sample( + img, grid, padding_mode='border', align_corners=True) + + return imgOut + + +class SmallMaskNet(nn.Module): + """A three-layer network for predicting mask""" + + def __init__(self, input, output): + super(SmallMaskNet, self).__init__() + self.conv1 = nn.Conv2d(input, 32, 5, padding=2) + self.conv2 = nn.Conv2d(32, 16, 3, padding=1) + self.conv3 = nn.Conv2d(16, output, 3, padding=1) + + def forward(self, x): + x = F.leaky_relu(self.conv1(x), negative_slope=0.1) + x = F.leaky_relu(self.conv2(x), negative_slope=0.1) + x = self.conv3(x) + return x + + +class StaticMaskNet(nn.Module): + """static mask""" + + def __init__(self, input, output): + super(StaticMaskNet, self).__init__() + + modules_body = [] + modules_body.append( + nn.Conv2d( + in_channels=input, + out_channels=32, + kernel_size=3, + stride=1, + padding=1)) + modules_body.append(nn.LeakyReLU(inplace=False, negative_slope=0.1)) + modules_body.append( + nn.Conv2d( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1)) + modules_body.append(nn.LeakyReLU(inplace=False, negative_slope=0.1)) + modules_body.append( + nn.Conv2d( + in_channels=32, + out_channels=16, + kernel_size=3, + stride=1, + padding=1)) + modules_body.append(nn.LeakyReLU(inplace=False, negative_slope=0.1)) + modules_body.append( + nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1)) + modules_body.append(nn.LeakyReLU(inplace=False, negative_slope=0.1)) + modules_body.append( + nn.Conv2d( + in_channels=16, + out_channels=output, + kernel_size=3, + stride=1, + padding=1)) + modules_body.append(nn.Sigmoid()) + + self.body = nn.Sequential(*modules_body) + + def forward(self, x): + y = self.body(x) + return y + + +def tensor_erode(bin_img, ksize=5): + B, C, H, W = bin_img.shape + pad = (ksize - 1) // 2 + bin_img = F.pad(bin_img, [pad, pad, pad, pad], mode='constant', value=0) + + patches = bin_img.unfold(dimension=2, size=ksize, step=1) + patches = patches.unfold(dimension=3, size=ksize, step=1) + + eroded, _ = patches.reshape(B, C, H, W, -1).max(dim=-1) + return eroded + + +class QVI_inter_Ds(nn.Module): + """Given flow, implement Quadratic Video Interpolation""" + + def __init__(self, debug_en=False, is_training=False): + super(QVI_inter_Ds, self).__init__() + self.acc = AcFusionLayer() + self.fwarp = FlowReversal() + self.refinenet = Small_UNet_Ds(20, 8) + self.masknet = SmallMaskNet(38, 1) + + self.staticnet = StaticMaskNet(56, 1) + self.lpfilter = LowPassFilter() + + self.get_grad = Get_gradient() + self.debug_en = debug_en + self.is_training = is_training + + def fill_flow_hole(self, ft, norm, ft_fill): + (N, C, H, W) = ft.shape + ft[norm == 0] = ft_fill[norm == 0] + + ft_1 = self.lpfilter(ft.clone()) + ft_ds = torch.nn.functional.interpolate( + input=ft_1, + size=(H // 4, W // 4), + mode='bilinear', + align_corners=False) + ft_up = 
torch.nn.functional.interpolate( + input=ft_ds, size=(H, W), mode='bilinear', align_corners=False) + + ft[norm == 0] = ft_up[norm == 0] + + return ft + + def forward(self, F10_Ds, F12_Ds, F21_Ds, F23_Ds, I1_Ds, I2_Ds, I1, I2, t): + if F12_Ds is None or F21_Ds is None: + return I1 + + if F10_Ds is not None and F23_Ds is not None: + F1t_Ds, F2t_Ds = self.acc(F10_Ds, F12_Ds, F21_Ds, F23_Ds, t) + + else: + F1t_Ds = t * F12_Ds + F2t_Ds = (1 - t) * F21_Ds + + # Flow Reversal + F1t_Ds2 = F.interpolate( + F1t_Ds, scale_factor=1.0 / 3, mode='nearest') / 3 + F2t_Ds2 = F.interpolate( + F2t_Ds, scale_factor=1.0 / 3, mode='nearest') / 3 + Ft1_Ds2, norm1_Ds2 = self.fwarp(F1t_Ds2, F1t_Ds2) + Ft1_Ds2 = -Ft1_Ds2 + Ft2_Ds2, norm2_Ds2 = self.fwarp(F2t_Ds2, F2t_Ds2) + Ft2_Ds2 = -Ft2_Ds2 + + Ft1_Ds2[norm1_Ds2 > 0] \ + = Ft1_Ds2[norm1_Ds2 > 0] / norm1_Ds2[norm1_Ds2 > 0].clone() + Ft2_Ds2[norm2_Ds2 > 0] \ + = Ft2_Ds2[norm2_Ds2 > 0] / norm2_Ds2[norm2_Ds2 > 0].clone() + if 1: + Ft1_Ds2_fill = -F1t_Ds2 + Ft2_Ds2_fill = -F2t_Ds2 + Ft1_Ds2 = self.fill_flow_hole(Ft1_Ds2, norm1_Ds2, Ft1_Ds2_fill) + Ft2_Ds2 = self.fill_flow_hole(Ft2_Ds2, norm2_Ds2, Ft2_Ds2_fill) + + Ft1_Ds = F.interpolate( + Ft1_Ds2, size=[F1t_Ds.size(2), F1t_Ds.size(3)], mode='nearest') * 3 + Ft2_Ds = F.interpolate( + Ft2_Ds2, size=[F2t_Ds.size(2), F2t_Ds.size(3)], mode='nearest') * 3 + + I1t_Ds = backwarp(I1_Ds, Ft1_Ds) + I2t_Ds = backwarp(I2_Ds, Ft2_Ds) + + output_Ds, feature_Ds = self.refinenet( + torch.cat( + [I1_Ds, I2_Ds, I1t_Ds, I2t_Ds, F12_Ds, F21_Ds, Ft1_Ds, Ft2_Ds], + dim=1)) + + # Adaptive filtering + Ft1r_Ds = backwarp( + Ft1_Ds, 10 * torch.tanh(output_Ds[:, 4:6])) + output_Ds[:, :2] + Ft2r_Ds = backwarp( + Ft2_Ds, 10 * torch.tanh(output_Ds[:, 6:8])) + output_Ds[:, 2:4] + + # warping and fusing + I1tf_Ds = backwarp(I1_Ds, Ft1r_Ds) + I2tf_Ds = backwarp(I2_Ds, Ft2r_Ds) + + G1_Ds = self.get_grad(I1_Ds) + G2_Ds = self.get_grad(I2_Ds) + G1tf_Ds = backwarp(G1_Ds, Ft1r_Ds) + G2tf_Ds = backwarp(G2_Ds, Ft2r_Ds) + + M_Ds = torch.sigmoid( + self.masknet(torch.cat([I1tf_Ds, I2tf_Ds, feature_Ds], + dim=1))).repeat(1, 3, 1, 1) + + Ft1r = F.interpolate( + Ft1r_Ds * 2, scale_factor=2, mode='bilinear', align_corners=False) + Ft2r = F.interpolate( + Ft2r_Ds * 2, scale_factor=2, mode='bilinear', align_corners=False) + + I1tf = backwarp(I1, Ft1r) + I2tf = backwarp(I2, Ft2r) + + M = F.interpolate( + M_Ds, scale_factor=2, mode='bilinear', align_corners=False) + + # fuse + It_warp = ((1 - t) * M * I1tf + t * (1 - M) * I2tf) \ + / ((1 - t) * M + t * (1 - M)).clone() + + # static blending + It_static = (1 - t) * I1 + t * I2 + tmp = torch.cat((I1tf_Ds, I2tf_Ds, G1tf_Ds, G2tf_Ds, I1_Ds, I2_Ds, + G1_Ds, G2_Ds, feature_Ds), + dim=1) + M_static_Ds = self.staticnet(tmp) + M_static_dilate = tensor_erode(M_static_Ds) + M_static_dilate = tensor_erode(M_static_dilate) + M_static = F.interpolate( + M_static_dilate, + scale_factor=2, + mode='bilinear', + align_corners=False) + + It_warp = (1 - M_static) * It_warp + M_static * It_static + + if self.is_training: + return It_warp, Ft1r, Ft2r + else: + if self.debug_en: + return It_warp, M, M_static, I1tf, I2tf, Ft1r, Ft2r + else: + return It_warp + + +class QVI_inter(nn.Module): + """Given flow, implement Quadratic Video Interpolation""" + + def __init__(self, debug_en=False, is_training=False): + super(QVI_inter, self).__init__() + self.acc = AcFusionLayer() + self.fwarp = FlowReversal() + self.refinenet = Small_UNet_Ds(20, 8) + self.masknet = SmallMaskNet(38, 1) + + self.staticnet = StaticMaskNet(56, 1) + self.lpfilter = 
LowPassFilter() + + self.get_grad = Get_gradient() + self.debug_en = debug_en + self.is_training = is_training + + def fill_flow_hole(self, ft, norm, ft_fill): + (N, C, H, W) = ft.shape + ft[norm == 0] = ft_fill[norm == 0] + + ft_1 = self.lpfilter(ft.clone()) + ft_ds = torch.nn.functional.interpolate( + input=ft_1, + size=(H // 4, W // 4), + mode='bilinear', + align_corners=False) + ft_up = torch.nn.functional.interpolate( + input=ft_ds, size=(H, W), mode='bilinear', align_corners=False) + + ft[norm == 0] = ft_up[norm == 0] + + return ft + + def forward(self, F10, F12, F21, F23, I1, I2, t): + if F12 is None or F21 is None: + return I1 + + if F10 is not None and F23 is not None: + F1t, F2t = self.acc(F10, F12, F21, F23, t) + + else: + F1t = t * F12 + F2t = (1 - t) * F21 + + # Flow Reversal + F1t_Ds = F.interpolate(F1t, scale_factor=1.0 / 3, mode='nearest') / 3 + F2t_Ds = F.interpolate(F2t, scale_factor=1.0 / 3, mode='nearest') / 3 + Ft1_Ds, norm1_Ds = self.fwarp(F1t_Ds, F1t_Ds) + Ft1_Ds = -Ft1_Ds + Ft2_Ds, norm2_Ds = self.fwarp(F2t_Ds, F2t_Ds) + Ft2_Ds = -Ft2_Ds + + Ft1_Ds[norm1_Ds > 0] \ + = Ft1_Ds[norm1_Ds > 0] / norm1_Ds[norm1_Ds > 0].clone() + Ft2_Ds[norm2_Ds > 0] \ + = Ft2_Ds[norm2_Ds > 0] / norm2_Ds[norm2_Ds > 0].clone() + if 1: + Ft1_fill = -F1t_Ds + Ft2_fill = -F2t_Ds + Ft1_Ds = self.fill_flow_hole(Ft1_Ds, norm1_Ds, Ft1_fill) + Ft2_Ds = self.fill_flow_hole(Ft2_Ds, norm2_Ds, Ft2_fill) + + Ft1 = F.interpolate( + Ft1_Ds, size=[F1t.size(2), F1t.size(3)], mode='nearest') * 3 + Ft2 = F.interpolate( + Ft2_Ds, size=[F2t.size(2), F2t.size(3)], mode='nearest') * 3 + + I1t = backwarp(I1, Ft1) + I2t = backwarp(I2, Ft2) + + output, feature = self.refinenet( + torch.cat([I1, I2, I1t, I2t, F12, F21, Ft1, Ft2], dim=1)) + + # Adaptive filtering + Ft1r = backwarp(Ft1, 10 * torch.tanh(output[:, 4:6])) + output[:, :2] + Ft2r = backwarp(Ft2, 10 * torch.tanh(output[:, 6:8])) + output[:, 2:4] + + # warping and fusing + I1tf = backwarp(I1, Ft1r) + I2tf = backwarp(I2, Ft2r) + + M = torch.sigmoid( + self.masknet(torch.cat([I1tf, I2tf, feature], + dim=1))).repeat(1, 3, 1, 1) + + It_warp = ((1 - t) * M * I1tf + t * (1 - M) * I2tf) \ + / ((1 - t) * M + t * (1 - M)).clone() + + G1 = self.get_grad(I1) + G2 = self.get_grad(I2) + G1tf = backwarp(G1, Ft1r) + G2tf = backwarp(G2, Ft2r) + + # static blending + It_static = (1 - t) * I1 + t * I2 + M_static = self.staticnet( + torch.cat([I1tf, I2tf, G1tf, G2tf, I1, I2, G1, G2, feature], + dim=1)) + M_static_dilate = tensor_erode(M_static) + M_static_dilate = tensor_erode(M_static_dilate) + It_warp = (1 - M_static_dilate) * It_warp + M_static_dilate * It_static + + if self.is_training: + return It_warp, Ft1r, Ft2r + else: + if self.debug_en: + return It_warp, M, M_static, I1tf, I2tf, Ft1r, Ft2r + else: + return It_warp + + +class InterpNetDs(nn.Module): + + def __init__(self, debug_en=False, is_training=False): + super(InterpNetDs, self).__init__() + self.ifnet = IFNet() + self.internet = QVI_inter_Ds( + debug_en=debug_en, is_training=is_training) + + def forward(self, + img1, + img2, + F10_up, + F12_up, + F21_up, + F23_up, + UHD=2, + timestep=0.5): + F12, F21 = self.ifnet(img1, img2, F12_up, F21_up, UHD) + It_warp = self.internet(F10_up, F12, F21, F23_up, img1, img2, timestep) + + return It_warp + + +class InterpNet(nn.Module): + + def __init__(self, debug_en=False, is_training=False): + super(InterpNet, self).__init__() + self.ifnet = IFNet() + self.internet = QVI_inter(debug_en=debug_en, is_training=is_training) + + def forward(self, + img1, + img2, + F10_up, + F12_up, 
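+                # Fab naming as above: flow from frame a to frame b. F10_up and
+                # F23_up may be None, in which case linear rather than
+                # quadratic motion is assumed. UHD is forwarded to IFNet as
+                # sc_mode (0: 1/4 scale, 1: 1/2 scale, otherwise full
+                # resolution); timestep is the interpolation instant t in (0, 1).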
+ F21_up, + F23_up, + UHD=2, + timestep=0.5): + F12, F21 = self.ifnet(img1, img2, F12_up, F21_up, UHD) + It_warp = self.internet(F10_up, F12, F21, F23_up, img1, img2, timestep) + + return It_warp diff --git a/modelscope/models/cv/video_frame_interpolation/interp_model/transformer_layers.py b/modelscope/models/cv/video_frame_interpolation/interp_model/transformer_layers.py new file mode 100644 index 00000000..81ce114b --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/interp_model/transformer_layers.py @@ -0,0 +1,989 @@ +# The implementation is adopted from VFIformer, +# made publicly available at https://github.com/dvlab-research/VFIformer + +# ----------------------------------------------------------------------------------- +# modified from: +# SwinIR: Image Restoration Using Swin Transformer, https://github.com/JingyunLiang/SwinIR +# ----------------------------------------------------------------------------------- + +import functools +import math +import sys + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, + C) + windows = x.permute(0, 1, 3, 2, 4, + 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1).contiguous()) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C).contiguous() + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class WindowCrossAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. 
+ It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table_x = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + self.relative_position_bias_table_y = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.merge1 = nn.Linear(dim * 2, dim) + self.merge2 = nn.Linear(dim, dim) + self.act = nn.LeakyReLU(negative_slope=0.2, inplace=True) + self.proj = nn.Linear(dim, dim) + + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table_x, std=.02) + trunc_normal_(self.relative_position_bias_table_y, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, y, mask_x=None, mask_y=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1).contiguous()) + + relative_position_bias = self.relative_position_bias_table_x[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask_x is not None: + nW = mask_x.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + 
mask_x.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C).contiguous() + + B_, N, C = y.shape + kv = self.kv(y).reshape(B_, N, 2, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[ + 1] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1).contiguous()) + + relative_position_bias = self.relative_position_bias_table_y[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask_y is not None: + nW = mask_y.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask_y.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + y = (attn @ v).transpose(1, 2).reshape(B_, N, C).contiguous() + + x = self.merge2(self.act(self.merge1(torch.cat([x, y], dim=-1)))) + x + + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class TFL(nn.Module): + + def __init__(self, + dim, + input_resolution, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_crossattn=False): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.use_crossattn = use_crossattn + + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, 'shift_size must in 0-window_size' + + self.norm1 = norm_layer(dim) + if not use_crossattn: + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + else: + self.attn = WindowCrossAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + if self.shift_size > 0: + if not use_crossattn: + attn_mask = self.calculate_mask(self.input_resolution) + self.register_buffer('attn_mask', attn_mask) + else: + attn_mask_x = self.calculate_mask(self.input_resolution) + attn_mask_y = self.calculate_mask2(self.input_resolution) + self.register_buffer('attn_mask_x', attn_mask_x) + self.register_buffer('attn_mask_y', attn_mask_y) + + else: + if not use_crossattn: + attn_mask = None + self.register_buffer('attn_mask', attn_mask) + else: + attn_mask_x = None + attn_mask_y = None + self.register_buffer('attn_mask_x', attn_mask_x) + self.register_buffer('attn_mask_y', attn_mask_y) + + def calculate_mask(self, x_size): + # calculate attention mask for SW-MSA + H, W = x_size + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, + self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + + return attn_mask + + def calculate_mask2(self, x_size): + # calculate attention mask for SW-MSA + H, W = x_size + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, + self.window_size * self.window_size) + + # downscale + img_mask_down = F.interpolate( + img_mask.permute(0, 3, 1, 2).contiguous(), + scale_factor=0.5, + mode='bilinear', + align_corners=False) + img_mask_down = F.pad( + img_mask_down, (self.window_size // 4, self.window_size // 4, + self.window_size // 4, self.window_size // 4), + mode='reflect') + mask_windows_down = F.unfold( + img_mask_down, + kernel_size=self.window_size, + dilation=1, + padding=0, + stride=self.window_size // 2) + mask_windows_down = mask_windows_down.view( + self.window_size * self.window_size, + -1).permute(1, 0).contiguous() # nW, window_size*window_size + + attn_mask = mask_windows_down.unsqueeze(1) - mask_windows.unsqueeze( + 2) # nW, window_size*window_size, window_size*window_size + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + + return attn_mask + + def forward(self, x, x_size): + H, W = x_size + B, L, C = x.shape + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = 
window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, + C) # nW*B, window_size*window_size, C + + if not self.use_crossattn: + if self.input_resolution == x_size: + attn_windows = self.attn( + x_windows, + mask=self.attn_mask) # nW*B, window_size*window_size, C + else: + attn_windows = self.attn( + x_windows, mask=self.calculate_mask(x_size).to(x.device)) + else: + shifted_x_down = F.interpolate( + shifted_x.permute(0, 3, 1, 2).contiguous(), + scale_factor=0.5, + mode='bilinear', + align_corners=False) + shifted_x_down = F.pad( + shifted_x_down, (self.window_size // 4, self.window_size // 4, + self.window_size // 4, self.window_size // 4), + mode='reflect') + x_windows_down = F.unfold( + shifted_x_down, + kernel_size=self.window_size, + dilation=1, + padding=0, + stride=self.window_size // 2) + x_windows_down = x_windows_down.view( + B, C, self.window_size * self.window_size, -1) + x_windows_down = x_windows_down.permute( + 0, 3, 2, + 1).contiguous().view(-1, self.window_size * self.window_size, + C) # nW*B, window_size*window_size, C + + if self.input_resolution == x_size: + attn_windows = self.attn( + x_windows, + x_windows_down, + mask_x=self.attn_mask_x, + mask_y=self.attn_mask_y + ) # nW*B, window_size*window_size, C + else: + attn_windows = self.attn( + x_windows, + x_windows_down, + mask_x=self.calculate_mask(x_size).to(x.device), + mask_y=self.calculate_mask2(x_size).to(x.device)) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, + W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + x = x.view(B, H * W, C) + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, ' \ + f'window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}' + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, 'input feature has wrong size' + assert H % 2 == 0 and W % 2 == 0, f'x size ({H}*{W}) are not even.' 
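+        # Take the four pixels of every non-overlapping 2x2 patch and stack
+        # them along the channel dimension (C -> 4C, H*W -> H/2*W/2 tokens),
+        # then normalise and linearly reduce 4C -> 2C, i.e. downsample the
+        # token grid by a factor of 2 in each direction.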
+ + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self) -> str: + return f'input_resolution={self.input_resolution}, dim={self.dim}' + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + use_crossattn=None): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + if use_crossattn is None: + use_crossattn = [False for i in range(depth)] + + # build blocks + self.blocks = nn.ModuleList([ + TFL(dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_crossattn=use_crossattn[i]) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, x_size): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, x_size) + else: + x = blk(x, x_size) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}' + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class RTFL(nn.Module): + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + 
use_checkpoint=False, + img_size=224, + patch_size=4, + resi_connection='1conv', + use_crossattn=None): + super(RTFL, self).__init__() + + self.dim = dim + self.input_resolution = input_resolution + self.use_crossattn = use_crossattn + + self.residual_group = BasicLayer( + dim=dim, + input_resolution=input_resolution, + depth=depth, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path, + norm_layer=norm_layer, + downsample=downsample, + use_checkpoint=use_checkpoint, + use_crossattn=use_crossattn) + + if resi_connection == '1conv': + self.conv = nn.Conv2d(dim, dim, 3, 1, 1) + elif resi_connection == '3conv': + # to save parameters and memory + self.conv = nn.Sequential( + nn.Conv2d(dim, dim // 4, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(dim // 4, dim // 4, 1, 1, 0), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(dim // 4, dim, 3, 1, 1)) + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=0, + embed_dim=dim, + norm_layer=None) + + self.patch_unembed = PatchUnEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=0, + embed_dim=dim, + norm_layer=None) + + def forward(self, x, x_size): + return self.patch_embed( + self.conv( + self.patch_unembed(self.residual_group(x, x_size), + x_size))) + x + + def flops(self): + flops = 0 + flops += self.residual_group.flops() + H, W = self.input_resolution + flops += H * W * self.dim * self.dim * 9 + flops += self.patch_embed.flops() + flops += self.patch_unembed.flops() + + return flops + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], img_size[1] // patch_size[1] + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + x = x.flatten(2).transpose(1, 2).contiguous() # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + flops = 0 + H, W = self.img_size + if self.norm is not None: + flops += H * W * self.embed_dim + return flops + + +class PatchUnEmbed(nn.Module): + r""" Image to Patch Unembedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. 
Default: None + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], img_size[1] // patch_size[1] + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + def forward(self, x, x_size): + B, HW, C = x.shape + x = x.transpose(1, 2).contiguous().view(B, self.embed_dim, x_size[0], + x_size[1]) # B Ph*Pw C + return x + + def flops(self): + flops = 0 + return flops + + +class Upsample(nn.Sequential): + """Upsample module. + + Args: + scale (int): Scale factor. Supported scales: 2^n and 3. + num_feat (int): Channel number of intermediate features. + """ + + def __init__(self, scale, num_feat): + m = [] + if (scale & (scale - 1)) == 0: # scale = 2^n + for _ in range(int(math.log(scale, 2))): + m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1)) + m.append(nn.PixelShuffle(2)) + elif scale == 3: + m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1)) + m.append(nn.PixelShuffle(3)) + else: + raise ValueError(f'scale {scale} is not supported. ' + 'Supported scales: 2^n and 3.') + super(Upsample, self).__init__(*m) + + +class UpsampleOneStep(nn.Sequential): + """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle) + Used in lightweight SR to save parameters. + + Args: + scale (int): Scale factor. Supported scales: 2^n and 3. + num_feat (int): Channel number of intermediate features. + + """ + + def __init__(self, scale, num_feat, num_out_ch, input_resolution=None): + self.num_feat = num_feat + self.input_resolution = input_resolution + m = [] + m.append(nn.Conv2d(num_feat, (scale**2) * num_out_ch, 3, 1, 1)) + m.append(nn.PixelShuffle(scale)) + super(UpsampleOneStep, self).__init__(*m) + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.num_feat * 3 * 9 + return flops diff --git a/modelscope/models/cv/video_frame_interpolation/utils/__init__.py b/modelscope/models/cv/video_frame_interpolation/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_frame_interpolation/utils/scene_change_detection.py b/modelscope/models/cv/video_frame_interpolation/utils/scene_change_detection.py new file mode 100644 index 00000000..4cbe60a7 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/utils/scene_change_detection.py @@ -0,0 +1,97 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
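+# Two-stage scene-change detection between consecutive frames:
+# 1) compare normalised 64-bin grayscale histograms of img0/img1 and report a
+#    cut when the total or maximum bin difference is large;
+# 2) on a coarse pixel grid, check forward/backward flow consistency
+#    (F01 vs. F10) against a motion-dependent threshold and combine it with a
+#    box-filtered local pixel difference; a cut is reported when more than
+#    half of the grid points are both flow-inconsistent and mismatched.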
+import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def calc_hist(img_tensor): + hist = torch.histc(img_tensor, bins=64, min=0, max=255) + return hist / img_tensor.numel() + + +def do_scene_detect(F01_tensor, F10_tensor, img0_tensor, img1_tensor): + device = img0_tensor.device + scene_change = False + img0_tensor = img0_tensor.clone() + img1_tensor = img1_tensor.clone() + + img0_gray = 0.299 * img0_tensor[:, 0: + 1] + 0.587 * img0_tensor[:, 1: + 2] + 0.114 * img0_tensor[:, + 2: + 3] + img1_gray = 0.299 * img1_tensor[:, 0: + 1] + 0.587 * img1_tensor[:, 1: + 2] + 0.114 * img1_tensor[:, + 2: + 3] + img0_gray = torch.clamp(img0_gray, 0, 255).byte().float().cpu() + img1_gray = torch.clamp(img1_gray, 0, 255).byte().float().cpu() + + hist0 = calc_hist(img0_gray) + hist1 = calc_hist(img1_gray) + diff = torch.abs(hist0 - hist1) + diff[diff < 0.01] = 0 + if torch.sum(diff) > 0.8 or diff.max() > 0.4: + return True + img0_gray = img0_gray.to(device) + img1_gray = img1_gray.to(device) + + # second stage: detect mv and pix mismatch + + (n, c, h, w) = F01_tensor.size() + scale_x = w / 1920 + scale_y = h / 1080 + + # compare mv + (y, x) = torch.meshgrid(torch.arange(h), torch.arange(w)) + (y_grid, x_grid) = torch.meshgrid( + torch.arange(64, h - 64, 8), torch.arange(64, w - 64, 8)) + x = x.to(device) + y = y.to(device) + y_grid = y_grid.to(device) + x_grid = x_grid.to(device) + fx = F01_tensor[0, 0] + fy = F01_tensor[0, 1] + x_ = x.float() + fx + y_ = y.float() + fy + x_ = torch.clamp(x_ + 0.5, 0, w - 1).long() + y_ = torch.clamp(y_ + 0.5, 0, h - 1).long() + + grid_fx = fx[y_grid, x_grid] + grid_fy = fy[y_grid, x_grid] + + x_grid_ = x_[y_grid, x_grid] + y_grid_ = y_[y_grid, x_grid] + + grid_fx_ = F10_tensor[0, 0, y_grid_, x_grid_] + grid_fy_ = F10_tensor[0, 1, y_grid_, x_grid_] + + sum_x = grid_fx + grid_fx_ + sum_y = grid_fy + grid_fy_ + distance = torch.sqrt(sum_x**2 + sum_y**2) + + fx_len = torch.abs(grid_fx) * scale_x + fy_len = torch.abs(grid_fy) * scale_y + ori_len = torch.where(fx_len > fy_len, fx_len, fy_len) + + thres = torch.clamp(0.1 * ori_len + 4, 5, 14) + + # compare pix diff + ori_img = img0_gray + ref_img = img1_gray[:, :, y_, x_] + + img_diff = ori_img.float() - ref_img.float() + img_diff = torch.abs(img_diff) + + kernel = np.ones([8, 8], np.float) / 64 + kernel = torch.FloatTensor(kernel).to(device).unsqueeze(0).unsqueeze(0) + diff = F.conv2d(img_diff, kernel, padding=4) + + diff = diff[0, 0, y_grid, x_grid] + + index = (distance > thres) * (diff > 5) + if index.sum().float() / distance.numel() > 0.5: + scene_change = True + return scene_change diff --git a/modelscope/models/cv/video_frame_interpolation/utils/utils.py b/modelscope/models/cv/video_frame_interpolation/utils/utils.py new file mode 100644 index 00000000..68a8b99d --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/utils/utils.py @@ -0,0 +1,96 @@ +# The implementation is adopted from RAFT, +# made publicly available under the BSD-3-Clause license at https://github.com/princeton-vl/RAFT + +import numpy as np +import torch +import torch.nn.functional as F +from scipy import interpolate + + +class InputPadder: + """ Pads images such that dimensions are divisible by 8 """ + + def __init__(self, dims, mode='sintel'): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == 'sintel': + self._pad = [ + pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, + pad_ht - pad_ht // 2 + ] + else: + 
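            # non-'sintel' mode: pad left/right symmetrically, vertically only at the bottom +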
self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode='replicate') for x in inputs] + + def unpad(self, x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + + +def forward_interpolate(flow): + flow = flow.detach().cpu().numpy() + dx, dy = flow[0], flow[1] + + ht, wd = dx.shape + x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) + + x1 = x0 + dx + y1 = y0 + dy + + x1 = x1.reshape(-1) + y1 = y1.reshape(-1) + dx = dx.reshape(-1) + dy = dy.reshape(-1) + + valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) + x1 = x1[valid] + y1 = y1[valid] + dx = dx[valid] + dy = dy[valid] + + flow_x = interpolate.griddata((x1, y1), + dx, (x0, y0), + method='nearest', + fill_value=0) + + flow_y = interpolate.griddata((x1, y1), + dy, (x0, y0), + method='nearest', + fill_value=0) + + flow = np.stack([flow_x, flow_y], axis=0) + return torch.from_numpy(flow).float() + + +def bilinear_sampler(img, coords, mode='bilinear', mask=False): + """ Wrapper for grid_sample, uses pixel coordinates """ + H, W = img.shape[-2:] + xgrid, ygrid = coords.split([1, 1], dim=-1) + xgrid = 2 * xgrid / (W - 1) - 1 + ygrid = 2 * ygrid / (H - 1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + img = F.grid_sample(img, grid, align_corners=True) + + if mask: + mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) + return img, mask.float() + + return img + + +def coords_grid(batch, ht, wd, device): + coords = torch.meshgrid( + torch.arange(ht, device=device), torch.arange(wd, device=device)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +def upflow8(flow, mode='bilinear'): + new_size = (8 * flow.shape[2], 8 * flow.shape[3]) + return 8 * F.interpolate( + flow, size=new_size, mode=mode, align_corners=True) diff --git a/modelscope/models/cv/video_multi_object_tracking/__init__.py b/modelscope/models/cv/video_multi_object_tracking/__init__.py new file mode 100644 index 00000000..985a8da5 --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
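+# Lazy module registration: VideoMultiObjectTracking is only imported on first
+# attribute access via LazyImportModule; the TYPE_CHECKING branch keeps the
+# direct import visible to static analysis tools.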
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .video_multi_object_tracking import VideoMultiObjectTracking + +else: + _import_structure = { + 'video_multi_object_tracking': ['VideoMultiObjectTracking'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_multi_object_tracking/models/__init__.py b/modelscope/models/cv/video_multi_object_tracking/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_multi_object_tracking/models/common.py b/modelscope/models/cv/video_multi_object_tracking/models/common.py new file mode 100644 index 00000000..e5eaa686 --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/models/common.py @@ -0,0 +1,104 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT + +import torch +import torch.nn as nn + + +def autopad(k, p=None): + if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] + return p + + +class Conv(nn.Module): + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): + super(Conv, self).__init__() + self.conv = nn.Conv2d( + c1, c2, k, s, autopad(k, p), groups=g, bias=False) + self.bn = nn.BatchNorm2d(c2) + self.act = nn.SiLU() if act is True else ( + act if isinstance(act, nn.Module) else nn.Identity()) + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class Bottleneck(nn.Module): + + def __init__(self, c1, c2, shortcut=True, g=1, e=0.5): + super(Bottleneck, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c_, c2, 3, 1, g=g) + self.add = shortcut and c1 == c2 + + def forward(self, x): + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class C3(nn.Module): + """ + CSP Bottleneck with 3 convolutions + """ + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + super(C3, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c1, c_, 1, 1) + self.cv3 = Conv(2 * c_, c2, 1) + self.m = nn.Sequential( + *[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)]) + + def forward(self, x): + return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1)) + + +class SPP(nn.Module): + """ + Spatial pyramid pooling layer used in YOLOv3-SPP + """ + + def __init__(self, c1, c2, k=(5, 9, 13)): + super(SPP, self).__init__() + c_ = c1 // 2 # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1) + self.m = nn.ModuleList( + [nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) + + def forward(self, x): + x = self.cv1(x) + return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) + + +class Focus(nn.Module): + """ + Focus wh information into c-space + """ + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): + super(Focus, self).__init__() + self.conv = Conv(c1 * 4, c2, k, s, p, g, act) + + def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) + return self.conv( + torch.cat([ + x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], + x[..., 1::2, 1::2] + ], 1)) + + +class Concat(nn.Module): + + def __init__(self, dimension=1): + super(Concat, self).__init__() + self.d = 
dimension + + def forward(self, x): + return torch.cat(x, self.d) diff --git a/modelscope/models/cv/video_multi_object_tracking/models/decode.py b/modelscope/models/cv/video_multi_object_tracking/models/decode.py new file mode 100644 index 00000000..4c2ad91e --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/models/decode.py @@ -0,0 +1,73 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import torch +import torch.nn as nn + +from modelscope.models.cv.video_multi_object_tracking.utils.utils import ( + _gather_feat, _tranpose_and_gather_feat) + + +def _nms(heat, kernel=3): + pad = (kernel - 1) // 2 + + hmax = nn.functional.max_pool2d( + heat, (kernel, kernel), stride=1, padding=pad) + keep = (hmax == heat).float() + return heat * keep + + +def _topk(scores, K=40): + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (height * width) + topk_ys = torch.true_divide(topk_inds, width).int().float() + topk_xs = (topk_inds % width).int().float() + + topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) + topk_clses = torch.true_divide(topk_ind, K).int() + topk_inds = _gather_feat(topk_inds.view(batch, -1, 1), + topk_ind).view(batch, K) + topk_ys = _gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) + topk_xs = _gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) + + return topk_score, topk_inds, topk_clses, topk_ys, topk_xs + + +def mot_decode(heat, wh, reg=None, ltrb=False, K=100): + batch, cat, height, width = heat.size() + + heat = _nms(heat) + + scores, inds, clses, ys, xs = _topk(heat, K=K) + if reg is not None: + reg = _tranpose_and_gather_feat(reg, inds) + reg = reg.view(batch, K, 2) + xs = xs.view(batch, K, 1) + reg[:, :, 0:1] + ys = ys.view(batch, K, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, K, 1) + 0.5 + ys = ys.view(batch, K, 1) + 0.5 + wh = _tranpose_and_gather_feat(wh, inds) + if ltrb: + wh = wh.view(batch, K, 4) + else: + wh = wh.view(batch, K, 2) + clses = clses.view(batch, K, 1).float() + scores = scores.view(batch, K, 1) + if ltrb: + a = xs - wh[..., 0:1] + b = ys - wh[..., 1:2] + c = xs + wh[..., 2:3] + d = ys + wh[..., 3:4] + bboxes = torch.cat([a, b, c, d], dim=2) + else: + a = xs - wh[..., 0:1] / 2 + b = ys - wh[..., 1:2] / 2 + c = xs + wh[..., 0:1] / 2 + d = ys + wh[..., 1:2] / 2 + bboxes = torch.cat([a, b, c, d], dim=2) + detections = torch.cat([bboxes, scores, clses], dim=2) + + return detections, inds diff --git a/modelscope/models/cv/video_multi_object_tracking/models/model.py b/modelscope/models/cv/video_multi_object_tracking/models/model.py new file mode 100644 index 00000000..255e92de --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/models/model.py @@ -0,0 +1,52 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import torch + +from modelscope.utils.logger import get_logger +from .yolo import get_pose_net as get_pose_net_yolo + +logger = get_logger() + +_model_factory = {'yolo': get_pose_net_yolo} + + +def create_model(arch, heads, head_conv): + num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0 + arch = arch[:arch.find('_')] if '_' in arch else arch + get_model = _model_factory[arch] + model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv) + return model + + +def load_model(model, 
model_path): + checkpoint = torch.load( + model_path, map_location=lambda storage, loc: storage) + state_dict_ = checkpoint['state_dict'] + state_dict = {} + + # convert data_parallal to model + for k in state_dict_: + if k.startswith('module') and not k.startswith('module_list'): + state_dict[k[7:]] = state_dict_[k] + else: + state_dict[k] = state_dict_[k] + model_state_dict = model.state_dict() + + # check loaded parameters and created model parameters + msg = 'If you see this, your model does not fully load the ' + \ + 'pre-trained weight. Please make sure ' + \ + 'you have correctly specified --arch xxx ' + \ + 'or set the correct --num_classes for your own dataset.' + for k in state_dict: + if k in model_state_dict: + if state_dict[k].shape != model_state_dict[k].shape: + state_dict[k] = model_state_dict[k] + else: + logger.info('Drop parameter {}.'.format(k) + msg) + for k in model_state_dict: + if not (k in state_dict): + logger.info('No param {}.'.format(k) + msg) + state_dict[k] = model_state_dict[k] + model.load_state_dict(state_dict, strict=False) + + return model diff --git a/modelscope/models/cv/video_multi_object_tracking/models/yolo.py b/modelscope/models/cv/video_multi_object_tracking/models/yolo.py new file mode 100644 index 00000000..df66060b --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/models/yolo.py @@ -0,0 +1,149 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import math +from copy import deepcopy + +import torch.nn as nn + +from modelscope.models.base import TorchModel +from modelscope.utils.logger import get_logger +from .common import C3, SPP, Concat, Conv, Focus + +logger = get_logger() + +backbone_param = { + 'nc': + 80, + 'depth_multiple': + 0.33, + 'width_multiple': + 0.5, + 'backbone': [[-1, 1, 'Focus', [64, 3]], [-1, 1, 'Conv', [128, 3, 2]], + [-1, 3, 'C3', [128]], [-1, 1, 'Conv', [256, 3, 2]], + [-1, 9, 'C3', [256]], [-1, 1, 'Conv', [512, 3, 2]], + [-1, 9, 'C3', [512]], [-1, 1, 'Conv', [1024, 3, 2]], + [-1, 1, 'SPP', [1024, [5, 9, 13]]], + [-1, 3, 'C3', [1024, False]], [-1, 1, 'Conv', [512, 1, 1]], + [-1, 1, 'nn.Upsample', ['None', 2, 'nearest']], + [[-1, 6], 1, 'Concat', [1]], [-1, 3, 'C3', [512, False]], + [-1, 1, 'Conv', [256, 1, 1]], + [-1, 1, 'nn.Upsample', ['None', 2, 'nearest']], + [[-1, 4], 1, 'Concat', [1]], [-1, 3, 'C3', [256, False]], + [-1, 1, 'Conv', [128, 1, 1]], + [-1, 1, 'nn.Upsample', ['None', 2, 'nearest']], + [[-1, 2], 1, 'Concat', [1]], [-1, 3, 'C3', [128, False]]] +} + + +def fill_fc_weights(layers): + for m in layers.modules(): + if isinstance(m, nn.Conv2d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +class Model(nn.Module): + + def __init__(self, config=backbone_param, ch=3, nc=None, anchors=None): + super(Model, self).__init__() + self.yaml = config # model dict + + # Define model + ch = self.yaml['ch'] = self.yaml.get('ch', ch) # input channels + if nc and nc != self.yaml['nc']: + self.yaml['nc'] = nc # override yaml value + self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch]) + self.names = [str(i) for i in range(self.yaml['nc'])] + self.inplace = self.yaml.get('inplace', True) + + def forward(self, x, augment=False, profile=False): + return self.forward_once(x, profile) + + def forward_once(self, x, profile=False): + y = [] + for m in self.model: + if m.f != -1: # if not from previous layer + x = y[m.f] if isinstance( + m.f, int) else [x if j == -1 else y[j] for j in m.f] + + x = m(x) # run 
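+            # cache this layer's output only if a later layer (an index collected
+            # in self.save, e.g. a Concat skip connection) needs to read it back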
+ y.append(x if m.i in self.save else None) + + return x + + +def parse_model(d, ch): + gd, gw = d['depth_multiple'], d['width_multiple'] + + layers, save, c2 = [], [], ch[-1] + for i, (f, n, m, args) in enumerate(d['backbone']): + m = eval(m) if isinstance(m, str) else m + for j, a in enumerate(args): + try: + args[j] = eval(a) if isinstance(a, str) else a + except Exception: + pass + + n = max(round(n * gd), 1) if n > 1 else n + if m in [Conv, SPP, Focus, C3]: + c1, c2 = ch[f], args[0] + c2 = make_divisible(c2 * gw, 8) + + args = [c1, c2, *args[1:]] + if m in [C3]: + args.insert(2, n) + n = 1 + elif m is nn.BatchNorm2d: + args = [ch[f]] + elif m is Concat: + c2 = sum([ch[x] for x in f]) + else: + c2 = ch[f] + + m_ = nn.Sequential(*[m(*args) + for _ in range(n)]) if n > 1 else m(*args) + t = str(m)[8:-2].replace('__main__.', '') + np = sum([x.numel() for x in m_.parameters()]) + m_.i, m_.f, m_.type, m_.np = i, f, t, np + save.extend(x % i for x in ([f] if isinstance(f, int) else f) + if x != -1) + layers.append(m_) + if i == 0: + ch = [] + ch.append(c2) + return nn.Sequential(*layers), sorted(save) + + +class PoseYOLO(TorchModel): + + def __init__(self, heads): + self.heads = heads + super(PoseYOLO, self).__init__() + self.backbone = Model() + for head in sorted(self.heads): + num_output = self.heads[head] + fc = nn.Sequential( + nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=True), + nn.SiLU(), + nn.Conv2d(64, num_output, kernel_size=1, stride=1, padding=0)) + self.__setattr__(head, fc) + if 'hm' in head: + fc[-1].bias.data.fill_(-2.19) + else: + fill_fc_weights(fc) + + def forward(self, x): + x = self.backbone(x) + ret = {} + for head in self.heads: + ret[head] = self.__getattr__(head)(x) + return [ret] + + +def get_pose_net(num_layers, heads, head_conv): + model = PoseYOLO(heads) + return model + + +def make_divisible(x, divisor): + return math.ceil(x / divisor) * divisor diff --git a/modelscope/models/cv/video_multi_object_tracking/tracker/__init__.py b/modelscope/models/cv/video_multi_object_tracking/tracker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_multi_object_tracking/tracker/basetrack.py b/modelscope/models/cv/video_multi_object_tracking/tracker/basetrack.py new file mode 100644 index 00000000..f8763308 --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/tracker/basetrack.py @@ -0,0 +1,55 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +from collections import OrderedDict + +import numpy as np + + +class TrackState(object): + New = 0 + Tracked = 1 + Lost = 2 + Removed = 3 + + +class BaseTrack(object): + _count = 0 + + track_id = 0 + is_activated = False + state = TrackState.New + + history = OrderedDict() + features = [] + curr_feature = None + score = 0 + start_frame = 0 + frame_id = 0 + time_since_update = 0 + + # multi-camera + location = (np.inf, np.inf) + + @property + def end_frame(self): + return self.frame_id + + @staticmethod + def next_id(): + BaseTrack._count += 1 + return BaseTrack._count + + def activate(self, *args): + raise NotImplementedError + + def predict(self): + raise NotImplementedError + + def update(self, *args, **kwargs): + raise NotImplementedError + + def mark_lost(self): + self.state = TrackState.Lost + + def mark_removed(self): + self.state = TrackState.Removed diff --git a/modelscope/models/cv/video_multi_object_tracking/tracker/matching.py 
b/modelscope/models/cv/video_multi_object_tracking/tracker/matching.py new file mode 100644 index 00000000..45d2f5c0 --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/tracker/matching.py @@ -0,0 +1,95 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import lap +import numpy as np +from scipy.spatial.distance import cdist + +from modelscope.models.cv.video_multi_object_tracking.utils import \ + kalman_filter +from modelscope.models.cv.video_multi_object_tracking.utils.utils import \ + bbox_iou + + +def linear_assignment(cost_matrix, thresh): + if cost_matrix.size == 0: + return np.empty((0, 2), + dtype=int), tuple(range(cost_matrix.shape[0])), tuple( + range(cost_matrix.shape[1])) + matches, unmatched_a, unmatched_b = [], [], [] + cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) + for ix, mx in enumerate(x): + if mx >= 0: + matches.append([ix, mx]) + unmatched_a = np.where(x < 0)[0] + unmatched_b = np.where(y < 0)[0] + matches = np.asarray(matches) + return matches, unmatched_a, unmatched_b + + +def ious(atlbrs, btlbrs): + ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) + if ious.size == 0: + return ious + + ious = bbox_iou(atlbrs, btlbrs, True).numpy() + + return ious + + +def iou_distance(atracks, btracks): + if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) or ( + len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): + atlbrs = atracks + btlbrs = btracks + else: + atlbrs = [track.tlbr for track in atracks] + btlbrs = [track.tlbr for track in btracks] + _ious = ious(atlbrs, btlbrs) + cost_matrix = 1 - _ious + + return cost_matrix + + +def embedding_distance(tracks, detections, metric='cosine'): + """ + Args: + tracks: list[STrack] + detections: list[BaseTrack] + metric: str + Returns: + cost_matrix: np.ndarray + """ + + cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) + if cost_matrix.size == 0: + return cost_matrix + det_features = np.asarray([track.curr_feat for track in detections], + dtype=np.float) + track_features = np.asarray([track.smooth_feat for track in tracks], + dtype=np.float) + cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric)) + return cost_matrix + + +def fuse_motion(kf, + cost_matrix, + tracks, + detections, + only_position=False, + lambda_=0.98): + if cost_matrix.size == 0: + return cost_matrix + gating_dim = 2 if only_position else 4 + gating_threshold = kalman_filter.chi2inv95[gating_dim] + measurements = np.asarray([det.to_xyah() for det in detections]) + for row, track in enumerate(tracks): + gating_distance = kf.gating_distance( + track.mean, + track.covariance, + measurements, + only_position, + metric='maha') + cost_matrix[row, gating_distance > gating_threshold] = np.inf + cost_matrix[row] = lambda_ * cost_matrix[row] + ( + 1 - lambda_) * gating_distance + return cost_matrix diff --git a/modelscope/models/cv/video_multi_object_tracking/tracker/multitracker.py b/modelscope/models/cv/video_multi_object_tracking/tracker/multitracker.py new file mode 100644 index 00000000..1dc3297f --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/tracker/multitracker.py @@ -0,0 +1,418 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +from collections import deque + +import numpy as np +import torch +import torch.nn.functional as F + +from 
modelscope.models.cv.video_multi_object_tracking.models.decode import \ + mot_decode +from modelscope.models.cv.video_multi_object_tracking.models.model import ( + create_model, load_model) +from modelscope.models.cv.video_multi_object_tracking.tracker import matching +from modelscope.models.cv.video_multi_object_tracking.tracker.basetrack import ( + BaseTrack, TrackState) +from modelscope.models.cv.video_multi_object_tracking.utils.kalman_filter import \ + KalmanFilter +from modelscope.models.cv.video_multi_object_tracking.utils.utils import ( + _tranpose_and_gather_feat, ctdet_post_process) +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class STrack(BaseTrack): + shared_kalman = KalmanFilter() + + def __init__(self, tlwh, score, temp_feat, buffer_size=30): + + # wait activate + self._tlwh = np.asarray(tlwh, dtype=np.float) + self.kalman_filter = None + self.mean, self.covariance = None, None + self.is_activated = False + + self.score = score + self.tracklet_len = 0 + + self.smooth_feat = None + self.update_features(temp_feat) + self.features = deque([], maxlen=buffer_size) + self.alpha = 0.9 + + def update_features(self, feat): + feat /= np.linalg.norm(feat) + self.curr_feat = feat + if self.smooth_feat is None: + self.smooth_feat = feat + else: + self.smooth_feat = self.alpha * self.smooth_feat + ( + 1 - self.alpha) * feat + self.features.append(feat) + self.smooth_feat /= np.linalg.norm(self.smooth_feat) + + def predict(self): + mean_state = self.mean.copy() + if self.state != TrackState.Tracked: + mean_state[7] = 0 + self.mean, self.covariance = self.kalman_filter.predict( + mean_state, self.covariance) + + @staticmethod + def multi_predict(stracks): + if len(stracks) > 0: + multi_mean = np.asarray([st.mean.copy() for st in stracks]) + multi_covariance = np.asarray([st.covariance for st in stracks]) + for i, st in enumerate(stracks): + if st.state != TrackState.Tracked: + multi_mean[i][7] = 0 + multi_mean, multi_covariance = STrack.shared_kalman.multi_predict( + multi_mean, multi_covariance) + for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): + stracks[i].mean = mean + stracks[i].covariance = cov + + def activate(self, kalman_filter, frame_id): + self.kalman_filter = kalman_filter + self.track_id = self.next_id() + self.mean, self.covariance = self.kalman_filter.initiate( + self.tlwh_to_xyah(self._tlwh)) + + self.tracklet_len = 0 + self.state = TrackState.Tracked + if frame_id == 1: + self.is_activated = True + self.frame_id = frame_id + self.start_frame = frame_id + + def re_activate(self, new_track, frame_id, new_id=False): + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh)) + + self.update_features(new_track.curr_feat) + self.tracklet_len = 0 + self.state = TrackState.Tracked + self.is_activated = True + self.frame_id = frame_id + if new_id: + self.track_id = self.next_id() + + def update(self, new_track, frame_id, update_feature=True): + """ + Update a matched track + Args: + new_track: STrack + frame_id: int + update_feature: bool + """ + self.frame_id = frame_id + self.tracklet_len += 1 + + new_tlwh = new_track.tlwh + self.mean, self.covariance = self.kalman_filter.update( + self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) + self.state = TrackState.Tracked + self.is_activated = True + + self.score = new_track.score + if update_feature: + self.update_features(new_track.curr_feat) + + @property + def tlwh(self): + """Get current position in bounding box 
format `(top left x, top left y, + width, height)`. + """ + if self.mean is None: + return self._tlwh.copy() + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + @property + def tlbr(self): + """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., + `(top left, bottom right)`. + """ + ret = self.tlwh.copy() + ret[2:] += ret[:2] + return ret + + @staticmethod + def tlwh_to_xyah(tlwh): + """Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. + """ + ret = np.asarray(tlwh).copy() + ret[:2] += ret[2:] / 2 + ret[2] /= ret[3] + return ret + + def to_xyah(self): + return self.tlwh_to_xyah(self.tlwh) + + @staticmethod + def tlbr_to_tlwh(tlbr): + ret = np.asarray(tlbr).copy() + ret[2:] -= ret[:2] + return ret + + @staticmethod + def tlwh_to_tlbr(tlwh): + ret = np.asarray(tlwh).copy() + ret[2:] += ret[:2] + return ret + + def __repr__(self): + return 'OT_{}_({}-{})'.format(self.track_id, self.start_frame, + self.end_frame) + + +class JDETracker(object): + + def __init__(self, opt, model_path, device): + self.opt = opt + self.device = device + self.model = create_model(opt.arch, opt.heads, opt.head_conv) + self.model = load_model(self.model, model_path) + self.model = self.model.to(device) + self.model.eval() + + self.tracked_stracks = [] # type: list[STrack] + self.lost_stracks = [] # type: list[STrack] + self.removed_stracks = [] # type: list[STrack] + + self.frame_id = 0 + self.det_thresh = opt.conf_thres + self.max_per_image = opt.K + self.mean = np.array(opt.mean, dtype=np.float32).reshape(1, 1, 3) + self.std = np.array(opt.std, dtype=np.float32).reshape(1, 1, 3) + + self.kalman_filter = KalmanFilter() + + def set_buffer_len(self, frame_rate): + self.buffer_size = int(frame_rate / 30.0 * self.opt.track_buffer) + self.max_time_lost = self.buffer_size + + def post_process(self, dets, meta): + dets = dets.detach().cpu().numpy() + dets = dets.reshape(1, -1, dets.shape[2]) + dets = ctdet_post_process(dets.copy(), [meta['c']], [meta['s']], + meta['out_height'], meta['out_width'], + self.opt.num_classes) + for j in range(1, self.opt.num_classes + 1): + dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5) + return dets[0] + + def merge_outputs(self, detections): + results = {} + for j in range(1, self.opt.num_classes + 1): + results[j] = np.concatenate( + [detection[j] for detection in detections], + axis=0).astype(np.float32) + + scores = np.hstack( + [results[j][:, 4] for j in range(1, self.opt.num_classes + 1)]) + if len(scores) > self.max_per_image: + kth = len(scores) - self.max_per_image + thresh = np.partition(scores, kth)[kth] + for j in range(1, self.opt.num_classes + 1): + keep_inds = (results[j][:, 4] >= thresh) + results[j] = results[j][keep_inds] + return results + + def update(self, im_blob, img0): + self.frame_id += 1 + activated_starcks = [] + refind_stracks = [] + lost_stracks = [] + removed_stracks = [] + + width = img0.shape[1] + height = img0.shape[0] + inp_height = im_blob.shape[2] + inp_width = im_blob.shape[3] + if self.device.type == 'cuda': + im_blob = im_blob.cuda() + c = np.array([width / 2., height / 2.], dtype=np.float32) + s = max(float(inp_width) / float(inp_height) * height, width) * 1.0 + meta = { + 'c': c, + 's': s, + 'out_height': inp_height // self.opt.down_ratio, + 'out_width': inp_width // self.opt.down_ratio + } + + # Step 1: Network forward, get detections & embeddings + with torch.no_grad(): + output = self.model(im_blob)[-1] + 
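+            # the model returns one dict per head: 'hm' (object center heatmap),
+            # 'wh' (box size), 'id' (re-identification embedding) and, when
+            # reg_offset is enabled, 'reg' (sub-pixel center offset)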
hm = output['hm'].sigmoid_() + wh = output['wh'] + id_feature = output['id'] + id_feature = F.normalize(id_feature, dim=1) + + reg = output['reg'] if self.opt.reg_offset else None + dets, inds = mot_decode( + hm, wh, reg=reg, ltrb=self.opt.ltrb, K=self.opt.K) + id_feature = _tranpose_and_gather_feat(id_feature, inds) + id_feature = id_feature.squeeze(0) + id_feature = id_feature.cpu().numpy() + + dets = self.post_process(dets, meta) + dets = self.merge_outputs([dets])[1] + + remain_inds = dets[:, 4] > self.opt.conf_thres + dets = dets[remain_inds] + id_feature = id_feature[remain_inds] + + if len(dets) > 0: + '''Detections''' + detections = [ + STrack(STrack.tlbr_to_tlwh(tlbrs[:4]), tlbrs[4], f, 30) + for (tlbrs, f) in zip(dets[:, :5], id_feature) + ] + else: + detections = [] + + # Add newly detected tracklets to tracked_stracks + unconfirmed = [] + tracked_stracks = [] # type: list[STrack] + for track in self.tracked_stracks: + if not track.is_activated: + unconfirmed.append(track) + else: + tracked_stracks.append(track) + + # Step 2: First association, with embedding + strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) + STrack.multi_predict(strack_pool) + dists = matching.embedding_distance(strack_pool, detections) + dists = matching.fuse_motion(self.kalman_filter, dists, strack_pool, + detections) + matches, u_track, u_detection = matching.linear_assignment( + dists, thresh=0.4) + + for itracked, idet in matches: + track = strack_pool[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(detections[idet], self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + # Step 3: Second association, with IOU + detections = [detections[i] for i in u_detection] + r_tracked_stracks = [ + strack_pool[i] for i in u_track + if strack_pool[i].state == TrackState.Tracked + ] + dists = matching.iou_distance(r_tracked_stracks, detections) + matches, u_track, u_detection = matching.linear_assignment( + dists, thresh=0.5) + + for itracked, idet in matches: + track = r_tracked_stracks[itracked] + det = detections[idet] + if track.state == TrackState.Tracked: + track.update(det, self.frame_id) + activated_starcks.append(track) + else: + track.re_activate(det, self.frame_id, new_id=False) + refind_stracks.append(track) + + for it in u_track: + track = r_tracked_stracks[it] + if not track.state == TrackState.Lost: + track.mark_lost() + lost_stracks.append(track) + + detections = [detections[i] for i in u_detection] + dists = matching.iou_distance(unconfirmed, detections) + matches, u_unconfirmed, u_detection = matching.linear_assignment( + dists, thresh=0.7) + for itracked, idet in matches: + unconfirmed[itracked].update(detections[idet], self.frame_id) + activated_starcks.append(unconfirmed[itracked]) + for it in u_unconfirmed: + track = unconfirmed[it] + track.mark_removed() + removed_stracks.append(track) + + # Step 4: Init new stracks + for inew in u_detection: + track = detections[inew] + if track.score < self.det_thresh: + continue + track.activate(self.kalman_filter, self.frame_id) + activated_starcks.append(track) + # Step 5: Update state + for track in self.lost_stracks: + if self.frame_id - track.end_frame > self.max_time_lost: + track.mark_removed() + removed_stracks.append(track) + + self.tracked_stracks = [ + t for t in self.tracked_stracks if t.state == TrackState.Tracked + ] + self.tracked_stracks = joint_stracks(self.tracked_stracks, + activated_starcks) + 
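+        # also merge tracks that were recovered from the lost list this frame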
self.tracked_stracks = joint_stracks(self.tracked_stracks, + refind_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, + self.tracked_stracks) + self.lost_stracks.extend(lost_stracks) + self.lost_stracks = sub_stracks(self.lost_stracks, + self.removed_stracks) + self.removed_stracks.extend(removed_stracks) + self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks( + self.tracked_stracks, self.lost_stracks) + output_stracks = [ + track for track in self.tracked_stracks if track.is_activated + ] + + return output_stracks + + +def joint_stracks(tlista, tlistb): + exists = {} + res = [] + for t in tlista: + exists[t.track_id] = 1 + res.append(t) + for t in tlistb: + tid = t.track_id + if not exists.get(tid, 0): + exists[tid] = 1 + res.append(t) + return res + + +def sub_stracks(tlista, tlistb): + stracks = {} + for t in tlista: + stracks[t.track_id] = t + for t in tlistb: + tid = t.track_id + if stracks.get(tid, 0): + del stracks[tid] + return list(stracks.values()) + + +def remove_duplicate_stracks(stracksa, stracksb): + pdist = matching.iou_distance(stracksa, stracksb) + pairs = np.where(pdist < 0.15) + dupa, dupb = list(), list() + for p, q in zip(*pairs): + timep = stracksa[p].frame_id - stracksa[p].start_frame + timeq = stracksb[q].frame_id - stracksb[q].start_frame + if timep > timeq: + dupb.append(q) + else: + dupa.append(p) + resa = [t for i, t in enumerate(stracksa) if i not in dupa] + resb = [t for i, t in enumerate(stracksb) if i not in dupb] + return resa, resb diff --git a/modelscope/models/cv/video_multi_object_tracking/utils/__init__.py b/modelscope/models/cv/video_multi_object_tracking/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_multi_object_tracking/utils/image.py b/modelscope/models/cv/video_multi_object_tracking/utils/image.py new file mode 100644 index 00000000..1e8cf76b --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/utils/image.py @@ -0,0 +1,73 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import cv2 +import numpy as np + + +def flip(img): + return img[:, :, ::-1].copy() + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, 0, output_size, inv=1) + for p in range(coords.shape[0]): + target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) + return target_coords + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=np.array([0, 0], dtype=np.float32), + inv=0): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + scale = np.array([scale, scale], dtype=np.float32) + + scale_tmp = scale + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return 
trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result diff --git a/modelscope/models/cv/video_multi_object_tracking/utils/kalman_filter.py b/modelscope/models/cv/video_multi_object_tracking/utils/kalman_filter.py new file mode 100644 index 00000000..c9cb0b41 --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/utils/kalman_filter.py @@ -0,0 +1,264 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import numpy as np +import scipy.linalg + +chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919 +} + + +class KalmanFilter(object): + """ + A simple Kalman filter for tracking bounding boxes in image space. + + The 8-dimensional state space + + x, y, a, h, vx, vy, va, vh + + contains the bounding box center position (x, y), aspect ratio a, height h, + and their respective velocities. + + Object motion follows a constant velocity model. The bounding box location + (x, y, a, h) is taken as direct observation of the state space (linear + observation model). + + """ + + def __init__(self): + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement): + """Create track from unassociated measurement. + + Args: + measurement : ndarray + Bounding box coordinates (x, y, a, h) with center position (x, y), + aspect ratio a, and height h. + + Returns: + (ndarray, ndarray): Returns the mean vector (8 dimensional) and covariance matrix + (8x8 dimensional) of the new track. Unobserved velocities are initialized + to 0 mean. + + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], 1e-5, + 10 * self._std_weight_velocity * measurement[3] + ] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean, covariance): + """Run Kalman filter prediction step. + + Args: + mean : ndarray + The 8 dimensional mean vector of the object state at the previous + time step. + covariance : ndarray + The 8x8 dimensional covariance matrix of the object state at the + previous time step. + + Returns: + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. 
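+
+        Example (illustrative; the measurement values are arbitrary)::
+
+            >>> kf = KalmanFilter()
+            >>> mean, covariance = kf.initiate(np.array([320., 240., 0.5, 80.]))
+            >>> mean, covariance = kf.predict(mean, covariance)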
+ + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], 1e-2, + self._std_weight_position * mean[3] + ] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], 1e-5, + self._std_weight_velocity * mean[3] + ] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(mean, self._motion_mat.T) + covariance = np.linalg.multi_dot( + (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, mean, covariance): + """Project state distribution to measurement space. + + Args: + mean : ndarray + The state's mean vector (8 dimensional array). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + + Returns: + (ndarray, ndarray) + Returns the projected mean and covariance matrix of the given state + estimate. + + """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], 1e-1, + self._std_weight_position * mean[3] + ] + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot( + (self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def multi_predict(self, mean, covariance): + """Run Kalman filter prediction step (Vectorized version). + + Args: + mean : ndarray + The Nx8 dimensional mean matrix of the object states at the previous + time step. + covariance : ndarray + The Nx8x8 dimensional covariance matrics of the object states at the + previous time step. + + Returns: + (ndarray, ndarray) + Returns the mean vector and covariance matrix of the predicted + state. Unobserved velocities are initialized to 0 mean. + """ + std_pos = [ + self._std_weight_position * mean[:, 3], + self._std_weight_position * mean[:, 3], + 1e-2 * np.ones_like(mean[:, 3]), + self._std_weight_position * mean[:, 3] + ] + std_vel = [ + self._std_weight_velocity * mean[:, 3], + self._std_weight_velocity * mean[:, 3], + 1e-5 * np.ones_like(mean[:, 3]), + self._std_weight_velocity * mean[:, 3] + ] + sqr = np.square(np.r_[std_pos, std_vel]).T + + motion_cov = [] + for i in range(len(mean)): + motion_cov.append(np.diag(sqr[i])) + motion_cov = np.asarray(motion_cov) + + mean = np.dot(mean, self._motion_mat.T) + left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) + covariance = np.dot(left, self._motion_mat.T) + motion_cov + + return mean, covariance + + def update(self, mean, covariance, measurement): + """Run Kalman filter correction step. + + Args: + mean : ndarray + The predicted state's mean vector (8 dimensional). + covariance : ndarray + The state's covariance matrix (8x8 dimensional). + measurement : ndarray + The 4 dimensional measurement vector (x, y, a, h), where (x, y) + is the center position, a the aspect ratio, and h the height of the + bounding box. + + Returns: + (ndarray, ndarray) + Returns the measurement-corrected state distribution. 
+ + """ + projected_mean, projected_cov = self.project(mean, covariance) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve((chol_factor, lower), + np.dot(covariance, + self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot( + (kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, + mean, + covariance, + measurements, + only_position=False, + metric='maha'): + """Compute gating distance between state distribution and measurements. + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Args: + mean : ndarray + Mean vector over the state distribution (8 dimensional). + covariance : ndarray + Covariance of the state distribution (8x8 dimensional). + measurements : ndarray + An Nx4 dimensional matrix of N measurements, each in + format (x, y, a, h) where (x, y) is the bounding box center + position, a the aspect ratio, and h the height. + only_position : Optional[bool] + If True, distance computation is done with respect to the bounding + box center position only. + + Returns: + an array of length N, where the i-th element contains the + squared Mahalanobis distance between (mean, covariance) and + `measurements[i]`. + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + d = measurements - mean + if metric == 'gaussian': + return np.sum(d * d, axis=1) + elif metric == 'maha': + cholesky_factor = np.linalg.cholesky(covariance) + z = scipy.linalg.solve_triangular( + cholesky_factor, + d.T, + lower=True, + check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + else: + raise ValueError('invalid distance metric') diff --git a/modelscope/models/cv/video_multi_object_tracking/utils/utils.py b/modelscope/models/cv/video_multi_object_tracking/utils/utils.py new file mode 100644 index 00000000..e961b720 --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/utils/utils.py @@ -0,0 +1,208 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import cv2 +import numpy as np +import torch + +from .image import transform_preds + + +def xyxy2xywh(x): + # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] + x[:, 2]) / 2 + y[:, 1] = (x[:, 1] + x[:, 3]) / 2 + y[:, 2] = x[:, 2] - x[:, 0] + y[:, 3] = x[:, 3] - x[:, 1] + return y + + +def xywh2xyxy(x): + # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] + y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) + y[:, 0] = (x[:, 0] - x[:, 2] / 2) + y[:, 1] = (x[:, 1] - x[:, 3] / 2) + y[:, 2] = (x[:, 0] + x[:, 2] / 2) + y[:, 3] = (x[:, 1] + x[:, 3] / 2) + return y + + +def bbox_iou(box1, box2, x1y1x2y2=False): + """ + Returns the IoU of two bounding boxes + """ + N, M = len(box1), len(box2) + box1 = torch.from_numpy(np.stack(box1)) + box2 = torch.from_numpy(np.stack(box2)) + if x1y1x2y2: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], 
box1[:, 1], box1[:, + 2], box1[:, + 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, + 2], box2[:, + 3] + else: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + + # get the coordinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1.unsqueeze(1), b2_x1) + inter_rect_y1 = torch.max(b1_y1.unsqueeze(1), b2_y1) + inter_rect_x2 = torch.min(b1_x2.unsqueeze(1), b2_x2) + inter_rect_y2 = torch.min(b1_y2.unsqueeze(1), b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp( + inter_rect_y2 - inter_rect_y1, 0) + # Union Area + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)) + b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).view(-1, 1).expand(N, M) + b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).view(1, -1).expand(N, M) + + return inter_area / (b1_area + b2_area - inter_area + 1e-16) + + +class LoadVideo: # for inference + + def __init__(self, path, img_size=(1088, 608)): + self.cap = cv2.VideoCapture(path) + self.frame_rate = int(round(self.cap.get(cv2.CAP_PROP_FPS))) + self.vw = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + self.vh = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + self.vn = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + self.width = img_size[0] + self.height = img_size[1] + self.count = 0 + + self.w, self.h = 1920, 1080 + print('Length of the video: {:d} frames'.format(self.vn)) + + def get_size(self, vw, vh, dw, dh): + wa, ha = float(dw) / vw, float(dh) / vh + a = min(wa, ha) + return int(vw * a), int(vh * a) + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if self.count == len(self): + raise StopIteration + # Read image + res, img0 = self.cap.read() # BGR + assert img0 is not None, 'Failed to load frame {:d}'.format(self.count) + img0 = cv2.resize(img0, (self.w, self.h)) + + # Padded resize + shape = [self.height, self.width] + img, ratio, pad = letterbox(img0, shape) + + # Normalize RGB + img = img[:, :, ::-1].transpose(2, 0, 1) + img = np.ascontiguousarray(img, dtype=np.float32) + img /= 255.0 + + return self.count, img, img0 + + def __len__(self): + return self.vn # number of files + + +def letterbox(img, + new_shape=(640, 640), + color=(114, 114, 114), + auto=True, + scaleFill=False, + scaleup=True): + # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232 + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[ + 1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, 64), np.mod(dh, 64) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[ + 0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # 
resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder( + img, top, bottom, left, right, cv2.BORDER_CONSTANT, + value=color) # add border + return img, ratio, (dw, dh) + + +def _gather_feat(feat, ind, mask=None): + dim = feat.size(2) + ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + + +def _tranpose_and_gather_feat(feat, ind): + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = _gather_feat(feat, ind) + return feat + + +class cfg_opt: + K = 500 + arch = 'yolo' + conf_thres = 0.4 + down_ratio = 4 + head_conv = 256 + heads = {'hm': 1, 'wh': 4, 'id': 64, 'reg': 2} + img_size = (1088, 608) + ltrb = True + mean = [0.408, 0.447, 0.47] + min_box_area = 100 + num_classes = 1 + reg_offset = True + reid_dim = 64 + std = [0.289, 0.274, 0.278] + track_buffer = 30 + + +def ctdet_post_process(dets, c, s, h, w, num_classes): + ret = [] + for i in range(dets.shape[0]): + top_preds = {} + dets[i, :, :2] = transform_preds(dets[i, :, 0:2], c[i], s[i], (w, h)) + dets[i, :, 2:4] = transform_preds(dets[i, :, 2:4], c[i], s[i], (w, h)) + classes = dets[i, :, -1] + for j in range(num_classes): + inds = (classes == j) + det4 = dets[i, inds, :4].astype(np.float32) + det5 = dets[i, inds, 4:5].astype(np.float32) + top_preds[j + 1] = np.concatenate([det4, det5], axis=1).tolist() + ret.append(top_preds) + return ret diff --git a/modelscope/models/cv/video_multi_object_tracking/utils/visualization.py b/modelscope/models/cv/video_multi_object_tracking/utils/visualization.py new file mode 100644 index 00000000..8ed7b601 --- /dev/null +++ b/modelscope/models/cv/video_multi_object_tracking/utils/visualization.py @@ -0,0 +1,85 @@ +# The implementation is adopted from FairMOT, +# made publicly available under the MIT License at https://github.com/ifzhang/FairMOT +import cv2 +import numpy as np + + +def get_color(idx): + idx = idx * 3 + color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) + + return color + + +def plot_tracking(image, + tlwhs, + obj_ids, + scores=None, + frame_id=0, + fps=0., + ids2=None): + im = np.ascontiguousarray(np.copy(image)) + text_scale = max(1, image.shape[1] / 1600.) 
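+    # scale the overlay text and box line width with the frame width so the
+    # annotations remain readable across input resolutions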
+ text_thickness = 2 + line_thickness = max(1, int(image.shape[1] / 500.)) + + cv2.putText( + im, + 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), + (0, int(15 * text_scale)), + cv2.FONT_HERSHEY_PLAIN, + text_scale, (0, 0, 255), + thickness=2) + + for i, tlwh in enumerate(tlwhs): + x1, y1, w, h = tlwh + intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) + obj_id = int(obj_ids[i]) + id_text = '{}'.format(int(obj_id)) + if ids2 is not None: + id_text = id_text + ', {}'.format(int(ids2[i])) + color = get_color(abs(obj_id)) + cv2.rectangle( + im, + intbox[0:2], + intbox[2:4], + color=color, + thickness=line_thickness) + cv2.putText( + im, + id_text, (intbox[0], intbox[1] + 30), + cv2.FONT_HERSHEY_PLAIN, + text_scale, (0, 0, 255), + thickness=text_thickness) + return im + + +def show_multi_object_tracking_result(video_in_path, bboxes, video_save_path): + cap = cv2.VideoCapture(video_in_path) + frame_idx = 0 + while (cap.isOpened()): + frame_idx += 1 + success, frame = cap.read() + if not success: + if frame_idx == 1: + raise Exception(video_in_path, + ' can not be correctly decoded by OpenCV.') + else: + break + cur_frame_boxes = [] + cur_obj_ids = [] + for box in bboxes: + if box[0] == frame_idx: + cur_frame_boxes.append( + [box[2], box[3], box[4] - box[2], box[5] - box[3]]) + cur_obj_ids.append(box[1]) + if frame_idx == 1: + size = (frame.shape[1], frame.shape[0]) + fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') + video_writer = cv2.VideoWriter(video_save_path, fourcc, + cap.get(cv2.CAP_PROP_FPS), size, + True) + frame = plot_tracking(frame, cur_frame_boxes, cur_obj_ids, frame_idx) + video_writer.write(frame) + video_writer.release + cap.release() diff --git a/modelscope/models/cv/video_stabilization/DUT/DUT_raft.py b/modelscope/models/cv/video_stabilization/DUT/DUT_raft.py new file mode 100644 index 00000000..9eee68d7 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/DUT_raft.py @@ -0,0 +1,410 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import sys + +import cv2 +import numpy as np +import torch +import torch.nn as nn + +from modelscope.models.cv.video_stabilization.utils.image_utils import topk_map +from modelscope.models.cv.video_stabilization.utils.IterativeSmooth import \ + generateSmooth +from modelscope.models.cv.video_stabilization.utils.MedianFilter import ( + MultiMotionPropagate, SingleMotionPropagate) +from modelscope.models.cv.video_stabilization.utils.RAFTUtils import \ + InputPadder +from .config import cfg +from .MotionPro import MotionPro +from .RAFT.raft import RAFT +from .rf_det_so import RFDetSO +from .Smoother import Smoother + + +class KeypointDetction(nn.Module): + + def __init__(self, RFDetPath='', topK=cfg.TRAIN.TOPK, detectorType=0): + super(KeypointDetction, self).__init__() + self.feature_params = dict( + maxCorners=topK, qualityLevel=0.3, minDistance=7, blockSize=7) + + self.TOPK = topK + self.type = detectorType + + def forward(self, im_data): + ''' + @param im_data [B, 1, H, W] gray images + @return im_topk [B, 1, H, W] + @return kpts [[N, 4] for B] (B, 0, H, W) + ''' + + device = im_data.device + im1 = im_data + im1 = (im1.cpu().numpy() * 255).astype(np.uint8) + batch = im1.shape[0] + assert im1.shape[1] == 1 + im_topK = torch.zeros((batch, 1, im1.shape[2], im1.shape[3]), + device=device) + for idx in range(batch): + im = im1[idx, 0] + + if self.type == 0: + p = cv2.goodFeaturesToTrack( + im, mask=None, **self.feature_params) + p = p[:, 
0, :] # N, 2 + im_topK[idx, 0, p[:, 1], p[:, 0]] = 1. + kpts = im_topK.nonzero() + kpts = [kpts[kpts[:, 0] == idx, :] for idx in range(batch)] # B, N, 4 + return im_topK, kpts + + +class RFDetection(nn.Module): + + def __init__(self, RFDetPath, topK=cfg.TRAIN.TOPK): + super(RFDetection, self).__init__() + + self.det = RFDetSO( + cfg.TRAIN.score_com_strength, + cfg.TRAIN.scale_com_strength, + cfg.TRAIN.NMS_THRESH, + cfg.TRAIN.NMS_KSIZE, + cfg.TRAIN.TOPK, + cfg.MODEL.GAUSSIAN_KSIZE, + cfg.MODEL.GAUSSIAN_SIGMA, + cfg.MODEL.KSIZE, + cfg.MODEL.padding, + cfg.MODEL.dilation, + cfg.MODEL.scale_list, + ) + + self.TOPK = topK + + def forward(self, im_data, batch=2, allInfer=False): + ''' + @param im_data [B, 1, H, W] + @return im_topk [B, 1, H, W] + @return kpts [[N, 4] for B] (B, 0, H, W) + ''' + if allInfer: + im_data = im_data + im_rawsc, _, _ = self.det(im_data) + im_score = self.det.process(im_rawsc)[0] + im_topk = topk_map(im_score, self.TOPK).permute(0, 3, 1, + 2) # B, 1, H, W + kpts = im_topk.nonzero() # (B*topk, 4) + kpts = [ + kpts[kpts[:, 0] == idx, :] for idx in range(im_data.shape[0]) + ] # [[N, 4] for B] + im_topk = im_topk.float() + else: + im_topK_ = [] + kpts_ = [] + for j in range(0, im_data.shape[0], batch): + im_data_clip = im_data[j:j + batch] + im_rawsc, _, _ = self.det(im_data_clip) + im_score = self.det.process(im_rawsc)[0] + im_topk = topk_map(im_score, + self.TOPK).permute(0, 3, 1, 2) # B, 1, H, W + kpts = im_topk.nonzero() # (B*topk, 4) + kpts = [ + kpts[kpts[:, 0] == idx, :] + for idx in range(im_data_clip.shape[0]) + ] # [[N, 4] for B] + im_topk = im_topk.float() + im_topK_.append(im_topk) + kpts_ = kpts_ + kpts + kpts = kpts_ + im_topk = torch.cat(im_topK_, 0) + + return im_topk, kpts # B, 1, H, W; N, 4; + + def reload(self, RFDetPath): + + print('reload RFDet Model') + pretrained_dict = torch.load(RFDetPath)['state_dict'] + model_dict = self.det.state_dict() + pretrained_dict = { + k[4:]: v + for k, v in pretrained_dict.items() + if k[:3] == 'det' and k[4:] in model_dict + } + assert len(pretrained_dict.keys()) > 0 + model_dict.update(pretrained_dict) + assert len(model_dict.keys()) == len( + pretrained_dict.keys()), 'mismatch for RFDet' + self.det.load_state_dict(model_dict) + print('successfully load {} params for RFDet'.format(len(model_dict))) + + +class MotionEstimation(nn.Module): + + def __init__(self, args, RAFTPath=''): + super(MotionEstimation, self).__init__() + self.RAFT = RAFT(args) + # self.RAFT.eval() + # self.RAFT.cuda() + + def forward(self, x, x_RGB, im_topk, kpts): + ''' + @param im_data [B, 1, H, W] + @param im_topk [B, 1, H, W] + @param kpts [[N, 4] for B] (B, 0, H, W) + @param OpticalFlow [B, 2, H, W] precomputed optical flow; optional, default None + @param RGBImages [B, 3, H, W] RGB images for optical flow computation, optional, default None + ''' + if self.RAFT is None: + raise NotImplementedError() + + optical_flow = [] + for i in range(0, x_RGB.shape[1] - 1): + padder = InputPadder(x_RGB[:, i, :, :, :].shape) + image1, image2 = padder.pad(x_RGB[:, i, :, :, :], + x_RGB[:, (i + 1), :, :, :]) + flow_low, flow_up = self.RAFT( + image1.cuda(), image2.cuda(), iters=20, test_mode=True) + optical_flow.append(flow_up) + + x_RGB = x_RGB.cpu() + torch.cuda.empty_cache() + optical_flow = torch.cat(optical_flow, 0) + + flow_masked = optical_flow * im_topk[:-1] # B - 1, 2, H, W + + return flow_masked + + def reload(self, RAFTPath): + self.RAFT.load_state_dict({ + strKey.replace('module.', ''): tenWeight + for strKey, tenWeight in 
torch.load(RAFTPath).items() + }) + print('successfully load all params for RAFT') + + +class KLT(nn.Module): + + def __init__(self, RAFTPath=''): + super(KLT, self).__init__() + self.lk_params = dict( + winSize=(15, 15), + maxLevel=2, + criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 20, + 0.03)) + + def forward(self, x, x_RGB, im_topk, kpts): + ''' + @param im_data [B, 1, H, W] + @param im_topk [B, 1, H, W] + @param kpts [[N, 4] for B] (B, 0, H, W) + @param OpticalFlow [B, 2, H, W] precomputed optical flow; optional, default None + @param RGBImages [B, 3, H, W] RGB images for optical flow computation, optional, default None + ''' + batch, _, height, width = x.shape + im_cpu = (x.cpu().numpy() * 255.).astype(np.uint8)[:, 0, :, :] + OpticalFlow = np.zeros((batch - 1, 2, height, width)) + for j in range(batch - 1): + p0 = kpts[j].detach().cpu().numpy()[:, ::-1] + p0 = np.expand_dims(p0[:, :2], 1).astype(np.float32) + + p1, _, _ = cv2.calcOpticalFlowPyrLK(im_cpu[j], im_cpu[j + 1], p0, + None, **self.lk_params) + op = p1 - p0 + p0 = p0.astype(np.uint8) + OpticalFlow[j, :, p0[:, 0, 1], p0[:, 0, 0]] = op[:, 0, :] + + return torch.from_numpy(OpticalFlow.astype(np.float32)).to(x.device) + + +class motionPropagate(object): + + def __init__(self, inferenceMethod): + self.inference = inferenceMethod + + +class JacobiSolver(nn.Module): + + def __init__(self): + super(JacobiSolver, self).__init__() + self.generateSmooth = generateSmooth + self.KernelSmooth = Smoother().KernelSmooth + + def forward(self, x): + return None + + +class DUT(nn.Module): + + def __init__(self, + SmootherPath='', + RFDetPath='', + RAFTPath='', + MotionProPath='', + homo=True, + args=None): + super(DUT, self).__init__() + print('------------------model configuration----------------------') + + if RFDetPath != '': + print('using RFNet ...') + self.keypointModule = RFDetection(RFDetPath) + else: + print('using corner keypoint detector...') + self.keypointModule = KeypointDetction() + + if RAFTPath != '': + print('using RAFT for motion estimation...') + self.motionEstimation = MotionEstimation(args, RAFTPath) + else: + print('using KLT tracker for motion estimation...') + self.motionEstimation = KLT() + + if MotionProPath != '': + if homo: + print('using Motion Propagation model with multi homo...') + self.motionPro = MotionPro(globalchoice='multi') + else: + print('using Motion Propagation model with single homo...') + self.motionPro = MotionPro(globalchoice='single') + else: + if homo: + print('using median filter with multi homo...') + self.motionPro = motionPropagate(MultiMotionPropagate) + else: + print('using median filter with single homo...') + self.motionPro = motionPropagate(SingleMotionPropagate) + + if SmootherPath != '': + print('using Deep Smoother Model...') + self.smoother = Smoother() + else: + print('using Jacobi Solver ...') + self.smoother = JacobiSolver() + + self.reload(SmootherPath, RFDetPath, RAFTPath, MotionProPath) + + def forward(self, x, x_RGB, repeat=50): + return self.inference(x, x_RGB, repeat) + + def inference(self, x, x_RGB, repeat=50): + """ + @param: x [B, C, T, H, W] Assume B is 1 here, a set of Gray images + @param: x_RGB [B, C, T, H, W] Assume B is 1 here, a set of RGB images + @param: repeat int repeat time for the smoother module + + @return: smoothPath + """ + + x = x.permute(0, 2, 1, 3, 4).squeeze(0) # T, C, H, W + + # keypoint extraction + print('------------------detect keypoints-------------------------') + im_topk, kpts = self.keypointModule.forward( + x) # T, 1, H, W; 
list([N, 4]) + + # This will slow down the code but save GPU memory + x = x.cpu() + torch.cuda.empty_cache() + + print('------------------estimate motion--------------------------') + masked_flow = self.motionEstimation.forward(x, x_RGB, im_topk, kpts) + + x_RGB = x_RGB.cpu() + im_topk = im_topk.cpu() + torch.cuda.empty_cache() + + del x + del x_RGB + del im_topk + + print('------------------motion propagation-----------------------') + origin_motion = [ + self.motionPro.inference(masked_flow[i:i + 1, 0:1, :, :].cuda(), + masked_flow[i:i + 1, 1:2, :, :].cuda(), + kpts[i]).cpu() + for i in range(len(kpts) - 1) + ] + + origin_motion = torch.stack(origin_motion, 2).cuda() # B, 2, T, H, W + origin_motion = torch.cat([ + torch.zeros_like(origin_motion[:, :, 0:1, :, :]).to( + origin_motion.device), origin_motion + ], 2) + + origin_motion = torch.cumsum(origin_motion, 2) + min_value = torch.min(origin_motion) + origin_motion = origin_motion - min_value + max_value = torch.max(origin_motion) + 1e-5 + origin_motion = origin_motion / max_value + + smoothKernel = self.smoother(origin_motion.cuda()) + + smoothPath = torch.cat( + self.smoother.KernelSmooth(smoothKernel, origin_motion.cuda(), + repeat), 1) # B, 2, T, H, W + smoothPath = smoothPath * max_value + min_value + origin_motion = origin_motion * max_value + min_value + + return origin_motion, smoothPath + + def reload(self, SmootherPath, RFDetPath, RAFTPath, MotionProPath): + print('------------------reload parameters------------------------') + + if SmootherPath == '': + print('No parameters for JacobiSolver') + else: + print('reload Smoother params') + pretrained_dict = torch.load(SmootherPath) + model_dict = self.smoother.state_dict() + pretrained_dict = { + k: v + for k, v in pretrained_dict.items() if k in model_dict + } + assert len(pretrained_dict.keys()) > 0 + assert len(model_dict.keys()) == len(pretrained_dict.keys()) + model_dict.update(pretrained_dict) + assert len(model_dict.keys()) == len(pretrained_dict.keys()) + self.smoother.load_state_dict(model_dict) + print('successfully load {} params for smoother'.format( + len(model_dict))) + + if RFDetPath != '': + self.keypointModule.reload(RFDetPath) + else: + print('No parameters for Keypoint detector') + + if RAFTPath == '': + print('No parameters for Optical flow') + else: + print('reload RAFT Model') + self.motionEstimation.reload(RAFTPath) + + if MotionProPath == '': + print('No parameters for motion propagation') + else: + print('reload MotionPropagation Model') + model_dict_motion = torch.load(MotionProPath) + model_dict = self.motionPro.state_dict() + model_dict_motion = { + k: v + for k, v in model_dict_motion.items() if k in model_dict + } + assert len(model_dict_motion.keys()) > 0 + model_dict.update(model_dict_motion) + assert len(model_dict_motion.keys()) == len(model_dict.keys()) + self.motionPro.load_state_dict(model_dict) + print('successfully load {} params for MotionPropagation'.format( + len(model_dict))) + + +if __name__ == '__main__': + + im_raw = np.random.randn(1, 3, 20, 240, + 320).astype(np.float32) # B, 3, T, H, W (RGB) + im_data = im_raw[:, 0:1, :, :, :] + im_raw = torch.from_numpy(im_raw).cuda() + im_data = torch.from_numpy(im_data).cuda() + + model = DUT('1', '2', '3', '4') + model.cuda() + model.eval() + smoothPath = model.inference(im_data, im_raw) diff --git a/modelscope/models/cv/video_stabilization/DUT/MotionPro.py b/modelscope/models/cv/video_stabilization/DUT/MotionPro.py new file mode 100644 index 00000000..85710e86 --- /dev/null +++ 
b/modelscope/models/cv/video_stabilization/DUT/MotionPro.py @@ -0,0 +1,128 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import math +import os + +import cv2 +import numpy as np +import torch +import torch.nn as nn + +from modelscope.models.cv.video_stabilization.utils.MedianFilter import \ + MedianPool2d +from modelscope.models.cv.video_stabilization.utils.ProjectionUtils import ( + multiHomoEstimate, singleHomoEstimate) +from .config import cfg + + +class MotionPro(nn.Module): + + def __init__(self, + inplanes=2, + embeddingSize=64, + hiddenSize=128, + number_points=512, + kernel=5, + globalchoice='multi'): + + super(MotionPro, self).__init__() + self.embedding = nn.Sequential( + nn.Conv1d(inplanes, embeddingSize, 1), + nn.ReLU(), + ) + self.embedding_motion = nn.Sequential( + nn.Conv1d(inplanes, embeddingSize, 1), + nn.ReLU(), + ) + + self.pad = kernel // 2 + self.conv1 = nn.Conv1d(embeddingSize, embeddingSize, 1) + self.conv2 = nn.Conv1d(embeddingSize, embeddingSize // 2, 1) + self.conv3 = nn.Conv1d(embeddingSize // 2, 1, 1) + + self.weighted = nn.Softmax(dim=2) + + self.relu = nn.ReLU() + self.leakyRelu = nn.LeakyReLU(0.1) + + self.m_conv1 = nn.Conv1d(embeddingSize, 2 * embeddingSize, 1) + self.m_conv2 = nn.Conv1d(2 * embeddingSize, 2 * embeddingSize, 1) + self.m_conv3 = nn.Conv1d(2 * embeddingSize, embeddingSize, 1) + + self.fuse_conv1 = nn.Conv1d(embeddingSize + embeddingSize // 2, + embeddingSize, 1) + self.fuse_conv2 = nn.Conv1d(embeddingSize, embeddingSize, 1) + + self.decoder = nn.Linear(embeddingSize, 2, bias=False) + + if globalchoice == 'multi': + self.homoEstimate = multiHomoEstimate + elif globalchoice == 'single': + self.homoEstimate = singleHomoEstimate + + self.meidanPool = MedianPool2d(5, same=True) + + def forward(self, motion): + ''' + @param: motion contains distance info and motion info of keypoints + + @return: return predicted motion for each grid vertex + ''' + distance_info = motion[:, 0:2, :] + motion_info = motion[0:1, 2:4, :] + + embedding_distance = self.embedding(distance_info) + embedding_distance = self.leakyRelu(self.conv1(embedding_distance)) + embedding_distance = self.leakyRelu(self.conv2(embedding_distance)) + distance_weighted = self.weighted(self.conv3(embedding_distance)) + + embedding_motion = self.embedding_motion(motion_info) + embedding_motion = self.leakyRelu(self.m_conv1(embedding_motion)) + embedding_motion = self.leakyRelu(self.m_conv2(embedding_motion)) + embedding_motion = self.leakyRelu(self.m_conv3(embedding_motion)) + embedding_motion = embedding_motion.repeat(distance_info.shape[0], 1, + 1) + + embedding_motion = torch.cat([embedding_motion, embedding_distance], 1) + embedding_motion = self.leakyRelu(self.fuse_conv1(embedding_motion)) + embedding_motion = self.leakyRelu(self.fuse_conv2(embedding_motion)) + + embedding_motion = torch.sum(embedding_motion * distance_weighted, 2) + + out_motion = self.decoder(embedding_motion) + + return out_motion + + def inference(self, x_flow, y_flow, kp): + """ + @param x_flow [B, 1, H, W] + @param y_flow [B, 1, H, W] + @param kp [B*topk, 4 / 2]->[N, 4/2] + """ + if kp.shape[1] == 4: + kp = kp[:, 2:] + index = kp.long() + origin_motion = torch.cat([x_flow, y_flow], 1) + extracted_motion = origin_motion[0, :, index[:, 0], index[:, 1]] + kp = kp.permute(1, 0).float() + concat_motion = torch.cat([kp[1:2, :], kp[0:1, :], extracted_motion], + 0) + + motion, gridsMotion, _ = self.homoEstimate(concat_motion, kp) + 
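
Editor's note: MotionPro.forward above reduces the per-keypoint embeddings to one motion vector per mesh vertex through a softmax over the keypoint axis. A stripped-down sketch of that weighted-aggregation step on random tensors; the assumption that the batch dimension enumerates mesh vertices follows from how inference reshapes the output to the grid, and all names below are illustrative:

    import torch
    import torch.nn as nn

    vertices, channels, keypoints = 1200, 64, 512   # 30 x 40 mesh, TOPK keypoints
    embedding = torch.randn(vertices, channels, keypoints)

    # one scalar score per keypoint, normalized over the keypoint axis
    score = nn.Conv1d(channels, 1, 1)(embedding)          # [V, 1, N]
    weights = torch.softmax(score, dim=2)                 # [V, 1, N]

    # weighted sum over keypoints, then decode to a 2D motion per vertex
    pooled = torch.sum(embedding * weights, dim=2)        # [V, C]
    motion = nn.Linear(channels, 2, bias=False)(pooled)   # [V, 2]
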
GridMotion = (self.forward(motion) + + gridsMotion.squeeze(-1)) * cfg.MODEL.FLOWC + GridMotion = GridMotion.view(cfg.MODEL.HEIGHT // cfg.MODEL.PIXELS, + cfg.MODEL.WIDTH // cfg.MODEL.PIXELS, 2) + GridMotion = GridMotion.permute(2, 0, 1).unsqueeze(0) + GridMotion = self.meidanPool(GridMotion) + return GridMotion + + +if __name__ == '__main__': + model = MotionPro() + model.train() + model.cuda() + x = torch.from_numpy(np.random.randn(4, 512).astype(np.float32)).cuda() + kp = torch.from_numpy(np.random.randn(512, 2).astype(np.float32)).cuda() + model.train_step(x, kp) diff --git a/modelscope/models/cv/video_stabilization/DUT/RAFT/__init__.py b/modelscope/models/cv/video_stabilization/DUT/RAFT/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_stabilization/DUT/RAFT/corr.py b/modelscope/models/cv/video_stabilization/DUT/RAFT/corr.py new file mode 100644 index 00000000..3fcc56c4 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/RAFT/corr.py @@ -0,0 +1,98 @@ +# Part of the implementation is borrowed and modified from RAFT, +# publicly available at https://github.com/princeton-vl/RAFT + +import torch +import torch.nn.functional as F + +from modelscope.models.cv.video_stabilization.utils.RAFTUtils import ( + bilinear_sampler, coords_grid) + +try: + import alt_cuda_corr +except Exception: + # alt_cuda_corr is not compiled + pass + + +class CorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + + # all pairs correlation + corr = CorrBlock.corr(fmap1, fmap2) + + batch, h1, w1, dim, h2, w2 = corr.shape + corr = corr.reshape(batch * h1 * w1, dim, h2, w2) + + self.corr_pyramid.append(corr) + for i in range(self.num_levels - 1): + corr = F.avg_pool2d(corr, 2, stride=2) + self.corr_pyramid.append(corr) + + def __call__(self, coords): + r = self.radius + coords = coords.permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + out_pyramid = [] + for i in range(self.num_levels): + corr = self.corr_pyramid[i] + dx = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + dy = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + delta = torch.stack(torch.meshgrid(dy, dx), axis=-1) + + centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) / 2**i + delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) + coords_lvl = centroid_lvl + delta_lvl + + corr = bilinear_sampler(corr, coords_lvl) + corr = corr.view(batch, h1, w1, -1) + out_pyramid.append(corr) + + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + @staticmethod + def corr(fmap1, fmap2): + batch, dim, ht, wd = fmap1.shape + fmap1 = fmap1.view(batch, dim, ht * wd) + fmap2 = fmap2.view(batch, dim, ht * wd) + + corr = torch.matmul(fmap1.transpose(1, 2), fmap2) + corr = corr.view(batch, ht, wd, 1, ht, wd) + return corr / torch.sqrt(torch.tensor(dim).float()) + + +class AlternateCorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + + self.pyramid = [(fmap1, fmap2)] + for i in range(self.num_levels): + fmap1 = F.avg_pool2d(fmap1, 2, stride=2) + fmap2 = F.avg_pool2d(fmap2, 2, stride=2) + self.pyramid.append((fmap1, fmap2)) + + def __call__(self, coords): + coords = coords.permute(0, 2, 3, 1) + B, H, W, _ = coords.shape + dim = self.pyramid[0][0].shape[1] + + corr_list = [] + for i in range(self.num_levels): + r = self.radius + fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 
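
Editor's note: CorrBlock.corr above builds the all-pairs correlation volume with a single matrix product between the two feature maps, scaled by the square root of the feature dimension. A compact sketch of that computation and the resulting shapes (the feature maps are random stand-ins, with the 256-channel size of BasicEncoder):

    import torch

    batch, dim, ht, wd = 1, 256, 30, 40
    fmap1 = torch.randn(batch, dim, ht, wd)
    fmap2 = torch.randn(batch, dim, ht, wd)

    # flatten the spatial grid and correlate every pixel of fmap1 with every pixel of fmap2
    f1 = fmap1.view(batch, dim, ht * wd)
    f2 = fmap2.view(batch, dim, ht * wd)
    corr = torch.matmul(f1.transpose(1, 2), f2) / dim ** 0.5
    corr = corr.view(batch, ht, wd, 1, ht, wd)   # (B, H, W, 1, H, W), as in CorrBlock
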
1).contiguous() + fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() + + coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() + corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) + corr_list.append(corr.squeeze(1)) + + corr = torch.stack(corr_list, dim=1) + corr = corr.reshape(B, -1, H, W) + return corr / torch.sqrt(torch.tensor(dim).float()) diff --git a/modelscope/models/cv/video_stabilization/DUT/RAFT/extractor.py b/modelscope/models/cv/video_stabilization/DUT/RAFT/extractor.py new file mode 100644 index 00000000..3e15341c --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/RAFT/extractor.py @@ -0,0 +1,288 @@ +# Part of the implementation is borrowed and modified from RAFT, +# publicly available at https://github.com/princeton-vl/RAFT + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(ResidualBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, padding=1, stride=stride) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm3 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes) + self.norm2 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm3 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + if not stride == 1: + self.norm3 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm3) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BottleneckBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(BottleneckBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes // 4, kernel_size=1, padding=0) + self.conv2 = nn.Conv2d( + planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride) + self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm4 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes // 4) + self.norm2 = nn.BatchNorm2d(planes // 4) + self.norm3 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm4 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes // 4) + self.norm2 = nn.InstanceNorm2d(planes // 4) + self.norm3 = 
nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm4 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + self.norm3 = nn.Sequential() + if not stride == 1: + self.norm4 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm4) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + y = self.relu(self.norm3(self.conv3(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BasicEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(BasicEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=2) + self.layer3 = self._make_layer(128, stride=2) + + # output convolution + self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x + + +class SmallEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(SmallEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(32) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(32) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 32 + self.layer1 = self._make_layer(32, stride=1) + self.layer2 = self._make_layer(64, stride=2) + self.layer3 = self._make_layer(96, stride=2) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + self.conv2 = nn.Conv2d(96, output_dim, 
kernel_size=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = BottleneckBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x diff --git a/modelscope/models/cv/video_stabilization/DUT/RAFT/raft.py b/modelscope/models/cv/video_stabilization/DUT/RAFT/raft.py new file mode 100644 index 00000000..3626046c --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/RAFT/raft.py @@ -0,0 +1,163 @@ +# Part of the implementation is borrowed and modified from RAFT, +# publicly available at https://github.com/princeton-vl/RAFT + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.video_stabilization.utils.RAFTUtils import ( + bilinear_sampler, coords_grid, upflow8) +from .corr import AlternateCorrBlock, CorrBlock +from .extractor import BasicEncoder, SmallEncoder +from .update import BasicUpdateBlock, SmallUpdateBlock + +try: + autocast = torch.cuda.amp.autocast +except Exception: + # dummy autocast for PyTorch < 1.6 + class autocast: + + def __init__(self, enabled): + pass + + def __enter__(self): + pass + + def __exit__(self, *args): + pass + + +class RAFT(nn.Module): + + def __init__(self, args): + super(RAFT, self).__init__() + self.args = args + + if args.small: + self.hidden_dim = hdim = 96 + self.context_dim = cdim = 64 + args.corr_levels = 4 + args.corr_radius = 3 + + else: + self.hidden_dim = hdim = 128 + self.context_dim = cdim = 128 + args.corr_levels = 4 + args.corr_radius = 4 + + if 'dropout' not in self.args: + self.args.dropout = 0 + + if 'alternate_corr' not in self.args: + self.args.alternate_corr = False + + # feature network, context network, and update block + if args.small: + self.fnet = SmallEncoder( + output_dim=128, norm_fn='instance', dropout=args.dropout) + self.cnet = SmallEncoder( + output_dim=hdim + cdim, norm_fn='none', dropout=args.dropout) + self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim) + + else: + self.fnet = BasicEncoder( + output_dim=256, norm_fn='instance', dropout=args.dropout) + self.cnet = BasicEncoder( + output_dim=hdim + cdim, norm_fn='batch', dropout=args.dropout) + self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, C, H, W = img.shape + coords0 = coords_grid(N, H // 8, W // 8, device=img.device) + coords1 = coords_grid(N, H // 8, W // 8, device=img.device) + 
+ # optical flow computed as difference: flow = coords1 - coords0 + return coords0, coords1 + + def upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, _, H, W = flow.shape + mask = mask.view(N, 1, 9, 8, 8, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = F.unfold(8 * flow, [3, 3], padding=1) + up_flow = up_flow.view(N, 2, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, 2, 8 * H, 8 * W) + + def forward(self, + image1, + image2, + iters=12, + flow_init=None, + upsample=True, + test_mode=False): + """ Estimate optical flow between pair of frames """ + + image1 = 2 * (image1 / 255.0) - 1.0 + image2 = 2 * (image2 / 255.0) - 1.0 + + image1 = image1.contiguous() + image2 = image2.contiguous() + + hdim = self.hidden_dim + cdim = self.context_dim + + # run the feature network + with autocast(enabled=self.args.mixed_precision): + fmap1, fmap2 = self.fnet([image1, image2]) + + fmap1 = fmap1.float() + fmap2 = fmap2.float() + if self.args.alternate_corr: + corr_fn = AlternateCorrBlock( + fmap1, fmap2, radius=self.args.corr_radius) + else: + corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + + # run the context network + with autocast(enabled=self.args.mixed_precision): + cnet = self.cnet(image1) + net, inp = torch.split(cnet, [hdim, cdim], dim=1) + net = torch.tanh(net) + inp = torch.relu(inp) + + coords0, coords1 = self.initialize_flow(image1) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + + flow = coords1 - coords0 + with autocast(enabled=self.args.mixed_precision): + net, up_mask, delta_flow = self.update_block( + net, inp, corr, flow) + + # F(t+1) = F(t) + \Delta(t) + coords1 = coords1 + delta_flow + + # upsample predictions + if up_mask is None: + flow_up = upflow8(coords1 - coords0) + else: + flow_up = self.upsample_flow(coords1 - coords0, up_mask) + + flow_predictions.append(flow_up) + + if test_mode: + return coords1 - coords0, flow_up + + return flow_predictions diff --git a/modelscope/models/cv/video_stabilization/DUT/RAFT/update.py b/modelscope/models/cv/video_stabilization/DUT/RAFT/update.py new file mode 100644 index 00000000..280b7c8f --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/RAFT/update.py @@ -0,0 +1,160 @@ +# Part of the implementation is borrowed and modified from RAFT, +# publicly available at https://github.com/princeton-vl/RAFT + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FlowHead(nn.Module): + + def __init__(self, input_dim=128, hidden_dim=256): + super(FlowHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + + +class ConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convr = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convq = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = 
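
Editor's note: upsample_flow above turns the 1/8-resolution flow into a full-resolution field as a convex combination of the 3x3 coarse neighbours, with per-pixel weights predicted by the update block's 64*9-channel mask head. The same computation on random tensors, to make the shapes explicit (values are placeholders):

    import torch
    import torch.nn.functional as F

    N, H, W = 1, 30, 40
    flow = torch.randn(N, 2, H, W)          # coarse flow at 1/8 resolution
    mask = torch.randn(N, 64 * 9, H, W)     # would come from BasicUpdateBlock

    mask = mask.view(N, 1, 9, 8, 8, H, W)
    mask = torch.softmax(mask, dim=2)       # convex weights over the 3x3 neighbourhood

    up_flow = F.unfold(8 * flow, [3, 3], padding=1)   # scale flow by 8, gather 3x3 patches
    up_flow = up_flow.view(N, 2, 9, 1, 1, H, W)

    up_flow = torch.sum(mask * up_flow, dim=2)
    up_flow = up_flow.permute(0, 1, 4, 2, 5, 3).reshape(N, 2, 8 * H, 8 * W)
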
torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + + h = (1 - z) * h + z * q + return h + + +class SepConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convr1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convq1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + + self.convz2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convr2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convq2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + + def forward(self, h, x): + # horizontal + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + return h + + +class SmallMotionEncoder(nn.Module): + + def __init__(self, args): + super(SmallMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0) + self.convf1 = nn.Conv2d(2, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 32, 3, padding=1) + self.conv = nn.Conv2d(128, 80, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class BasicMotionEncoder(nn.Module): + + def __init__(self, args): + super(BasicMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0) + self.convc2 = nn.Conv2d(256, 192, 3, padding=1) + self.convf1 = nn.Conv2d(2, 128, 7, padding=3) + self.convf2 = nn.Conv2d(128, 64, 3, padding=1) + self.conv = nn.Conv2d(64 + 192, 128 - 2, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class SmallUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=96): + super(SmallUpdateBlock, self).__init__() + self.encoder = SmallMotionEncoder(args) + self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82 + 64) + self.flow_head = FlowHead(hidden_dim, hidden_dim=128) + + def forward(self, net, inp, corr, flow): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + return net, None, delta_flow + + +class BasicUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=128, input_dim=128): + super(BasicUpdateBlock, self).__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + self.gru = SepConvGRU( + hidden_dim=hidden_dim, input_dim=128 + hidden_dim) + self.flow_head = FlowHead(hidden_dim, hidden_dim=256) + + self.mask = nn.Sequential( + nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(256, 64 * 9, 1, 
padding=0)) + + def forward(self, net, inp, corr, flow, upsample=True): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + # scale mask to balence gradients + mask = .25 * self.mask(net) + return net, mask, delta_flow diff --git a/modelscope/models/cv/video_stabilization/DUT/Smoother.py b/modelscope/models/cv/video_stabilization/DUT/Smoother.py new file mode 100644 index 00000000..20687251 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/Smoother.py @@ -0,0 +1,101 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import math + +import numpy as np +import torch +import torch.nn as nn + +from modelscope.models.cv.video_stabilization.utils.IterativeSmooth import \ + generateSmooth + + +class Smoother(nn.Module): + + def __init__(self, inplanes=2, embeddingSize=64, hiddenSize=64, kernel=5): + super(Smoother, self).__init__() + self.embedding = nn.Sequential( + nn.Linear(inplanes, embeddingSize), nn.ReLU()) + self.pad = kernel // 2 + self.conv1 = nn.Conv3d( + embeddingSize, + embeddingSize, (kernel, 3, 3), + padding=(self.pad, 1, 1)) + self.conv3 = nn.Conv3d( + embeddingSize, + embeddingSize, (kernel, 3, 3), + padding=(self.pad, 1, 1)) + self.conv2 = nn.Conv3d( + embeddingSize, + embeddingSize, (kernel, 3, 3), + padding=(self.pad, 1, 1)) + self.decoder = nn.Linear(embeddingSize, 12, bias=True) + self.scale = nn.Linear(embeddingSize, 1, bias=True) + self.activation = nn.Sigmoid() + self.relu = nn.ReLU() + self.generateSmooth = generateSmooth + + def forward(self, trajectory): + ''' + @param trajectory: Unstable trajectory with shape [B, 2, T, H, W] + + @return kernel: dynamic smooth kernel with shape [B, 12, T, H, W] + ''' + + trajectory = trajectory.permute(0, 2, 3, 4, 1) + embedding_trajectory = self.embedding(trajectory).permute( + 0, 4, 1, 2, 3) + hidden = embedding_trajectory + hidden = self.relu(self.conv1(hidden)) + hidden = self.relu(self.conv3(hidden)) + hidden = self.relu(self.conv2(hidden)) + kernel = self.activation( + self.decoder(hidden.permute(0, 2, 3, 4, 1)).permute(0, 4, 1, 2, 3)) + kernel = self.scale(hidden.permute(0, 2, 3, 4, 1)).permute( + 0, 4, 1, 2, 3) * kernel + return kernel + + def inference(self, x_paths, y_paths, repeat=50): + ''' + @param x_paths: Unstable trajectory in x direction, [B, T, H, W] + @param y_paths: Unstable trajectory in y direction, [B, T, H, W] + @param repeat: iterations for smoother, int + + @return smooth_x: Smoothed trajectory in x direction, [B, T, H, W] + @return smooth_y: Smoothed trajectory in y direction, [B, T, H, W] + ''' + path = np.concatenate( + [np.expand_dims(x_paths, -1), + np.expand_dims(y_paths, -1)], -1) + + # regularization + min_v = np.min(path, keepdims=True) + path = path - min_v + max_v = np.max(path, keepdims=True) + 1e-5 + path = path / max_v + path = np.transpose(np.expand_dims(path, 0), (0, 4, 3, 1, 2)) + path_t = torch.from_numpy(path.astype(np.float32)).cuda() + + # get smooth kernel + kernel_t = self.forward(path_t) + + # iterative smooth + smooth_x, smooth_y = self.KernelSmooth(kernel_t, path_t, repeat) + + smooth_x = smooth_x.cpu().squeeze().permute(1, 2, + 0).numpy() * max_v + min_v + smooth_y = smooth_y.cpu().squeeze().permute(1, 2, + 0).numpy() * max_v + min_v + return smooth_x, smooth_y + + def KernelSmooth(self, kernel, path, repeat=20): + if kernel is None: + smooth_x = self.generateSmooth(path[:, 0:1, 
:, :, :], None, repeat) + smooth_y = self.generateSmooth(path[:, 1:2, :, :, :], None, repeat) + else: + smooth_x = self.generateSmooth(path[:, 0:1, :, :, :], + kernel[:, 0:6, :, :, :], repeat) + smooth_y = self.generateSmooth(path[:, 1:2, :, :, :], + kernel[:, 6:12, :, :, :], repeat) + return smooth_x, smooth_y diff --git a/modelscope/models/cv/video_stabilization/DUT/__init__.py b/modelscope/models/cv/video_stabilization/DUT/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_stabilization/DUT/config.py b/modelscope/models/cv/video_stabilization/DUT/config.py new file mode 100644 index 00000000..85c33bc3 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/config.py @@ -0,0 +1,86 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +from __future__ import absolute_import, division, print_function + +from easydict import EasyDict as edict + +__C = edict() +cfg = __C +""" +Model options +""" +__C.MODEL = edict() + +# gaussian kernel size +__C.MODEL.GAUSSIAN_KSIZE = 15 + +# gaussian kernel sigma +__C.MODEL.GAUSSIAN_SIGMA = 0.5 + +# Descriptor Threshold +__C.MODEL.DES_THRSH = 1.0 + +# Coordinate Threshold +__C.MODEL.COO_THRSH = 5.0 + +# Ksize +__C.MODEL.KSIZE = 3 + +# padding +__C.MODEL.padding = 1 + +# dilation +__C.MODEL.dilation = 1 + +# scale_list +__C.MODEL.scale_list = [3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0] + +# grid size +__C.MODEL.PIXELS = 16 + +# neighbor radius +__C.MODEL.RADIUS = 200 + +# input height +__C.MODEL.HEIGHT = 480 + +# input width +__C.MODEL.WIDTH = 640 + +# normalization amplitude in optical flow +__C.MODEL.FLOWC = 20 + +# cluster threshold +__C.MODEL.THRESHOLDPOINT = 102 +""" +Training options +""" +__C.TRAIN = edict() + +# score strength weight +__C.TRAIN.score_com_strength = 100.0 + +# scale strength weight +__C.TRAIN.scale_com_strength = 100.0 + +# non maximum supression threshold +__C.TRAIN.NMS_THRESH = 0.0 + +# nms kernel size +__C.TRAIN.NMS_KSIZE = 5 + +# top k patch +__C.TRAIN.TOPK = 512 +""" +Threshold options +""" +__C.Threshold = edict() + +__C.Threshold.MANG = 2 +__C.Threshold.ROT = 5 +""" +Infer options +""" +__C.INFER = edict() +__C.INFER.ALLINFER = False diff --git a/modelscope/models/cv/video_stabilization/DUT/rf_det_module.py b/modelscope/models/cv/video_stabilization/DUT/rf_det_module.py new file mode 100644 index 00000000..5451d8e3 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/rf_det_module.py @@ -0,0 +1,220 @@ +# @Time : 2018-9-27 15:39 +# @Author : xylon +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.video_stabilization.utils.image_utils import ( + filter_border, get_gauss_filter_weight, nms, topk_map) + + +class RFDetModule(nn.Module): + + def __init__( + self, + score_com_strength, + scale_com_strength, + nms_thresh, + nms_ksize, + topk, + gauss_ksize, + gauss_sigma, + ksize, + padding, + dilation, + scale_list, + ): + super(RFDetModule, self).__init__() + + self.score_com_strength = score_com_strength + self.scale_com_strength = scale_com_strength + self.NMS_THRESH = nms_thresh + self.NMS_KSIZE = nms_ksize + self.TOPK = topk + self.GAUSSIAN_KSIZE = gauss_ksize + self.GAUSSIAN_SIGMA = gauss_sigma + + self.conv1 = nn.Conv2d( + in_channels=1, + out_channels=16, + kernel_size=ksize, + stride=1, + 
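
Editor's note: config.py above keeps all hyper-parameters in an EasyDict, and several modules derive their tensor shapes from it, for example the mesh resolution used by MotionPro. A small sketch of how those derived sizes come out with the defaults shown in the patch (easydict is the same dependency the patch imports):

    from easydict import EasyDict as edict

    cfg = edict()
    cfg.MODEL = edict(HEIGHT=480, WIDTH=640, PIXELS=16, FLOWC=20)
    cfg.TRAIN = edict(TOPK=512)

    mesh_h = cfg.MODEL.HEIGHT // cfg.MODEL.PIXELS   # 30 mesh rows
    mesh_w = cfg.MODEL.WIDTH // cfg.MODEL.PIXELS    # 40 mesh columns
    print(mesh_h, mesh_w, cfg.TRAIN.TOPK)           # 30 40 512
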
padding=padding, + dilation=dilation, + ) # 3 RF + self.insnorm1 = nn.InstanceNorm2d(16, affine=True) + self.conv_s3 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s3 = nn.InstanceNorm2d(1, affine=True) + + self.conv2 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 5 RF + self.insnorm2 = nn.InstanceNorm2d(16, affine=True) + self.conv_s5 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s5 = nn.InstanceNorm2d(1, affine=True) + + self.conv3 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 7 RF + self.insnorm3 = nn.InstanceNorm2d(16, affine=True) + self.conv_s7 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s7 = nn.InstanceNorm2d(1, affine=True) + + self.conv4 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 9 RF + self.insnorm4 = nn.InstanceNorm2d(16, affine=True) + self.conv_s9 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s9 = nn.InstanceNorm2d(1, affine=True) + + self.conv5 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 11 RF + self.insnorm5 = nn.InstanceNorm2d(16, affine=True) + self.conv_s11 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s11 = nn.InstanceNorm2d(1, affine=True) + + self.conv6 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 13 RF + self.insnorm6 = nn.InstanceNorm2d(16, affine=True) + self.conv_s13 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s13 = nn.InstanceNorm2d(1, affine=True) + + self.conv7 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 15 RF + self.insnorm7 = nn.InstanceNorm2d(16, affine=True) + self.conv_s15 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s15 = nn.InstanceNorm2d(1, affine=True) + + self.conv8 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 17 RF + self.insnorm8 = nn.InstanceNorm2d(16, affine=True) + self.conv_s17 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s17 = nn.InstanceNorm2d(1, affine=True) + + self.conv9 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 19 RF + self.insnorm9 = nn.InstanceNorm2d(16, affine=True) + self.conv_s19 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s19 = nn.InstanceNorm2d(1, affine=True) + + self.conv10 = nn.Conv2d( + in_channels=16, + out_channels=16, + kernel_size=ksize, + stride=1, + padding=padding, + dilation=dilation, + ) # 21 RF + self.insnorm10 = nn.InstanceNorm2d(16, affine=True) + self.conv_s21 = nn.Conv2d( + in_channels=16, out_channels=1, kernel_size=1, stride=1, padding=0) + self.insnorm_s21 = nn.InstanceNorm2d(1, affine=True) + + self.scale_list = torch.tensor(scale_list) + + def forward(self, **kwargs): + pass + + def 
process(self, im1w_score): + """ + nms(n), topk(t), gaussian kernel(g) operation + :param im1w_score: warped score map + :return: processed score map, topk mask, topk value + """ + im1w_score = filter_border(im1w_score) + + # apply nms to im1w_score + nms_mask = nms( + im1w_score, thresh=self.NMS_THRESH, ksize=self.NMS_KSIZE) + im1w_score = im1w_score * nms_mask + topk_value = im1w_score + + # apply topk to im1w_score + topk_mask = topk_map(im1w_score, self.TOPK) + im1w_score = topk_mask.to(torch.float) * im1w_score + + # apply gaussian kernel to im1w_score + psf = get_gauss_filter_weight( + self.GAUSSIAN_KSIZE, + self.GAUSSIAN_SIGMA)[None, None, :, :].clone().detach().to( + im1w_score.device) + im1w_score = F.conv2d( + input=im1w_score.permute(0, 3, 1, 2), + weight=psf, + stride=1, + padding=self.GAUSSIAN_KSIZE // 2, + ).permute(0, 2, 3, 1) # (B, H, W, 1) + """ + apply tf.clamp to make sure all value in im1w_score isn't greater than 1 + but this won't happend in correct way + """ + im1w_score = im1w_score.clamp(min=0.0, max=1.0) + + return im1w_score, topk_mask, topk_value + + @staticmethod + def weights_init(m): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_( + m.weight.data, gain=nn.init.calculate_gain('leaky_relu')) + try: + nn.init.xavier_uniform_(m.bias.data) + except Exception: + pass diff --git a/modelscope/models/cv/video_stabilization/DUT/rf_det_so.py b/modelscope/models/cv/video_stabilization/DUT/rf_det_so.py new file mode 100644 index 00000000..73e15749 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUT/rf_det_so.py @@ -0,0 +1,212 @@ +# @Time : 2018-9-13 16:03 +# @Author : xylon +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.video_stabilization.utils.image_utils import ( + soft_max_and_argmax_1d, soft_nms_3d) +from modelscope.models.cv.video_stabilization.utils.math_utils import L2Norm +from .rf_det_module import RFDetModule + + +class RFDetSO(RFDetModule): + + def __init__( + self, + score_com_strength, + scale_com_strength, + nms_thresh, + nms_ksize, + topk, + gauss_ksize, + gauss_sigma, + ksize, + padding, + dilation, + scale_list, + ): + super(RFDetSO, self).__init__( + score_com_strength, + scale_com_strength, + nms_thresh, + nms_ksize, + topk, + gauss_ksize, + gauss_sigma, + ksize, + padding, + dilation, + scale_list, + ) + + self.conv_o3 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o5 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o7 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o9 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o11 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o13 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o15 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o17 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o19 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + self.conv_o21 = nn.Conv2d( + in_channels=16, out_channels=2, kernel_size=1, stride=1, padding=0) + + def forward(self, photos): + + # Extract score map in scale space from 3 
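
Editor's note: process() above keeps the top-k responses and spreads each surviving peak with a fixed Gaussian kernel before clamping the score map to [0, 1]. A self-contained sketch of that blurring step; the explicit Gaussian construction below is only a stand-in for the repo's get_gauss_filter_weight helper, whose exact normalization is not shown in this patch:

    import torch
    import torch.nn.functional as F

    ksize, sigma = 15, 0.5                   # cfg.MODEL.GAUSSIAN_KSIZE / GAUSSIAN_SIGMA
    ax = torch.arange(ksize, dtype=torch.float32) - ksize // 2
    gauss_1d = torch.exp(-(ax ** 2) / (2 * sigma ** 2))
    psf = (gauss_1d[:, None] * gauss_1d[None, :])[None, None]   # [1, 1, k, k]

    score = torch.zeros(1, 64, 64, 1)        # (B, H, W, 1) layout, as in process()
    score[0, 32, 32, 0] = 1.0                # one surviving keypoint
    blurred = F.conv2d(score.permute(0, 3, 1, 2), psf, stride=1,
                       padding=ksize // 2).permute(0, 2, 3, 1)
    blurred = blurred.clamp(min=0.0, max=1.0)
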
to 21 + score_featmaps_s3 = F.leaky_relu(self.insnorm1(self.conv1(photos))) + score_map_s3 = self.insnorm_s3( + self.conv_s3(score_featmaps_s3)).permute(0, 2, 3, 1) + orint_map_s3 = ( + L2Norm(self.conv_o3(score_featmaps_s3), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + + score_featmaps_s5 = F.leaky_relu( + self.insnorm2(self.conv2(score_featmaps_s3))) + score_map_s5 = self.insnorm_s5( + self.conv_s5(score_featmaps_s5)).permute(0, 2, 3, 1) + orint_map_s5 = ( + L2Norm(self.conv_o5(score_featmaps_s5), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s5 = score_featmaps_s5 + score_featmaps_s3 + + score_featmaps_s7 = F.leaky_relu( + self.insnorm3(self.conv3(score_featmaps_s5))) + score_map_s7 = self.insnorm_s7( + self.conv_s7(score_featmaps_s7)).permute(0, 2, 3, 1) + orint_map_s7 = ( + L2Norm(self.conv_o7(score_featmaps_s7), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s7 = score_featmaps_s7 + score_featmaps_s5 + + score_featmaps_s9 = F.leaky_relu( + self.insnorm4(self.conv4(score_featmaps_s7))) + score_map_s9 = self.insnorm_s9( + self.conv_s9(score_featmaps_s9)).permute(0, 2, 3, 1) + orint_map_s9 = ( + L2Norm(self.conv_o9(score_featmaps_s9), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s9 = score_featmaps_s9 + score_featmaps_s7 + + score_featmaps_s11 = F.leaky_relu( + self.insnorm5(self.conv5(score_featmaps_s9))) + score_map_s11 = self.insnorm_s11( + self.conv_s11(score_featmaps_s11)).permute(0, 2, 3, 1) + orint_map_s11 = ( + L2Norm(self.conv_o11(score_featmaps_s11), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s11 = score_featmaps_s11 + score_featmaps_s9 + + score_featmaps_s13 = F.leaky_relu( + self.insnorm6(self.conv6(score_featmaps_s11))) + score_map_s13 = self.insnorm_s13( + self.conv_s13(score_featmaps_s13)).permute(0, 2, 3, 1) + orint_map_s13 = ( + L2Norm(self.conv_o13(score_featmaps_s13), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s13 = score_featmaps_s13 + score_featmaps_s11 + + score_featmaps_s15 = F.leaky_relu( + self.insnorm7(self.conv7(score_featmaps_s13))) + score_map_s15 = self.insnorm_s15( + self.conv_s15(score_featmaps_s15)).permute(0, 2, 3, 1) + orint_map_s15 = ( + L2Norm(self.conv_o15(score_featmaps_s15), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s15 = score_featmaps_s15 + score_featmaps_s13 + + score_featmaps_s17 = F.leaky_relu( + self.insnorm8(self.conv8(score_featmaps_s15))) + score_map_s17 = self.insnorm_s17( + self.conv_s17(score_featmaps_s17)).permute(0, 2, 3, 1) + orint_map_s17 = ( + L2Norm(self.conv_o17(score_featmaps_s17), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s17 = score_featmaps_s17 + score_featmaps_s15 + + score_featmaps_s19 = F.leaky_relu( + self.insnorm9(self.conv9(score_featmaps_s17))) + score_map_s19 = self.insnorm_s19( + self.conv_s19(score_featmaps_s19)).permute(0, 2, 3, 1) + orint_map_s19 = ( + L2Norm(self.conv_o19(score_featmaps_s19), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + score_featmaps_s19 = score_featmaps_s19 + score_featmaps_s17 + + score_featmaps_s21 = F.leaky_relu( + self.insnorm10(self.conv10(score_featmaps_s19))) + score_map_s21 = self.insnorm_s21( + self.conv_s21(score_featmaps_s21)).permute(0, 2, 3, 1) + orint_map_s21 = ( + L2Norm(self.conv_o21(score_featmaps_s21), + dim=1).permute(0, 2, 3, 1).unsqueeze(-2)) + + score_maps = torch.cat( + ( + score_map_s3, + score_map_s5, + score_map_s7, + score_map_s9, + score_map_s11, + score_map_s13, + score_map_s15, + score_map_s17, + score_map_s19, + score_map_s21, + ), + -1, + ) # 
(B, H, W, C) + + orint_maps = torch.cat( + ( + orint_map_s3, + orint_map_s5, + orint_map_s7, + orint_map_s9, + orint_map_s11, + orint_map_s13, + orint_map_s15, + orint_map_s17, + orint_map_s19, + orint_map_s21, + ), + -2, + ) # (B, H, W, 10, 2) + + # get each pixel probability in all scale + scale_probs = soft_nms_3d(score_maps, ksize=15, com_strength=3.0) + + # get each pixel probability summary from all scale space and correspond scale value + score_map, scale_map, orint_map = soft_max_and_argmax_1d( + input=scale_probs, + orint_maps=orint_maps, + dim=-1, + scale_list=self.scale_list, + keepdim=True, + com_strength1=self.score_com_strength, + com_strength2=self.scale_com_strength, + ) + + return score_map, scale_map, orint_map + + @staticmethod + def convO_init(m): + if isinstance(m, nn.Conv2d): + nn.init.zeros_(m.weight.data) + try: + nn.init.ones_(m.bias.data) + except Exception: + pass diff --git a/modelscope/models/cv/video_stabilization/DUTRAFTStabilizer.py b/modelscope/models/cv/video_stabilization/DUTRAFTStabilizer.py new file mode 100644 index 00000000..1b7fe0a7 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/DUTRAFTStabilizer.py @@ -0,0 +1,94 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import math +import os +import sys +import tempfile +from typing import Any, Dict, Optional, Union + +import cv2 +import numpy as np +import torch +import torch.nn as nn + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.video_stabilization.DUT.config import cfg +from modelscope.models.cv.video_stabilization.DUT.DUT_raft import DUT +from modelscope.preprocessors.cv import VideoReader, stabilization_preprocessor +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +__all__ = ['DUTRAFTStabilizer'] + + +@MODELS.register_module( + Tasks.video_stabilization, module_name=Models.video_stabilization) +class DUTRAFTStabilizer(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the video stabilization model from the `model_dir` path. + Args: + model_dir (str): the model path. 
+ """ + super().__init__(model_dir, *args, **kwargs) + self.model_dir = model_dir + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + SmootherPath = os.path.join(self.model_dir, + self.config.modelsetting.SmootherPath) + RFDetPath = os.path.join(self.model_dir, + self.config.modelsetting.RFDetPath) + RAFTPath = os.path.join(self.model_dir, + self.config.modelsetting.RAFTPath) + MotionProPath = os.path.join(self.model_dir, + self.config.modelsetting.MotionProPath) + homo = self.config.modelsetting.homo + args = self.config.modelsetting.args + self.base_crop_width = self.config.modelsetting.base_crop_width + + self.net = DUT( + SmootherPath=SmootherPath, + RFDetPath=RFDetPath, + RAFTPath=RAFTPath, + MotionProPath=MotionProPath, + homo=homo, + args=args) + + self.net.cuda() + self.net.eval() + + def _inference_forward(self, input: str) -> Dict[str, Any]: + data = stabilization_preprocessor(input, cfg) + with torch.no_grad(): + origin_motion, smooth_path = self.net.inference( + data['x'].cuda(), data['x_rgb'].cuda(), repeat=50) + + origin_motion = origin_motion.cpu().numpy() + smooth_path = smooth_path.cpu().numpy() + origin_motion = np.transpose(origin_motion[0], (2, 3, 1, 0)) + smooth_path = np.transpose(smooth_path[0], (2, 3, 1, 0)) + + return { + 'origin_motion': origin_motion, + 'smooth_path': smooth_path, + 'ori_images': data['ori_images'], + 'fps': data['fps'], + 'width': data['width'], + 'height': data['height'], + 'base_crop_width': self.base_crop_width + } + + def forward(self, inputs: Dict[str, str]) -> Dict[str, Any]: + """return the result by the model + Args: + inputs (str): the input video path + Returns: + Dict[str, str]: results + """ + return self._inference_forward(inputs['input'][0]) diff --git a/modelscope/models/cv/video_stabilization/__init__.py b/modelscope/models/cv/video_stabilization/__init__.py new file mode 100644 index 00000000..81f18ccd --- /dev/null +++ b/modelscope/models/cv/video_stabilization/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
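
Editor's note: DUTRAFTStabilizer.forward above expects the video path wrapped as inputs['input'][0] and returns the accumulated and smoothed vertex trajectories together with the decoded frames and metadata. A hedged usage sketch, assuming a local model_dir that holds the configuration file and the four checkpoints it references; the directory and video paths are placeholders:

    from modelscope.models.cv.video_stabilization import DUTRAFTStabilizer

    # requires a CUDA device; the model moves itself to GPU in __init__
    model = DUTRAFTStabilizer('/path/to/dut_raft_model_dir')      # hypothetical local dir
    outputs = model.forward({'input': ['shaky_clip.mp4']})        # hypothetical video file

    # keys produced by _inference_forward
    print(outputs['origin_motion'].shape, outputs['smooth_path'].shape,
          outputs['fps'], outputs['width'], outputs['height'])
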
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .DUTRAFTStabilizer import DUTRAFTStabilizer + +else: + _import_structure = {'DUTRAFTStabilizer': ['DUTRAFTStabilizer']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_stabilization/utils/IterativeSmooth.py b/modelscope/models/cv/video_stabilization/utils/IterativeSmooth.py new file mode 100644 index 00000000..3f70aa3b --- /dev/null +++ b/modelscope/models/cv/video_stabilization/utils/IterativeSmooth.py @@ -0,0 +1,116 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import math +import os + +import numpy as np +import torch +import torch.nn as nn + + +def gauss(t, r=0, window_size=3): + """ + @param window_size is the size of window over which gaussian to be applied + @param t is the index of current point + @param r is the index of point in window + + @return guassian weights over a window size + """ + if np.abs(r - t) > window_size: + return 0 + else: + return np.exp((-9 * (r - t)**2) / window_size**2) + + +def generateSmooth(originPath, kernel=None, repeat=20): + # B, 1, T, H, W; B, 6, T, H, W + smooth = originPath + + temp_smooth_3 = originPath[:, :, 3:-3, :, :] + + kernel = kernel + + if kernel is None: + kernel = torch.Tensor([gauss(i) + for i in range(-3, 4)]).to(originPath.device) + kernel = torch.cat([kernel[:3], kernel[4:]]) + kernel = kernel.unsqueeze(0).unsqueeze(2).unsqueeze(3).unsqueeze(4) + kernel = kernel.repeat(*originPath.shape) + + abskernel = torch.abs(kernel) + lambda_t = 100 + + for _ in range(repeat): + # import ipdb; ipdb.set_trace() + temp_smooth = torch.zeros_like(smooth, device=smooth.device) + temp_smooth_0 = smooth[:, :, 0:-6, :, :] * \ + kernel[:, 0:1, 3:-3, :, :] * lambda_t + temp_smooth_1 = smooth[:, :, 1:-5, :, :] * \ + kernel[:, 1:2, 3:-3, :, :] * lambda_t + temp_smooth_2 = smooth[:, :, 2:-4, :, :] * \ + kernel[:, 2:3, 3:-3, :, :] * lambda_t + + temp_smooth_4 = smooth[:, :, 4:-2, :, :] * \ + kernel[:, 3:4, 3:-3, :, :] * lambda_t + temp_smooth_5 = smooth[:, :, 5:-1, :, :] * \ + kernel[:, 4:5, 3:-3, :, :] * lambda_t + temp_smooth_6 = smooth[:, :, 6:, :, :] * \ + kernel[:, 5:6, 3:-3, :, :] * lambda_t + + temp_value_01 = ( + 1 + lambda_t + * torch.sum(abskernel[:, :, 3:-3, :, :], dim=1, keepdim=True)) + temp_smooth[:, :, 3:-3, :, :] = ( + (temp_smooth_0 + temp_smooth_1 + temp_smooth_2 + temp_smooth_3 + + temp_smooth_4 + temp_smooth_5 + temp_smooth_6) / temp_value_01) + + # 0 + temp = smooth[:, :, 1:4, :, :] + temp_smooth[:, :, 0, :, :] = ( + torch.sum(kernel[:, 3:, 0, :, :].unsqueeze(1) * temp, 2) * lambda_t + + originPath[:, :, 0, :, :]) / (1 + lambda_t * torch.sum( + abskernel[:, 3:, 0, :, :].unsqueeze(1), 2)) + # 1 + temp = torch.cat([smooth[:, :, :1, :, :], smooth[:, :, 2:5, :, :]], 2) + temp_smooth[:, :, 1, :, :] = ( + torch.sum(kernel[:, 2:, 1, :, :].unsqueeze(1) * temp, 2) * lambda_t + + originPath[:, :, 1, :, :]) / (1 + lambda_t * torch.sum( + abskernel[:, 2:, 1, :, :].unsqueeze(1), 2)) + # 2 + temp = torch.cat([smooth[:, :, :2, :, :], smooth[:, :, 3:6, :, :]], 2) + temp_smooth[:, :, 2, :, :] = ( + torch.sum(kernel[:, 1:, 2, :, :].unsqueeze(1) * temp, 2) * lambda_t + + originPath[:, :, 2, :, :]) / (1 + lambda_t * torch.sum( + abskernel[:, 1:, 2, :, :].unsqueeze(1), 2)) + # -1 + temp = 
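
Editor's note: generateSmooth above falls back to a fixed 6-tap Gaussian kernel when the Smoother does not supply one: it evaluates gauss() at offsets -3..3 and drops the centre tap before broadcasting it over the trajectory. The same construction in isolation, matching the gauss definition in the patch:

    import numpy as np
    import torch

    def gauss(t, r=0, window_size=3):
        # zero outside the window, exp(-9 (r - t)^2 / window_size^2) inside
        if np.abs(r - t) > window_size:
            return 0
        return np.exp((-9 * (r - t) ** 2) / window_size ** 2)

    kernel = torch.tensor([gauss(i) for i in range(-3, 4)])   # 7 taps around the centre
    kernel = torch.cat([kernel[:3], kernel[4:]])              # drop the centre tap -> 6 taps
    print(kernel)   # approx [1.2e-04, 1.8e-02, 3.7e-01, 3.7e-01, 1.8e-02, 1.2e-04]
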
smooth[:, :, -4:-1] + temp_value_11 = torch.sum(kernel[:, :3, -1, :, :].unsqueeze(1) * temp, + 2) + temp_value_08 = (temp_value_11 * lambda_t + originPath[:, :, -1, :, :]) + temp_value_10 = torch.sum(abskernel[:, :3, -1, :, :].unsqueeze(1), 2) + temp_value_09 = (1 + lambda_t * temp_value_10) + temp_smooth[:, :, -1, :, :] = temp_value_08 / temp_value_09 + # -2 + temp = torch.cat([smooth[:, :, -5:-2, :, :], smooth[:, :, -1:, :, :]], + 2) + temp_value_07 = torch.sum(kernel[:, :4, -2, :, :].unsqueeze(1) * temp, + 2) + temp_value_04 = (temp_value_07 * lambda_t + originPath[:, :, -2, :, :]) + temp_value_06 = torch.sum(abskernel[:, :4, -2, :, :].unsqueeze(1), 2) + temp_value_05 = (1 + lambda_t * temp_value_06) + temp_smooth[:, :, -2, :, :] = temp_value_04 / temp_value_05 + # -3 + temp = torch.cat([smooth[:, :, -6:-3, :, :], smooth[:, :, -2:, :, :]], + 2) + temp_value_02 = ( + torch.sum(kernel[:, :5, -3, :, :].unsqueeze(1) * temp, 2) + * lambda_t + originPath[:, :, -3, :, :]) + temp_value_03 = ( + 1 + + lambda_t * torch.sum(abskernel[:, :5, -3, :, :].unsqueeze(1), 2)) + temp_smooth[:, :, -3, :, :] = temp_value_02 / temp_value_03 + + smooth = temp_smooth + + return smooth diff --git a/modelscope/models/cv/video_stabilization/utils/MedianFilter.py b/modelscope/models/cv/video_stabilization/utils/MedianFilter.py new file mode 100644 index 00000000..5b18ff6d --- /dev/null +++ b/modelscope/models/cv/video_stabilization/utils/MedianFilter.py @@ -0,0 +1,343 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import math + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.utils import _pair, _quadruple + +from modelscope.models.cv.video_stabilization.utils.ProjectionUtils import ( + HomoCalc, HomoProj, MotionDistanceMeasure) +from ..DUT.config import cfg + + +class MedianPool2d(nn.Module): + """ Median pool (usable as median filter when stride=1) module. 
+ + Args: + kernel_size: size of pooling kernel, int or 2-tuple + stride: pool stride, int or 2-tuple + padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad + same: override padding and enforce same padding, boolean + """ + + def __init__(self, kernel_size=3, stride=1, padding=0, same=False): + super(MedianPool2d, self).__init__() + self.k = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _quadruple(padding) # convert to l, r, t, b + self.same = same + + def _padding(self, x): + if self.same: + ih, iw = x.size()[2:] + if ih % self.stride[0] == 0: + ph = max(self.k[0] - self.stride[0], 0) + else: + ph = max(self.k[0] - (ih % self.stride[0]), 0) + if iw % self.stride[1] == 0: + pw = max(self.k[1] - self.stride[1], 0) + else: + pw = max(self.k[1] - (iw % self.stride[1]), 0) + pl = pw // 2 + pr = pw - pl + pt = ph // 2 + pb = ph - pt + padding = (pl, pr, pt, pb) + else: + padding = self.padding + return padding + + def forward(self, x): + # using existing pytorch functions and tensor ops so that we get autograd, + # would likely be more efficient to implement from scratch at C/Cuda level + x = F.pad(x, self._padding(x), mode='reflect') + x = x.unfold(2, self.k[0], + self.stride[0]).unfold(3, self.k[1], self.stride[1]) + x = x.contiguous().view(x.size()[:4] + (-1, )).median(dim=-1)[0] + return x + + +def SingleMotionPropagate(x_flow, y_flow, pts): + """ + Traditional median filter for motion propagation + @param: x_flow [B, 1, H, W] + @param: y_flow [B, 1, H, W] + @param: pts [B*topk, 4] + """ + + pts = pts.float() + + medfilt = MedianPool2d(same=True) + + _, _, H, W = x_flow.shape + grids = torch.stack(torch.meshgrid(torch.arange(W), torch.arange(H)), + 0).to(x_flow.device).permute(0, 2, + 1) # 2, W, H --> 2, H, W + grids = grids.unsqueeze(0) # 1, 2, H, W + grids = grids.float() + new_points = grids + torch.cat([x_flow, y_flow], 1) # B, 2, H, W + new_points_S = new_points.clone() + new_points = new_points[0, :, pts[:, 2].long(), + pts[:, 3].long()].permute(1, 0) # B*topK, 2 + old_points = grids[0, :, pts[:, 2].long(), + pts[:, 3].long()].permute(1, 0) # B*topK, 2 + + old_points_numpy = old_points.detach().cpu().numpy() + new_points_numpy = new_points.detach().cpu().numpy() + + # pre-warping with global homography + Homo, state = cv2.findHomography(old_points_numpy, new_points_numpy, + cv2.RANSAC) + + if Homo is None: + Homo = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) + + Homo = torch.from_numpy(Homo.astype(np.float32)).to(old_points.device) + + meshes_x, meshes_y = torch.meshgrid( + torch.arange(0, W, cfg.MODEL.PIXELS), + torch.arange(0, H, cfg.MODEL.PIXELS)) + meshes_x = meshes_x.float().permute(1, 0) + meshes_y = meshes_y.float().permute(1, 0) + meshes_x = meshes_x.to(old_points.device) + meshes_y = meshes_y.to(old_points.device) + meshes_z = torch.ones_like(meshes_x).to(old_points.device) + meshes = torch.stack([meshes_x, meshes_y, meshes_z], 0) + + meshes_projected = torch.mm(Homo, meshes.view(3, -1)).view(*meshes.shape) + x_motions = meshes[0, :, :] - meshes_projected[0, :, :] / ( + meshes_projected[2, :, :] + 1e-5) + y_motions = meshes[1, :, :] - meshes_projected[1, :, :] / ( + meshes_projected[2, :, :] + 1e-5) + + temp_x_motion = torch.zeros_like(x_motions) + temp_y_motion = torch.zeros_like(x_motions) + + for i in range(x_motions.shape[0]): + for j in range(x_motions.shape[1]): + distance = torch.sqrt((pts[:, 2] - i * cfg.MODEL.PIXELS)**2 + + (pts[:, 3] - j * cfg.MODEL.PIXELS)**2) + distance = distance < cfg.MODEL.RADIUS + index = 
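
Editor's note: SingleMotionPropagate above first fits one global homography to the tracked keypoints with RANSAC and projects every mesh vertex through it; per-vertex residuals are then filled in from nearby keypoints and median-filtered. A minimal sketch of the homography fit and mesh projection (the point arrays are synthetic):

    import cv2
    import numpy as np

    old_pts = np.random.rand(100, 2).astype(np.float32) * [640, 480]
    new_pts = old_pts + np.random.randn(100, 2)          # small random motion

    H, inliers = cv2.findHomography(old_pts, new_pts, cv2.RANSAC)
    if H is None:                                        # degenerate case, same fallback as the patch
        H = np.eye(3)

    # project the 30 x 40 grid of mesh vertices (spacing cfg.MODEL.PIXELS = 16)
    xs, ys = np.meshgrid(np.arange(0, 640, 16), np.arange(0, 480, 16))
    mesh = np.stack([xs.ravel(), ys.ravel(), np.ones_like(xs).ravel()], 0)
    proj = H @ mesh
    proj = proj[:2] / (proj[2] + 1e-5)                   # projected vertex positions, [2, 1200]
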
distance.nonzero( + ) # the indexes whose distances are smaller than RADIUS + if index.shape[0] == 0: + continue + old_points_median = pts[index[:, 0].long(), :] # N', 4(B, C, H, W) + dominator = old_points_median[:, 3:4] * Homo[2, 0] + \ + old_points_median[:, 2:3] * Homo[2, 1] + \ + Homo[2, 2] + 1e-5 # N', 1 + x_nominator = old_points_median[:, 3:4] * Homo[0, 0] + \ + old_points_median[:, 2:3] * Homo[0, 1] + Homo[0, 2] + y_nominator = old_points_median[:, 3:4] * Homo[1, 0] + \ + old_points_median[:, 2:3] * Homo[1, 1] + Homo[1, 2] + new_points_homo = torch.cat( + [x_nominator / dominator, y_nominator / dominator], + -1) # N', 2 + new_points_flow = new_points_S[ + 0, :, old_points_median[:, 2].long(), + old_points_median[:, 3].long()].permute(1, 0) # N', 2 + temp_motion = new_points_flow - new_points_homo + temp_x_motion[i, j] = temp_motion[:, 0].median() + temp_y_motion[i, j] = temp_motion[:, 1].median() + + x_motions = x_motions + temp_x_motion + y_motions = y_motions + temp_y_motion + + # apply second median filter (f-2) over the motion mesh for outliers + x_motion_mesh = medfilt(x_motions.unsqueeze(0).unsqueeze(0)) + y_motion_mesh = medfilt(y_motions.unsqueeze(0).unsqueeze(0)) + + return torch.cat([x_motion_mesh, y_motion_mesh], 1) + + +def MultiMotionPropagate(x_flow, y_flow, pts): + """ + Median filter for propagation with multi homography + @param: x_flow B, 1, H, W + @param: y_flow B, 1, H, W + @param: pts B*topk, 4 + """ + + medfilt = MedianPool2d(same=True) + + # spreads motion over the mesh for the old_frame + from sklearn.cluster import KMeans + pts = pts.float() + + B, C, H, W = x_flow.shape + grids = torch.stack(torch.meshgrid(torch.arange(W), torch.arange(H)), + 0).to(x_flow.device).permute(0, 2, + 1) # 2, W, H --> 2, H, W + grids = grids.unsqueeze(0) # 1, 2, H, W + grids = grids.float() + new_points = grids + torch.cat([x_flow, y_flow], 1) # B, 2, H, W + new_points = new_points[0, :, pts[:, 2].long(), + pts[:, 3].long()].permute(1, 0) # B*topK, 2 + old_points = grids[0, :, pts[:, 2].long(), + pts[:, 3].long()].permute(1, 0) # B*topK, 2 + + old_points_numpy = old_points.detach().cpu().numpy() + new_points_numpy = new_points.detach().cpu().numpy() + motion_numpy = new_points_numpy - old_points_numpy + pred_Y = KMeans(n_clusters=2, random_state=2).fit_predict(motion_numpy) + if np.sum(pred_Y) > cfg.TRAIN.TOPK / 2: + pred_Y = 1 - pred_Y + cluster1_old_points = old_points_numpy[(pred_Y == 0).nonzero()[0], :] + cluster1_new_points = new_points_numpy[(pred_Y == 0).nonzero()[0], :] + + # pre-warping with global homography + Homo, _ = cv2.findHomography(cluster1_old_points, cluster1_new_points, + cv2.RANSAC) + + if Homo is None: + Homo = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) + + dominator = ( + Homo[2, 0] * old_points_numpy[:, 0] + + Homo[2, 1] * old_points_numpy[:, 1] + Homo[2, 2]) + new_points_projected = np.stack( + [(Homo[0, 0] * old_points_numpy[:, 0] + + Homo[0, 1] * old_points_numpy[:, 1] + Homo[0, 2]) / dominator, + (Homo[1, 0] * old_points_numpy[:, 0] + + Homo[1, 1] * old_points_numpy[:, 1] + Homo[1, 2]) / dominator], 1) + + index = (pred_Y == 1).nonzero()[0] + attribute = np.zeros_like(new_points_numpy[:, 0:1]) # N', 1 + old_points_numpy_chosen = old_points_numpy[index, :] + new_points_numpy_chosen = new_points_numpy[index, :] + + cluster1_motion = cluster1_new_points - cluster1_old_points + clsuter2_motion = new_points_numpy_chosen - old_points_numpy_chosen + cluster1_meanMotion = np.mean(cluster1_motion, 0) + cluster2_meanMotion = np.mean(clsuter2_motion, 
0) + distanceMeasure = MotionDistanceMeasure(cluster1_meanMotion, + cluster2_meanMotion) + + if np.sum(pred_Y) > cfg.MODEL.THRESHOLDPOINT and distanceMeasure: + + attribute[index, :] = np.expand_dims(np.ones_like(index), 1) + + Homo_2, _ = cv2.findHomography(old_points_numpy_chosen, + new_points_numpy_chosen, cv2.RANSAC) + if Homo_2 is None: + Homo_2 = Homo + + meshes_x, meshes_y = np.meshgrid( + np.arange(0, W, cfg.MODEL.PIXELS), + np.arange(0, H, cfg.MODEL.PIXELS)) + + x_dominator = Homo[0, 0] * meshes_x + \ + Homo[0, 1] * meshes_y + Homo[0, 2] + y_dominator = Homo[1, 0] * meshes_x + \ + Homo[1, 1] * meshes_y + Homo[1, 2] + noiminator = Homo[2, 0] * meshes_x + Homo[2, 1] * meshes_y + Homo[2, 2] + + projected_1 = np.reshape( + np.stack([x_dominator / noiminator, y_dominator / noiminator], 2), + (-1, 2)) + + x_dominator = Homo_2[0, 0] * meshes_x + \ + Homo_2[0, 1] * meshes_y + Homo_2[0, 2] + y_dominator = Homo_2[1, 0] * meshes_x + \ + Homo_2[1, 1] * meshes_y + Homo_2[1, 2] + noiminator = Homo_2[2, 0] * meshes_x + \ + Homo_2[2, 1] * meshes_y + Homo_2[2, 2] + + projected_2 = np.reshape( + np.stack([x_dominator / noiminator, y_dominator / noiminator], 2), + (-1, 2)) + + distance_x = np.expand_dims(new_points_numpy[:, 0], 0) - np.reshape( + meshes_x, (-1, 1)) + distance_y = np.expand_dims(new_points_numpy[:, 1], 0) - np.reshape( + meshes_y, (-1, 1)) + distance = distance_x**2 + distance_y**2 # N, N' + distance_mask = (distance < (cfg.MODEL.RADIUS**2)) # N, N' + distance_mask_value = (distance_mask.astype(np.float32) + * attribute.transpose(1, 0)) # N, N' + distance = np.sum(distance_mask_value, 1) / \ + (np.sum(distance_mask, 1) + 1e-9) # N + + project_pos = np.reshape( + np.expand_dims(distance, 1) * projected_2 + np.expand_dims( + (1 - distance), 1) * projected_1, + (cfg.MODEL.HEIGHT // cfg.MODEL.PIXELS, + cfg.MODEL.WIDTH // cfg.MODEL.PIXELS, 2)) + + meshes_projected = torch.from_numpy(project_pos.astype(np.float32)).to( + new_points.device).permute(2, 0, 1) + + meshes_x, meshes_y = torch.meshgrid( + torch.arange(0, W, cfg.MODEL.PIXELS), + torch.arange(0, H, cfg.MODEL.PIXELS)) + meshes_x = meshes_x.float().permute(1, 0) + meshes_y = meshes_y.float().permute(1, 0) + meshes_x = meshes_x.to(old_points.device) + meshes_y = meshes_y.to(old_points.device) + meshes = torch.stack([meshes_x, meshes_y], 0) + + x_motions = meshes[0, :, :] - meshes_projected[0, :, :] + y_motions = meshes[1, :, :] - meshes_projected[1, :, :] + + homo_cal = HomoCalc(meshes, meshes_projected) + project_pts = HomoProj(homo_cal, old_points) + new_points_projected = project_pts + + Homo = torch.from_numpy(Homo.astype(np.float32)).to(old_points.device) + + else: + + Homo = torch.from_numpy(Homo.astype(np.float32)).to(old_points.device) + meshes_x, meshes_y = torch.meshgrid( + torch.arange(0, W, cfg.MODEL.PIXELS), + torch.arange(0, H, cfg.MODEL.PIXELS)) + meshes_x = meshes_x.float().permute(1, 0) + meshes_y = meshes_y.float().permute(1, 0) + meshes_x = meshes_x.to(old_points.device) + meshes_y = meshes_y.to(old_points.device) + meshes_z = torch.ones_like(meshes_x).to(old_points.device) + meshes = torch.stack([meshes_x, meshes_y, meshes_z], 0) + + meshes_projected = torch.mm(Homo, meshes.view(3, + -1)).view(*meshes.shape) + + x_motions = meshes[0, :, :] - meshes_projected[0, :, :] / ( + meshes_projected[2, :, :]) + y_motions = meshes[1, :, :] - meshes_projected[1, :, :] / ( + meshes_projected[2, :, :]) + new_points_projected = torch.from_numpy(new_points_projected).to( + old_points.device) + + temp_x_motion = 
torch.zeros_like(x_motions) + temp_y_motion = torch.zeros_like(x_motions) + + for i in range(x_motions.shape[0]): + for j in range(x_motions.shape[1]): + distance = torch.sqrt((old_points[:, 0] - i * cfg.MODEL.PIXELS)**2 + + (old_points[:, 1] + - j * cfg.MODEL.PIXELS)**2) + distance = distance < cfg.MODEL.RADIUS # B * topK + index = distance.nonzero() + if index.shape[0] == 0: + continue + + new_points_homo = new_points_projected[index[:, 0].long(), :] + + new_points_flow = new_points[index[:, 0].long(), :] + temp_motion = -(new_points_homo - new_points_flow) + temp_x_motion[i, j] = temp_motion[:, 0].median() + temp_y_motion[i, j] = temp_motion[:, 1].median() + + x_motions = x_motions + temp_x_motion + y_motions = y_motions + temp_y_motion + + # apply second median filter (f-2) over the motion mesh for outliers + x_motion_mesh = medfilt(x_motions.unsqueeze(0).unsqueeze(0)) + y_motion_mesh = medfilt(y_motions.unsqueeze(0).unsqueeze(0)) + + return torch.cat([x_motion_mesh, y_motion_mesh], 1) diff --git a/modelscope/models/cv/video_stabilization/utils/ProjectionUtils.py b/modelscope/models/cv/video_stabilization/utils/ProjectionUtils.py new file mode 100644 index 00000000..3b0546e4 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/utils/ProjectionUtils.py @@ -0,0 +1,514 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import math + +import cv2 +import numpy as np +import torch + +from ..DUT.config import cfg + + +def HomoCalc(grids, new_grids_loc): + """ + @param: grids the location of origin grid vertices [2, H, W] + @param: new_grids_loc the location of desired grid vertices [2, H, W] + + @return: homo_t homograph projection matrix for each grid [3, 3, H-1, W-1] + """ + + _, H, W = grids.shape + + new_grids = new_grids_loc.unsqueeze(0) + + Homo = torch.zeros(1, 3, 3, H - 1, W - 1).to(grids.device) + + grids = grids.unsqueeze(0) + + try: + # for common cases if all the homograph can be calculated + one = torch.ones_like(grids[:, 0:1, :-1, :-1], device=grids.device) + zero = torch.zeros_like(grids[:, 1:2, :-1, :-1], device=grids.device) + + A = torch.cat( + [ + torch.stack([ + grids[:, 0:1, :-1, :-1], grids[:, 1:2, :-1, :-1], one, + zero, zero, zero, + -1 * grids[:, 0:1, :-1, :-1] * new_grids[:, 0:1, :-1, :-1], + -1 * grids[:, 1:2, :-1, :-1] * new_grids[:, 0:1, :-1, :-1] + ], 2), # 1, 1, 8, h-1, w-1 + torch.stack([ + grids[:, 0:1, 1:, :-1], grids[:, 1:2, 1:, :-1], one, zero, + zero, zero, + -1 * grids[:, 0:1, 1:, :-1] * new_grids[:, 0:1, 1:, :-1], + -1 * grids[:, 1:2, 1:, :-1] * new_grids[:, 0:1, 1:, :-1] + ], 2), + torch.stack([ + grids[:, 0:1, :-1, 1:], grids[:, 1:2, :-1, + 1:], one, zero, zero, zero, + -1 * grids[:, 0:1, :-1, 1:] * new_grids[:, 0:1, :-1, 1:], + -1 * grids[:, 1:2, :-1, 1:] * new_grids[:, 0:1, :-1, 1:] + ], 2), + torch.stack([ + grids[:, 0:1, 1:, 1:], grids[:, 1:2, 1:, + 1:], one, zero, zero, zero, + -1 * grids[:, 0:1, 1:, 1:] * new_grids[:, 0:1, 1:, 1:], + -1 * grids[:, 1:2, 1:, 1:] * new_grids[:, 0:1, 1:, 1:] + ], 2), + torch.stack([ + zero, zero, zero, grids[:, 0:1, :-1, :-1], + grids[:, 1:2, :-1, :-1], one, + -1 * grids[:, 0:1, :-1, :-1] * new_grids[:, 1:2, :-1, :-1], + -1 * grids[:, 1:2, :-1, :-1] * new_grids[:, 1:2, :-1, :-1] + ], 2), + torch.stack([ + zero, zero, zero, grids[:, 0:1, 1:, :-1], + grids[:, 1:2, 1:, :-1], one, + -1 * grids[:, 0:1, 1:, :-1] * new_grids[:, 1:2, 1:, :-1], + -1 * grids[:, 1:2, 1:, :-1] * new_grids[:, 1:2, 1:, :-1] + ], 2), + torch.stack([ + zero, 
zero, zero, grids[:, 0:1, :-1, + 1:], grids[:, 1:2, :-1, 1:], one, + -1 * grids[:, 0:1, :-1, 1:] * new_grids[:, 1:2, :-1, 1:], + -1 * grids[:, 1:2, :-1, 1:] * new_grids[:, 1:2, :-1, 1:] + ], 2), + torch.stack([ + zero, zero, zero, grids[:, 0:1, 1:, 1:], grids[:, 1:2, 1:, + 1:], one, + -1 * grids[:, 0:1, 1:, 1:] * new_grids[:, 1:2, 1:, 1:], + -1 * grids[:, 1:2, 1:, 1:] * new_grids[:, 1:2, 1:, 1:] + ], 2), + ], + 1).view(8, 8, -1).permute(2, 0, 1) # 1, 8, 8, h-1, w-1 + B_ = torch.stack([ + new_grids[:, 0, :-1, :-1], + new_grids[:, 0, 1:, :-1], + new_grids[:, 0, :-1, 1:], + new_grids[:, 0, 1:, 1:], + new_grids[:, 1, :-1, :-1], + new_grids[:, 1, 1:, :-1], + new_grids[:, 1, :-1, 1:], + new_grids[:, 1, 1:, 1:], + ], 1).view(8, -1).permute( + 1, 0) # B, 8, h-1, w-1 ==> A @ H = B ==> H = A^-1 @ B + A_inverse = torch.inverse(A) + # B, 8, 8 @ B, 8, 1 --> B, 8, 1 + H_recovered = torch.bmm(A_inverse, B_.unsqueeze(2)) + + H_ = torch.cat([ + H_recovered, + torch.ones_like(H_recovered[:, 0:1, :], device=H_recovered.device) + ], 1).view(H_recovered.shape[0], 3, 3) + + H_ = H_.permute(1, 2, 0) + H_ = H_.view(Homo.shape) + Homo = H_ + except Exception: + # if some of the homography can not be calculated + one = torch.ones_like(grids[:, 0:1, 0, 0], device=grids.device) + zero = torch.zeros_like(grids[:, 1:2, 0, 0], device=grids.device) + H_ = torch.eye(3, device=grids.device) + for i in range(H - 1): + for j in range(W - 1): + A = torch.cat([ + torch.stack([ + grids[:, 0:1, i, j], grids[:, 1:2, i, + j], one, zero, zero, zero, + -1 * grids[:, 0:1, i, j] * new_grids[:, 0:1, i, j], + -1 * grids[:, 1:2, i, j] * new_grids[:, 0:1, i, j] + ], 2), + torch.stack([ + grids[:, 0:1, i + 1, j], grids[:, 1:2, i + 1, j], one, + zero, zero, zero, -1 * grids[:, 0:1, i + 1, j] + * new_grids[:, 0:1, i + 1, j], -1 + * grids[:, 1:2, i + 1, j] * new_grids[:, 0:1, i + 1, j] + ], 2), + torch.stack([ + grids[:, 0:1, i, j + 1], grids[:, 1:2, i, j + 1], one, + zero, zero, zero, -1 * grids[:, 0:1, i, j + 1] + * new_grids[:, 0:1, i, j + 1], -1 + * grids[:, 1:2, i, j + 1] * new_grids[:, 0:1, i, j + 1] + ], 2), + torch.stack([ + grids[:, 0:1, i + 1, j + 1], grids[:, 1:2, i + 1, + j + 1], one, zero, + zero, zero, -1 * grids[:, 0:1, i + 1, j + 1] + * new_grids[:, 0:1, i + 1, j + 1], + -1 * grids[:, 1:2, i + 1, j + 1] + * new_grids[:, 0:1, i + 1, j + 1] + ], 2), + torch.stack([ + zero, zero, zero, grids[:, 0:1, i, j], grids[:, 1:2, i, + j], one, + -1 * grids[:, 0:1, i, j] * new_grids[:, 1:2, i, j], + -1 * grids[:, 1:2, i, j] * new_grids[:, 1:2, i, j] + ], 2), + torch.stack([ + zero, zero, zero, grids[:, 0:1, i + 1, + j], grids[:, 1:2, i + 1, j], + one, -1 * grids[:, 0:1, i + 1, j] + * new_grids[:, 1:2, i + 1, j], -1 + * grids[:, 1:2, i + 1, j] * new_grids[:, 1:2, i + 1, j] + ], 2), + torch.stack([ + zero, zero, zero, grids[:, 0:1, i, j + 1], + grids[:, 1:2, i, + j + 1], one, -1 * grids[:, 0:1, i, j + 1] + * new_grids[:, 1:2, i, j + 1], -1 + * grids[:, 1:2, i, j + 1] * new_grids[:, 1:2, i, j + 1] + ], 2), + torch.stack([ + zero, zero, zero, grids[:, 0:1, i + 1, j + 1], + grids[:, 1:2, i + 1, + j + 1], one, -1 * grids[:, 0:1, i + 1, j + 1] + * new_grids[:, 1:2, i + 1, j + 1], + -1 * grids[:, 1:2, i + 1, j + 1] + * new_grids[:, 1:2, i + 1, j + 1] + ], 2), + ], 1) # B, 8, 8 + B_ = torch.stack([ + new_grids[:, 0, i, j], + new_grids[:, 0, i + 1, j], + new_grids[:, 0, i, j + 1], + new_grids[:, 0, i + 1, j + 1], + new_grids[:, 1, i, j], + new_grids[:, 1, i + 1, j], + new_grids[:, 1, i, j + 1], + new_grids[:, 1, i + 1, j + 1], + ], 1) # B, 8 ==> A @ 
H = B ==> H = A^-1 @ B + try: + A_inverse = torch.inverse(A) + + # B, 8, 8 @ B, 8, 1 --> B, 8, 1 + H_recovered = torch.bmm(A_inverse, B_.unsqueeze(2)) + + H_ = torch.cat([ + H_recovered, + torch.ones_like(H_recovered[:, 0:1, :]).to( + H_recovered.device) + ], 1).view(H_recovered.shape[0], 3, 3) + except Exception: + pass + Homo[:, :, :, i, j] = H_ + + homo_t = Homo.view(3, 3, H - 1, W - 1) + + return homo_t + + +def HomoProj(homo, pts): + """ + @param: homo [3, 3, G_H-1, G_W-1] + @param: pts [N, 2(W, H)] - [:, 0] for width and [:, 1] for height + + @return: projected pts [N, 2(W, H)] - [:, 0] for width and [:, 1] for height + """ + + # pts_location_x = (pts[:, 0:1] // cfg.MODEL.PIXELS).long() + # pts_location_y = (pts[:, 1:2] // cfg.MODEL.PIXELS).long() + pts_location_x = torch.div( + pts[:, 0:1], cfg.MODEL.PIXELS, rounding_mode='floor').long() + pts_location_y = torch.div( + pts[:, 1:2], cfg.MODEL.PIXELS, rounding_mode='floor').long() + + # if the grid is outside of the image + maxWidth = cfg.MODEL.WIDTH // cfg.MODEL.PIXELS - 1 + maxHeight = cfg.MODEL.HEIGHT // cfg.MODEL.PIXELS - 1 + index = (pts_location_x[:, 0] >= maxWidth).nonzero().long() + pts_location_x[index, :] = maxWidth - 1 + index = (pts_location_y[:, 0] >= maxHeight).nonzero().long() + pts_location_y[index, :] = maxHeight - 1 + + homo = homo.to(pts.device) + + # calculate the projection + x_dominator = pts[:, 0] * homo[0, 0, pts_location_y[:, 0], pts_location_x[:, 0]] + pts[:, 1] * \ + homo[0, 1, pts_location_y[:, 0], pts_location_x[:, 0]] + homo[0, 2, pts_location_y[:, 0], pts_location_x[:, 0]] + y_dominator = pts[:, 0] * homo[1, 0, pts_location_y[:, 0], pts_location_x[:, 0]] + pts[:, 1] * \ + homo[1, 1, pts_location_y[:, 0], pts_location_x[:, 0]] + homo[1, 2, pts_location_y[:, 0], pts_location_x[:, 0]] + noiminator = pts[:, 0] * homo[2, 0, pts_location_y[:, 0], pts_location_x[:, 0]] + pts[:, 1] * \ + homo[2, 1, pts_location_y[:, 0], pts_location_x[:, 0]] + homo[2, 2, pts_location_y[:, 0], pts_location_x[:, 0]] + noiminator = noiminator + + new_kp_x = x_dominator / noiminator + new_kp_y = y_dominator / noiminator + + return torch.stack([new_kp_x, new_kp_y], 1) + + +def multiHomoEstimate(motion, kp): + """ + @param: motion [4, N] + @param: kp [2, N] + """ + + from sklearn.cluster import KMeans + + new_kp = torch.cat([kp[1:2, :], kp[0:1, :]], 0) + motion[2:, :] + new_points_numpy = new_kp.cpu().detach().numpy().transpose(1, 0) + old_points = torch.stack([kp[1, :], kp[0, :]], 1).to(motion.device) + old_points_numpy = torch.cat([kp[1:2, :], kp[0:1, :]], + 0).cpu().detach().numpy().transpose(1, 0) + motion_numpy = new_points_numpy - old_points_numpy + + pred_Y = KMeans(n_clusters=2, random_state=2).fit_predict(motion_numpy) + if np.sum(pred_Y) > cfg.TRAIN.TOPK / 2: + pred_Y = 1 - pred_Y + cluster1_old_points = old_points_numpy[(pred_Y == 0).nonzero()[0], :] + cluster1_new_points = new_points_numpy[(pred_Y == 0).nonzero()[0], :] + + # pre-warping with global homography + Homo, _ = cv2.findHomography(cluster1_old_points, cluster1_new_points, + cv2.RANSAC) + + if Homo is None: + Homo = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) + + dominator = ( + Homo[2, 0] * old_points_numpy[:, 0] + + Homo[2, 1] * old_points_numpy[:, 1] + Homo[2, 2]) + new_points_projected = torch.from_numpy( + np.stack( + [(Homo[0, 0] * old_points_numpy[:, 0] + + Homo[0, 1] * old_points_numpy[:, 1] + Homo[0, 2]) / dominator, + (Homo[1, 0] * old_points_numpy[:, 0] + + Homo[1, 1] * old_points_numpy[:, 1] + Homo[1, 2]) / dominator], + 
1).astype(np.float32)).to(old_points.device).permute(1, 0) + + index = (pred_Y == 1).nonzero()[0] + attribute = np.zeros_like(new_points_numpy[:, 0:1]) # N', 1 + cluster2_old_points = old_points_numpy[index, :] + cluster2_new_points = new_points_numpy[index, :] + attribute[index, :] = np.expand_dims(np.ones_like(index), 1) + + cluster1_motion = cluster1_new_points - cluster1_old_points + clsuter2_motion = cluster2_new_points - cluster2_old_points + cluster1_meanMotion = np.mean(cluster1_motion, 0) + cluster2_meanMotion = np.mean(clsuter2_motion, 0) + distanceMeasure = MotionDistanceMeasure(cluster1_meanMotion, + cluster2_meanMotion) + + threhold = (np.sum(pred_Y) > cfg.MODEL.THRESHOLDPOINT) and distanceMeasure + + if threhold: + + Homo_2, _ = cv2.findHomography(cluster2_old_points, + cluster2_new_points, cv2.RANSAC) + if Homo_2 is None: + Homo_2 = Homo + + meshes_x, meshes_y = np.meshgrid( + np.arange(0, cfg.MODEL.WIDTH, cfg.MODEL.PIXELS), + np.arange(0, cfg.MODEL.HEIGHT, cfg.MODEL.PIXELS)) + + # Use first cluster to do projection + x_dominator = Homo[0, 0] * meshes_x + \ + Homo[0, 1] * meshes_y + Homo[0, 2] + y_dominator = Homo[1, 0] * meshes_x + \ + Homo[1, 1] * meshes_y + Homo[1, 2] + noiminator = Homo[2, 0] * meshes_x + Homo[2, 1] * meshes_y + Homo[2, 2] + + projected_1 = np.reshape( + np.stack([x_dominator / noiminator, y_dominator / noiminator], 2), + (-1, 2)) + + # Use second cluster to do projection + x_dominator = Homo_2[0, 0] * meshes_x + \ + Homo_2[0, 1] * meshes_y + Homo_2[0, 2] + y_dominator = Homo_2[1, 0] * meshes_x + \ + Homo_2[1, 1] * meshes_y + Homo_2[1, 2] + noiminator = Homo_2[2, 0] * meshes_x + \ + Homo_2[2, 1] * meshes_y + Homo_2[2, 2] + + projected_2 = np.reshape( + np.stack([x_dominator / noiminator, y_dominator / noiminator], 2), + (-1, 2)) + + # Determine use which projected position + distance_x = np.expand_dims(new_points_numpy[:, 0], 0) - np.reshape( + meshes_x, (-1, 1)) + distance_y = np.expand_dims(new_points_numpy[:, 1], 0) - np.reshape( + meshes_y, (-1, 1)) + distance = distance_x**2 + distance_y**2 # N, N' + distance_mask = (distance < (cfg.MODEL.RADIUS**2)) # N, N' + distance_mask_value = (distance_mask.astype(np.float32) + * attribute.transpose(1, 0)) # N, N' + distance = np.sum(distance_mask_value, 1) / \ + (np.sum(distance_mask, 1) + 1e-9) # N + + project_pos = np.reshape( + np.expand_dims(distance, 1) * projected_2 + np.expand_dims( + (1 - distance), 1) * projected_1, + (cfg.MODEL.HEIGHT // cfg.MODEL.PIXELS, + cfg.MODEL.WIDTH // cfg.MODEL.PIXELS, 2)) + + meshes_projected = torch.from_numpy(project_pos.astype(np.float32)).to( + old_points.device).permute(2, 0, 1) + + # calculate reference location for each keypoint + meshes_x, meshes_y = torch.meshgrid( + torch.arange(0, cfg.MODEL.WIDTH, cfg.MODEL.PIXELS), + torch.arange(0, cfg.MODEL.HEIGHT, cfg.MODEL.PIXELS)) + meshes_x = meshes_x.float().permute(1, 0) + meshes_y = meshes_y.float().permute(1, 0) + meshes_x = meshes_x.to(old_points.device) + meshes_y = meshes_y.to(old_points.device) + meshes = torch.stack([meshes_x, meshes_y], 0) + + x_motions = meshes[0, :, :] - \ + meshes_projected[0, :, :] + y_motions = meshes[1, :, :] - meshes_projected[1, :, :] + + homo_cal = HomoCalc(meshes, meshes_projected) + project_pts = HomoProj(homo_cal, old_points) + new_points_projected = project_pts.to(old_points.device).permute(1, 0) + + else: + Homo = torch.from_numpy(Homo.astype(np.float32)).to(old_points.device) + meshes_x, meshes_y = torch.meshgrid( + torch.arange(0, cfg.MODEL.WIDTH, cfg.MODEL.PIXELS), + 
torch.arange(0, cfg.MODEL.HEIGHT, cfg.MODEL.PIXELS)) + meshes_x = meshes_x.float().permute(1, 0) + meshes_y = meshes_y.float().permute(1, 0) + meshes_x = meshes_x.to(old_points.device) + meshes_y = meshes_y.to(old_points.device) + meshes_z = torch.ones_like(meshes_x).to(old_points.device) + meshes = torch.stack([meshes_x, meshes_y, meshes_z], 0) + meshes_projected = torch.mm(Homo, meshes.view(3, + -1)).view(*meshes.shape) + + x_motions = meshes[0, :, :] - meshes_projected[0, :, :] / \ + (meshes_projected[2, :, :]) + y_motions = meshes[1, :, :] - meshes_projected[1, :, :] / \ + (meshes_projected[2, :, :]) + + grids = torch.stack( + torch.meshgrid( + torch.arange(0, cfg.MODEL.WIDTH, cfg.MODEL.PIXELS), + torch.arange(0, cfg.MODEL.HEIGHT, cfg.MODEL.PIXELS)), + 0).to(motion.device).permute(0, 2, 1).reshape(2, -1).permute(1, 0) + + grids = grids.unsqueeze(2).float() # N', 2, 1 + projected_motion = torch.stack([x_motions, y_motions], + 2).view(-1, 2, + 1).to(motion.device) # G_H, G_W, 2 + + redisual_kp_motion = new_points_projected - torch.cat( + [kp[1:2, :], kp[0:1, :]], 0) + + motion[:2, :] = motion[:2, :] + motion[2:, :] + motion = motion.unsqueeze(0).repeat(grids.shape[0], 1, 1) # N', 4, N + motion[:, :2, :] = (motion[:, :2, :] - grids) / cfg.MODEL.WIDTH + origin_motion = motion[:, 2:, :] / cfg.MODEL.FLOWC + motion[:, 2:, :] = (redisual_kp_motion.unsqueeze(0) + - motion[:, 2:, :]) / cfg.MODEL.FLOWC + + return motion, projected_motion / cfg.MODEL.FLOWC, origin_motion + + +def singleHomoEstimate(motion, kp): + """ + @param: motion [4, N] + @param: kp [2, N] + """ + new_kp = torch.cat([kp[1:2, :], kp[0:1, :]], 0) + motion[2:, :] + new_points_numpy = new_kp.cpu().detach().numpy().transpose(1, 0) + old_points = torch.stack([kp[1, :], kp[0, :]], 1).to(motion.device) + old_points_numpy = torch.cat([kp[1:2, :], kp[0:1, :]], + 0).cpu().detach().numpy().transpose(1, 0) + + cluster1_old_points = old_points_numpy + cluster1_new_points = new_points_numpy + + # pre-warping with global homography + Homo, _ = cv2.findHomography(cluster1_old_points, cluster1_new_points, + cv2.RANSAC) + + if Homo is None: + Homo = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) + + dominator = ( + Homo[2, 0] * old_points_numpy[:, 0] + + Homo[2, 1] * old_points_numpy[:, 1] + Homo[2, 2]) + new_points_projected = torch.from_numpy( + np.stack( + [(Homo[0, 0] * old_points_numpy[:, 0] + + Homo[0, 1] * old_points_numpy[:, 1] + Homo[0, 2]) / dominator, + (Homo[1, 0] * old_points_numpy[:, 0] + + Homo[1, 1] * old_points_numpy[:, 1] + Homo[1, 2]) / dominator], + 1).astype(np.float32)).to(old_points.device).permute(1, 0) + + Homo = torch.from_numpy(Homo.astype(np.float32)).to( + old_points.device) # 3, 3 + meshes_x, meshes_y = torch.meshgrid( + torch.arange(0, cfg.MODEL.WIDTH, cfg.MODEL.PIXELS), + torch.arange(0, cfg.MODEL.HEIGHT, cfg.MODEL.PIXELS)) + meshes_x = meshes_x.float().permute(1, 0) + meshes_y = meshes_y.float().permute(1, 0) + meshes_x = meshes_x.to(old_points.device) + meshes_y = meshes_y.to(old_points.device) + meshes_z = torch.ones_like(meshes_x).to(old_points.device) + meshes = torch.stack([meshes_x, meshes_y, meshes_z], + 0) # 3, H // PIXELS, W // PIXELS + meshes_projected = torch.mm(Homo, meshes.view(3, -1)).view(*meshes.shape) + x_motions = meshes[0, :, :] - meshes_projected[0, :, :] / \ + (meshes_projected[2, :, :]) # H//PIXELS, W//PIXELS + y_motions = meshes[1, :, :] - \ + meshes_projected[1, :, :] / (meshes_projected[2, :, :]) + + grids = torch.stack( + torch.meshgrid( + torch.arange(0, cfg.MODEL.WIDTH, 
cfg.MODEL.PIXELS), + torch.arange(0, cfg.MODEL.HEIGHT, cfg.MODEL.PIXELS)), + 0).to(motion.device).permute(0, 2, 1).reshape(2, -1).permute( + 1, 0) # 2, W, H --> 2, H, W --> 2, N' + + grids = grids.unsqueeze(2).float() # N', 2, 1 + projected_motion = torch.stack([x_motions, y_motions], + 2).view(-1, 2, + 1).to(motion.device) # G_H, G_W, 2 + + redisual_kp_motion = new_points_projected - torch.cat( + [kp[1:2, :], kp[0:1, :]], 0) + + # to kp_flow (kp(t)) location + motion[:2, :] = motion[:2, :] + motion[2:, :] + motion = motion.unsqueeze(0).repeat(grids.shape[0], 1, 1) # N', 4, N + motion[:, :2, :] = (motion[:, :2, :] - grids) / cfg.MODEL.WIDTH + origin_motion = motion[:, 2:, :] / cfg.MODEL.FLOWC + motion[:, 2:, :] = (redisual_kp_motion.unsqueeze(0) + - motion[:, 2:, :]) / cfg.MODEL.FLOWC + + return motion, projected_motion / cfg.MODEL.FLOWC, origin_motion + + +def f_rot(x): + return math.atan2(x[1], x[0]) / math.pi * 180 + + +def MotionDistanceMeasure(motion1, motion2): + """ + MotionDistanceMeasure + @params motion1 np.array(2) (w, h) + @params motion2 np.array(2) (w, h) + + @return bool describe whether the two motion are close or not, True for far and False for close + """ + + mangnitue_motion1 = np.sqrt(np.sum(motion1**2)) + mangnitue_motion2 = np.sqrt(np.sum(motion2**2)) + diff_mangnitude = np.abs(mangnitue_motion1 - mangnitue_motion2) + + rot_motion1 = f_rot(motion1) + rot_motion2 = f_rot(motion2) + diff_rot = np.abs(rot_motion1 - rot_motion2) + if diff_rot > 180: + diff_rot = 360 - diff_rot + + temp_value_12 = (diff_mangnitude >= cfg.Threshold.MANG) + temp_value_13 = (diff_rot >= cfg.Threshold.ROT) + + return temp_value_12 or temp_value_13 diff --git a/modelscope/models/cv/video_stabilization/utils/RAFTUtils.py b/modelscope/models/cv/video_stabilization/utils/RAFTUtils.py new file mode 100644 index 00000000..735245cf --- /dev/null +++ b/modelscope/models/cv/video_stabilization/utils/RAFTUtils.py @@ -0,0 +1,96 @@ +# Part of the implementation is borrowed and modified from RAFT, +# publicly available at https://github.com/princeton-vl/RAFT + +import numpy as np +import torch +import torch.nn.functional as F +from scipy import interpolate + + +class InputPadder: + """ Pads images such that dimensions are divisible by 8 """ + + def __init__(self, dims, mode='sintel'): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == 'sintel': + self._pad = [ + pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, + pad_ht - pad_ht // 2 + ] + else: + self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode='replicate') for x in inputs] + + def unpad(self, x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + + +def forward_interpolate(flow): + flow = flow.detach().cpu().numpy() + dx, dy = flow[0], flow[1] + + ht, wd = dx.shape + x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) + + x1 = x0 + dx + y1 = y0 + dy + + x1 = x1.reshape(-1) + y1 = y1.reshape(-1) + dx = dx.reshape(-1) + dy = dy.reshape(-1) + + valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) + x1 = x1[valid] + y1 = y1[valid] + dx = dx[valid] + dy = dy[valid] + + flow_x = interpolate.griddata((x1, y1), + dx, (x0, y0), + method='nearest', + fill_value=0) + + flow_y = interpolate.griddata((x1, y1), + dy, (x0, y0), + method='nearest', + fill_value=0) + + flow = np.stack([flow_x, flow_y], axis=0) + return 
torch.from_numpy(flow).float() + + +def bilinear_sampler(img, coords, mode='bilinear', mask=False): + """ Wrapper for grid_sample, uses pixel coordinates """ + H, W = img.shape[-2:] + xgrid, ygrid = coords.split([1, 1], dim=-1) + xgrid = 2 * xgrid / (W - 1) - 1 + ygrid = 2 * ygrid / (H - 1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + img = F.grid_sample(img, grid, align_corners=True) + + if mask: + mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) + return img, mask.float() + + return img + + +def coords_grid(batch, ht, wd, device): + coords = torch.meshgrid( + torch.arange(ht, device=device), torch.arange(wd, device=device)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +def upflow8(flow, mode='bilinear'): + new_size = (8 * flow.shape[2], 8 * flow.shape[3]) + return 8 * F.interpolate( + flow, size=new_size, mode=mode, align_corners=True) diff --git a/modelscope/models/cv/video_stabilization/utils/WarpUtils.py b/modelscope/models/cv/video_stabilization/utils/WarpUtils.py new file mode 100644 index 00000000..e8f713cd --- /dev/null +++ b/modelscope/models/cv/video_stabilization/utils/WarpUtils.py @@ -0,0 +1,82 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import numpy as np +import torch +import torch.nn.functional as F +from tqdm import tqdm + +from ..DUT.config import cfg +from .ProjectionUtils import HomoCalc, HomoProj + + +def mesh_warp_frame(frame, x_motion, y_motion, cap_width, cap_height): + """ + @param frame current frame [N, 1, H, W] + @param x_motion [N, 1, G_H, G_W] + @param y_motion [N, 1, G_H, G_W] + + @return mesh warping according to given motion + """ + + target_device = frame.device + + src_grids = torch.stack( + torch.meshgrid( + torch.arange( + 0, cfg.MODEL.WIDTH, cfg.MODEL.PIXELS, device=target_device), + torch.arange( + 0, cfg.MODEL.HEIGHT, cfg.MODEL.PIXELS, device=target_device)), + 0).permute(0, 2, 1).unsqueeze(0).float() # 2, G_H, G_W + + des_grids = src_grids + torch.cat([x_motion, y_motion], 1) + + projection = [] + + for i in range(des_grids.shape[0]): + homo = HomoCalc(src_grids[0], des_grids[i]) + + origin_kp = torch.stack( + torch.meshgrid( + torch.arange(0, cfg.MODEL.WIDTH, device=target_device), + torch.arange(0, cfg.MODEL.HEIGHT, device=target_device)), + 0).permute(0, 2, 1).float() # 2, H, W + + projected_kp = HomoProj( + homo, + origin_kp.contiguous().view(2, -1).permute(1, 0)).permute(1, 0) + + projection.append(projected_kp.contiguous().view( + *origin_kp.shape).permute(1, 2, 0)) # 2, H, W --> H, W, 2 + projection = torch.stack(projection, 0) + + projection[:, :, :, 0] = projection[:, :, :, 0] / cfg.MODEL.WIDTH * 2. - 1. + projection[:, :, :, 1] = projection[:, :, :, 1] / \ + cfg.MODEL.HEIGHT * 2. - 1. 
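Both bilinear_sampler above and the mesh warp here rescale pixel coordinates into the normalized [-1, 1] space that torch.nn.functional.grid_sample expects. A small self-contained sanity check of that convention (dummy tensors, independent of the stabilization code): an identity grid built this way reproduces the input.

    import torch
    import torch.nn.functional as F

    frame = torch.rand(1, 1, 8, 8)
    ys, xs = torch.meshgrid(torch.arange(8), torch.arange(8))    # row / column indices
    grid = torch.stack([xs, ys], dim=-1).float().unsqueeze(0)    # 1, H, W, 2 in pixels (x, y)
    grid = grid / 7 * 2 - 1                                      # map [0, W-1] -> [-1, 1]
    out = F.grid_sample(frame, grid, align_corners=True)
    assert torch.allclose(out, frame, atol=1e-5)                 # identity warp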
+ # warp with original images + projection = projection.permute(0, 3, 1, 2) + projection = F.interpolate( + projection, (cap_height, cap_width), + mode='bilinear', + align_corners=True) + projection = projection.permute(0, 2, 3, 1) + + generated_frame = F.grid_sample(frame, projection, align_corners=True) + + return generated_frame + + +def warpListImage(images, x_motion, y_motion, cap_width, cap_height): + """ + @param images List(image [1, 1, H, W]) + @param x_motion [G_H, G_W, N] + @param y_motion [G_H, G_W, N] + """ + + frames = np.concatenate(images, 0) + x_motion = np.expand_dims(np.transpose(x_motion, (2, 0, 1)), 1) + y_motion = np.expand_dims(np.transpose(y_motion, (2, 0, 1)), 1) + frames = torch.from_numpy(frames.astype(np.float32)) + x_motion = torch.from_numpy(x_motion.astype(np.float32)) + y_motion = torch.from_numpy(y_motion.astype(np.float32)) + return mesh_warp_frame(frames, x_motion, y_motion, cap_width, cap_height) diff --git a/modelscope/models/cv/video_stabilization/utils/__init__.py b/modelscope/models/cv/video_stabilization/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/video_stabilization/utils/image_utils.py b/modelscope/models/cv/video_stabilization/utils/image_utils.py new file mode 100644 index 00000000..9a4f5c2c --- /dev/null +++ b/modelscope/models/cv/video_stabilization/utils/image_utils.py @@ -0,0 +1,383 @@ +# @Time : 2018-9-21 14:36 +# @Author : xylon +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import torch +from skimage import transform +from torch.nn import functional as F + +from modelscope.models.cv.video_stabilization.utils.math_utils import L2Norm + + +def clip_patch(kpts_byxc, kpts_scale, kpts_ori, im_info, images, PSIZE): + """ + clip patch from im_C, im_S, im_info, im_raw. 
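warpListImage above stacks per-frame motion meshes and hands them to mesh_warp_frame. A hedged usage sketch with dummy inputs, assuming the import paths introduced by this patch and that cfg.MODEL.WIDTH / cfg.MODEL.HEIGHT are multiples of cfg.MODEL.PIXELS (as the rest of the code assumes); with zero motion the result is a near-identity warp of the input frames:

    import numpy as np
    from modelscope.models.cv.video_stabilization.DUT.config import cfg
    from modelscope.models.cv.video_stabilization.utils.WarpUtils import warpListImage

    n, cap_h, cap_w = 4, 240, 320                                # dummy clip size
    frames = [np.random.rand(1, 1, cap_h, cap_w).astype(np.float32) for _ in range(n)]
    gh = cfg.MODEL.HEIGHT // cfg.MODEL.PIXELS                    # mesh grid height
    gw = cfg.MODEL.WIDTH // cfg.MODEL.PIXELS                     # mesh grid width
    x_motion = np.zeros((gh, gw, n), dtype=np.float32)           # zero motion -> near-identity warp
    y_motion = np.zeros((gh, gw, n), dtype=np.float32)
    warped = warpListImage(frames, x_motion, y_motion, cap_w, cap_h)   # (n, 1, cap_h, cap_w) tensor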
+ :param kpts_byxc: tensor #(B*topk, 4): the 4 correspond to (b, y, x, 0) each element in it has length B*topk + :param kpts_scale: tensor(B*topk): image scale value corresponding to topk keypoints in all batch + :param kpts_ori: tensor(B*topk, 2): image orintation value corresponding to topk keypoints in all batch + :param im_info: tensor (B, 2): a list contain rescale ratio sh and sw + :param images: tensor(B, 1, H, W): like 960*720 gray image before image rescaled to 320*240 + :param PSIZE: should be cfg.PATCH.size + :return: torch(B*topk, psize, psize): B*topk patch resized + """ + assert kpts_byxc.size(0) == kpts_scale.size(0) + out_width = out_height = PSIZE + device = kpts_byxc.device + B, C, im_height, im_width = images.size() + num_kp = kpts_byxc.size(0) # B*K + max_y = int(im_height - 1) + max_x = int(im_width - 1) + y_t, x_t = torch.meshgrid([ + torch.linspace(-1, 1, out_height, dtype=torch.float, device=device), + torch.linspace(-1, 1, out_width, dtype=torch.float, device=device), + ]) + one_t = x_t.new_full(x_t.size(), fill_value=1) + x_t = x_t.contiguous().view(-1) + y_t = y_t.contiguous().view(-1) + one_t = one_t.view(-1) + grid = torch.stack((x_t, y_t, one_t)) # (3, out_width*out_height) + grid = grid.view(-1) # (3*out_width*out_height) + grid = grid.repeat(num_kp) # (numkp*3*out_width*out_height) + # [num_kp, 3, 81] # this grid is designed to mask on keypoint from its left-up[-1, -1] to right-bottom[1, 1] + grid = grid.view(num_kp, 3, -1) + + # + # create 6D affine from scale and orientation + # [s, 0, 0] [cos, -sin, 0] + # [0, s, 0] * [sin, cos, 0] + # [0, 0, 1] [0, 0, 1] + # + thetas = torch.eye( + 2, 3, dtype=torch.float, + device=device) # [[ 1., 0., 0.],[ 0., 1., 0.]] (2, 3) + thetas = thetas.unsqueeze(0).repeat(num_kp, 1, 1) # (num_kp, 2, 3) + im_info = im_info[:, 0].unsqueeze(-1) # (B, 1) + kpts_scale = kpts_scale.view(im_info.size(0), -1) / im_info # (B, topk) + kpts_scale = kpts_scale.view(-1) / 2.0 # (numkp) + thetas = thetas * kpts_scale[:, None, None] + ones = torch.tensor([[[0, 0, 1]]], dtype=torch.float, + device=device).repeat(num_kp, 1, 1) # (numkp, 1, 1) + thetas = torch.cat((thetas, ones), 1) # (num_kp, 3, 3) + # thetas like this + # [sw, 0, 0] + # [0, sh, 0] + # [0, 0, 1] + + if kpts_ori is not None: + cos = kpts_ori[:, 0].unsqueeze(-1) # [num_kp, 1] + sin = kpts_ori[:, 1].unsqueeze(-1) # [num_kp, 1] + zeros = cos.new_full(cos.size(), fill_value=0) + ones = cos.new_full(cos.size(), fill_value=1) + R = torch.cat((cos, -sin, zeros, sin, cos, zeros, zeros, zeros, ones), + dim=-1) + R = R.view(-1, 3, 3) + thetas = torch.matmul(thetas, R) + + # Apply transformation to regular grid + # [num_kp,3,3] * [num_kp,3,H*W] = [num_kp, 3, 81] # magnify grid to each keypoint scale + T_g = torch.matmul(thetas, grid) + x = T_g[:, 0, :] # (numkp, 81) + y = T_g[:, 1, :] # (numkp, 81) + + # get each keypoint x + kp_x_ofst = kpts_byxc[:, 2].view(B, -1).float() / im_info # (B, topk) + kp_x_ofst = kp_x_ofst.view(-1, 1) # (numkp, 1) get each keypoint x + # get each keypoint y + kp_y_ofst = kpts_byxc[:, 1].view(B, -1).float() / im_info # (B, topk) + kp_y_ofst = kp_y_ofst.view(-1, 1) # (numkp, 1) get each keypoint y + + # centerize on keypoints + # [num_kp,81] + # [num_kp,1] # move grid center on each keypoint + x = x + kp_x_ofst + # [num_kp,81] + # [num_kp,1] # move grid center on each keypoint + y = y + kp_y_ofst + x = x.view(-1) # [num_kp*81] + y = y.view(-1) # [num_kp*81] + + # interpolation + x0 = x.floor().long() # [num_kp*81] + x1 = x0 + 1 # [num_kp*81] + y0 = 
y.floor().long() # [num_kp*81] + y1 = y0 + 1 # [num_kp*81] + + x0 = x0.clamp(min=0, max=max_x) # [num_kp*81] + x1 = x1.clamp(min=0, max=max_x) # [num_kp*81] + y0 = y0.clamp(min=0, max=max_y) # [num_kp*81] + y1 = y1.clamp(min=0, max=max_y) # [num_kp*81] + + dim2 = im_width + dim1 = im_width * im_height + batch_inds = kpts_byxc[:, 0].unsqueeze( + -1) # (num_kp, 1) get each keypoint batch number + base = batch_inds.repeat( + 1, out_height * out_width + ) # [num_kp, 81] # means batch indexes correspond to each grid pixel + # [num_kp*81] # correspond to each grid pixel start index if all pixel flatten to a vector + base = base.view(-1) * dim1 + base_y0 = ( + base + y0 * dim2 + ) # correspond each grid pixel y0 pixel if all pixel flatten to a vector + base_y1 = ( + base + y1 * dim2 + ) # correspond each grid pixel y1 pixel if all pixel flatten to a vector + idx_a = ( + base_y0 + x0 + ) # correspond left_up point pixel index if all pixel flatten to a vector + idx_b = base_y1 + x0 # left-bottom pixel + idx_c = base_y0 + x1 # right-up pixel + idx_d = base_y1 + x1 # right-bottom pixel + + im_flat = images.view(-1) # [B*height*width] # flatten all pixel + + # [num_kp*81] # get pixel value in index idx_a + Ia = im_flat.gather(0, idx_a) + # [num_kp*81] # get pixel value in index idx_b + Ib = im_flat.gather(0, idx_b) + # [num_kp*81] # get pixel value in index idx_c + Ic = im_flat.gather(0, idx_c) + # [num_kp*81] # get pixel value in index idx_d + Id = im_flat.gather(0, idx_d) + + x0_f = x0.float() # [num_kp*81] + x1_f = x1.float() # [num_kp*81] + y0_f = y0.float() # [num_kp*81] + y1_f = y1.float() # [num_kp*81] + + # [num_kp*81] # interpolation weight which is the distance from x to x1 times y to y1 + wa = (x1_f - x) * (y1_f - y) + wb = (x1_f - x) * (y - y0_f) # [num_kp*81] # interpolation weight + wc = (x - x0_f) * (y1_f - y) # [num_kp*81] # interpolation weight + wd = (x - x0_f) * (y - y0_f) # [num_kp*81] # interpolation weight + + output = (wa * Ia + wb * Ib + wc * Ic + wd * Id + ) # interpolation value in each keypoints grid + output = output.view(num_kp, out_height, out_width) + return output.unsqueeze(1) + + +def warp(im1_data, homo21): + """ + warp im1 to im2 + cause we get pixel valu ein im2 from im1 + so we warp grid in im2 to im1 that we need homo21 + :param im1_data: (B, H, W, C) + :param homo21: (B, 3, 3) + :return: out_image (B, H, W, C) + """ + B, imH, imW, C = im1_data.size() + outH, outW = imH, imW + gy, gx = torch.meshgrid([torch.arange(outH), torch.arange(outW)]) + gx, gy = gx.float().unsqueeze(-1), gy.float().unsqueeze(-1) + ones = gy.new_full(gy.size(), fill_value=1) + grid = torch.cat((gx, gy, ones), -1) # (H, W, 3) + grid = grid.unsqueeze(0) # (1, H, W, 3) + grid = grid.repeat(B, 1, 1, 1) # (B, H, W, 3) + grid = grid.view(grid.size(0), -1, grid.size(-1)) # (B, H*W, 3) + grid = grid.permute(0, 2, 1) # (B, 3, H*W) + grid = grid.type_as(homo21).to(homo21.device) + + # (B, 3, 3) matmul (B, 3, H*W) => (B, 3, H*W) + grid_w = torch.matmul(homo21, grid) + grid_w = grid_w.permute(0, 2, 1) # (B, H*W, 3) + grid_w = grid_w.div(grid_w[:, :, 2].unsqueeze(-1) + 1e-8) # (B, H*W, 3) + grid_w = grid_w.view(B, outH, outW, -1)[:, :, :, :2] # (B, H, W, 2) + grid_w[:, :, :, 0] = grid_w[:, :, :, 0].div(imW - 1) * 2 - 1 + grid_w[:, :, :, 1] = grid_w[:, :, :, 1].div(imH - 1) * 2 - 1 + + out_image = torch.nn.functional.grid_sample( + im1_data.permute(0, 3, 1, 2), grid_w) # (B, C, H, W) + + return out_image.permute(0, 2, 3, 1) + + +def filtbordmask(imscore, radius): + bs, height, width, c = imscore.size() + 
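As a quick illustration of the warp() helper above (dummy shapes; the import path is assumed from this patch), an identity homography should give back essentially the same image:

    import torch
    from modelscope.models.cv.video_stabilization.utils.image_utils import warp

    im = torch.rand(2, 32, 32, 1)                       # B, H, W, C layout expected by warp
    homo_identity = torch.eye(3).unsqueeze(0).repeat(2, 1, 1)
    out = warp(im, homo_identity)                       # grid built in im2, sampled from im1
    assert out.shape == im.shape                        # content is ~unchanged up to resampling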
mask = imscore.new_full((1, height - 2 * radius, width - 2 * radius, 1), + fill_value=1) + mask = F.pad( + input=mask, + pad=(0, 0, radius, radius, radius, radius, 0, 0), + mode='constant', + value=0, + ) + return mask + + +def filter_border(imscore, radius=8): + imscore = imscore * filtbordmask(imscore, radius=radius) + return imscore + + +def nms(input, thresh=0.0, ksize=5): + """ + non maximum depression in each pixel if it is not maximum probability in its ksize*ksize range + :param input: (B, H, W, 1) + :param thresh: float + :param ksize: int + :return: mask (B, H, W, 1) + """ + device = input.device + batch, height, width, channel = input.size() + pad = ksize // 2 + zeros = torch.zeros_like(input) + input = torch.where(input < thresh, zeros, input) + input_pad = F.pad( + input=input, + pad=(0, 0, 2 * pad, 2 * pad, 2 * pad, 2 * pad, 0, 0), + mode='constant', + value=0, + ) + slice_map = torch.tensor([], dtype=input_pad.dtype, device=device) + for i in range(ksize): + for j in range(ksize): + slice = input_pad[:, i:height + 2 * pad + i, + j:width + 2 * pad + j, :] + slice_map = torch.cat((slice_map, slice), -1) + + max_slice = slice_map.max(dim=-1, keepdim=True)[0] + center_map = slice_map[:, :, :, slice_map.size(-1) // 2].unsqueeze(-1) + mask = torch.ge(center_map, max_slice) + + mask = mask[:, pad:height + pad, pad:width + pad, :] + + return mask.type_as(input) + + +def topk_map(maps, k=512): + """ + find the top k maximum pixel probability in a maps + :param maps: (B, H, W, 1) + :param k: int + :return: mask (B, H, W, 1) + """ + batch, height, width, _ = maps.size() + maps_flat = maps.view(batch, -1) + + indices = maps_flat.sort(dim=-1, descending=True)[1][:, :k] + batch_idx = ( + torch.arange(0, batch, dtype=indices.dtype, + device=indices.device).unsqueeze(-1).repeat(1, k)) + batch_idx = batch_idx.view(-1).cpu().detach().numpy() + row_idx = indices.contiguous().view(-1).cpu().detach().numpy() + batch_indexes = (batch_idx, row_idx) + + topk_mask_flat = torch.zeros( + maps_flat.size(), dtype=torch.uint8).to(maps.device) + topk_mask_flat[batch_indexes] = 1 + + mask = topk_mask_flat.view(batch, height, width, -1) + return mask + + +def get_gauss_filter_weight(ksize, sig): + """ + generate a gaussian kernel + :param ksize: int + :param sig: float + :return: numpy(ksize*ksize) + """ + mu_x = mu_y = ksize // 2 + if sig == 0: + psf = torch.zeros((ksize, ksize)).float() + psf[mu_y, mu_x] = 1.0 + else: + sig = torch.tensor(sig).float() + x = torch.arange(ksize)[None, :].repeat(ksize, 1).float() + y = torch.arange(ksize)[:, None].repeat(1, ksize).float() + psf = torch.exp(-( + (x - mu_x)**2 / (2 * sig**2) + (y - mu_y)**2 / (2 * sig**2))) + return psf + + +def soft_nms_3d(scale_logits, ksize, com_strength): + """ + calculate probability for each pixel in each scale space + :param scale_logits: (B, H, W, C) + :param ksize: int + :param com_strength: magnify parameter + :return: probability for each pixel in each scale, size is (B, H, W, C) + """ + num_scales = scale_logits.size(-1) + + max_each_scale = F.max_pool2d( + input=scale_logits.permute(0, 3, 1, 2), + kernel_size=ksize, + padding=ksize // 2, + stride=1, + ).permute(0, 2, 3, 1) # (B, H, W, C) + max_all_scale, max_all_scale_idx = max_each_scale.max( + dim=-1, keepdim=True) # (B, H, W, 1) + exp_maps = torch.exp(com_strength * (scale_logits - max_all_scale)) + sum_exp = F.conv2d( + input=exp_maps.permute(0, 3, 1, 2).contiguous(), + weight=exp_maps.new_full([1, num_scales, ksize, ksize], + fill_value=1).contiguous(), + stride=1, + 
padding=ksize // 2, + ).permute(0, 2, 3, 1) # (B, H, W, 1) + probs = exp_maps / (sum_exp + 1e-8) + return probs + + +def soft_max_and_argmax_1d(input, + orint_maps, + scale_list, + com_strength1, + com_strength2, + dim=-1, + keepdim=True): + """ + input should be pixel probability in each scale + this function calculate the final pixel probability summary from all scale and each pixel correspond scale + :param input: scale_probs(B, H, W, 10) + :param orint_maps: (B, H, W, 10, 2) + :param dim: final channel + :param scale_list: scale space list + :param keepdim: kepp dimension + :param com_strength1: magnify argument of score + :param com_strength2: magnify argument of scale + :return: score_map(B, H, W, 1), scale_map(B, H, W, 1), (orint_map(B, H, W, 1, 2)) + """ + inputs_exp1 = torch.exp( + com_strength1 * (input - torch.max(input, dim=dim, keepdim=True)[0])) + input_softmax1 = inputs_exp1 / ( + inputs_exp1.sum(dim=dim, keepdim=True) + 1e-8) # (B, H, W, 10) + + inputs_exp2 = torch.exp( + com_strength2 * (input - torch.max(input, dim=dim, keepdim=True)[0])) + input_softmax2 = inputs_exp2 / ( + inputs_exp2.sum(dim=dim, keepdim=True) + 1e-8) # (B, H, W, 10) + + score_map = torch.sum(input * input_softmax1, dim=dim, keepdim=keepdim) + + scale_list_shape = [1] * len(input.size()) + scale_list_shape[dim] = -1 + scale_list = scale_list.view(scale_list_shape).to(input_softmax2.device) + scale_map = torch.sum( + scale_list * input_softmax2, dim=dim, keepdim=keepdim) + + if orint_maps is not None: + orint_map = torch.sum( + orint_maps * input_softmax1.unsqueeze(-1), + dim=dim - 1, + keepdim=keepdim) # (B, H, W, 1, 2) + orint_map = L2Norm(orint_map, dim=-1) + return score_map, scale_map, orint_map + else: + return score_map, scale_map + + +def im_rescale(im, output_size): + h, w = im.shape[:2] + if isinstance(output_size, int): + if h > w: + new_h, new_w = output_size * h / w, output_size + else: + new_h, new_w = output_size, output_size * w / h + else: + new_h, new_w = output_size + new_h, new_w = int(new_h), int(new_w) + img = transform.resize(im, (new_h, new_w), mode='constant') + + return img, h, w, new_w / w, new_h / h diff --git a/modelscope/models/cv/video_stabilization/utils/math_utils.py b/modelscope/models/cv/video_stabilization/utils/math_utils.py new file mode 100644 index 00000000..47368921 --- /dev/null +++ b/modelscope/models/cv/video_stabilization/utils/math_utils.py @@ -0,0 +1,130 @@ +# @Time : 2018-9-21 14:36 +# @Author : xylon +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import numpy as np +import torch + + +def distance_matrix_vector(anchor, positive): + """ + Given batch of anchor descriptors and positive descriptors calculate distance matrix + :param anchor: (B, 128) + :param positive: (B, 128) + :return: + """ + eps = 1e-8 + FeatSimi_Mat = 2 - 2 * torch.mm(anchor, positive.t()) # [0, 4] + FeatSimi_Mat = FeatSimi_Mat.clamp(min=eps, max=4.0) + FeatSimi_Mat = torch.sqrt(FeatSimi_Mat) # euc [0, 2] + return FeatSimi_Mat + + +def pairwise_distances(x, y=None): + """ + Input: x is a Nxd matrix + y is an optional Mxd matirx + Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:] + if y is not given then use 'y=x'. + i.e. 
dist[i,j] = ||x[i,:]-y[j,:]||^2 + """ + x_norm = (x**2).sum(1).view(-1, 1) + if y is not None: + y_t = y.transpose(0, 1) + y_norm = (y**2).sum(1).view(1, -1) + else: + y_t = x.transpose(0, 1) + y_norm = x_norm.view(1, -1) + + dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t) + eps = 1e-8 + return torch.sqrt(dist.clamp(min=eps, max=np.inf)) + + +def ptCltoCr(leftC, homolr, right_imscale, right_imorint=None, clamp=True): + """ + ptCltoCr is the abbreviation of projective transform keypoints Coordinates in left back to Coordinates in right + :param leftC: tensor #(B*topk, 4): the 4 correspond to (b, y, x, 0) each element in it has length B*topk + :param homolr: torch(B, 3, 3): homogeneous matrix + :param right_imscale: (B, H, W, 1) + :param right_imorint: (B, H, W, 1, 2) + :param clamp: whether clamp rightC_homo + :return: tuple (b, y, x, 0) each element in that has length B*topk + """ + # projective transform im1_C back to im2 called im2_Cw + B, maxh, maxw, C = right_imscale.size( + ) # tuple (b, h, w) max size of image + leftC_homo = leftC.clone() + leftC_homo[:, 3] = leftC_homo[:, 3] + 1 # (B*topk, 4) (b, y, x, 1) + leftC_homo = leftC_homo[:, 1:] # (B*topk, 3) (y, x, 1) + leftC_homo = leftC_homo.index_select(1, leftC_homo.new_tensor( + [1, 0, 2])) # (B*topk, 3) [[x], [y], [1]] + leftC_homo = leftC_homo.view(B, -1, 3) # (B, topk, 3) + leftC_homo = leftC_homo.permute(0, 2, 1) # (B, 3, topk) + + rightC_homo = torch.matmul(homolr, + leftC_homo.float()) # (B, 3, topk) (x, y, h) + rightC_homo = rightC_homo.permute(0, 2, 1) # (B, topk, 3) (x, y, h) + # (B, topk, 3) (x, y, h) to 1 + rightC_homo = rightC_homo / ( + torch.unsqueeze(rightC_homo[:, :, 2], -1) + 1e-8) + rightC_homo = rightC_homo.round().long() + if clamp: + rightC_homo[:, :, 0] = rightC_homo[:, :, 0].clamp(min=0, max=maxw - 1) + rightC_homo[:, :, 1] = rightC_homo[:, :, 1].clamp(min=0, max=maxh - 1) + + topk = rightC_homo.size(1) + batch_v = (torch.arange(B, device=rightC_homo.device).view(B, 1, 1).repeat( + 1, topk, 1)) # (B, topk, 1) + # (B, topk, 4) (B, x, y, h) + rightC_homo = torch.cat((batch_v, rightC_homo), -1) + rightC_homo = rightC_homo.contiguous().view(-1, + 4) # (B*topk, 4) (B, x, y, h) + rightC_homo = rightC_homo.index_select( + 1, rightC_homo.new_tensor([0, 2, 1, 3])) # (B*topk, 4) (B, y, x, h) + rightC_homo[:, 3] = rightC_homo[:, 3] - 1 # (B*topk, 4) (B, y, x, 0) + + right_imS = right_imscale.view(-1) # (B*H*W) + dim1 = maxw + dim2 = maxh * maxw + scale_idx = rightC_homo[:, + 0] * dim2 + rightC_homo[:, + 1] * dim1 + rightC_homo[:, + 2] + scale_idx = scale_idx.clamp(min=0, max=dim2 * B - 1) + right_imS = right_imS.gather(0, scale_idx) # (B*topk) + + if right_imorint is None: + right_imO = None + else: + right_cos, right_sin = right_imorint.squeeze().chunk( + chunks=2, dim=-1) # each is (B, H, W, 1) + right_cos = right_cos.view(-1) # (B*H*W) + right_sin = right_sin.view(-1) # (B*H*W) + right_cos = right_cos.gather(0, scale_idx) # (B*topk) + right_sin = right_sin.gather(0, scale_idx) # (B*topk) + right_imO = torch.cat( + (right_cos.unsqueeze(-1), right_sin.unsqueeze(-1)), + dim=-1) # (B*topk, 2) + + return rightC_homo, right_imS, right_imO + + +def L2Norm(input, dim=-1): + input = input / torch.norm(input, p=2, dim=dim, keepdim=True) + return input + + +def MSD(x, y): + """ + mean square distance + :param x: (B, H, W, 2) 2 corresponds to XY + :param y: (B, H, W, 2) 2 corresponds to XY + :return: distance: (B, H, W, 1) + """ + sub = x - y + square = sub**2 + sm = square.sum(keepdim=True, dim=-1) + sqr = torch.sqrt((sm + 
1e-8).float()) + return sqr * 2 diff --git a/modelscope/models/cv/video_summarization/summarizer.py b/modelscope/models/cv/video_summarization/summarizer.py index c9987670..c6191e8f 100644 --- a/modelscope/models/cv/video_summarization/summarizer.py +++ b/modelscope/models/cv/video_summarization/summarizer.py @@ -2,13 +2,11 @@ # publicly available at https://github.com/e-apostolidis/PGL-SUM import os.path as osp -from copy import deepcopy from typing import Dict, Union import numpy as np import torch import torch.nn as nn -from torch.nn.parallel import DataParallel, DistributedDataParallel from modelscope.metainfo import Models from modelscope.models.base import Tensor, TorchModel @@ -205,37 +203,13 @@ class PGLVideoSummarization(TorchModel): self._device = torch.device('cpu') self.model = self.model.to(self._device) - self.model = self.load_pretrained(self.model, model_path) + self.model = self._load_pretrained(self.model, model_path) if self.training: self.model.train() else: self.model.eval() - def load_pretrained(self, net, load_path, strict=True, param_key='params'): - if isinstance(net, (DataParallel, DistributedDataParallel)): - net = net.module - load_net = torch.load( - load_path, map_location=lambda storage, loc: storage) - if param_key is not None: - if param_key not in load_net and 'params' in load_net: - param_key = 'params' - logger.info( - f'Loading: {param_key} does not exist, use params.') - if param_key in load_net: - load_net = load_net[param_key] - logger.info( - f'Loading {net.__class__.__name__} model from {load_path}, with param key: [{param_key}].' - ) - # remove unnecessary 'module.' - for k, v in deepcopy(load_net).items(): - if k.startswith('module.'): - load_net[k[7:]] = v - load_net.pop(k) - net.load_state_dict(load_net, strict=strict) - logger.info('load model done.') - return net - def _train_forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: frame_features = input['frame_features'] gtscore = input['gtscore'] diff --git a/modelscope/models/cv/video_super_resolution/__init__.py b/modelscope/models/cv/video_super_resolution/__init__.py new file mode 100644 index 00000000..0a2cc193 --- /dev/null +++ b/modelscope/models/cv/video_super_resolution/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .real_basicvsr_for_video_super_resolution import RealBasicVSRNetForVideoSR + +else: + _import_structure = { + 'real_basicvsr_for_video_super_resolution': + ['RealBasicVSRNetForVideoSR'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_super_resolution/basicvsr_net.py b/modelscope/models/cv/video_super_resolution/basicvsr_net.py new file mode 100644 index 00000000..1caa38bf --- /dev/null +++ b/modelscope/models/cv/video_super_resolution/basicvsr_net.py @@ -0,0 +1,404 @@ +# The implementation is adopted from mmedit, +# made publicly available under the Apache 2.0 License at +# https://github.com/open-mmlab/mmediting/blob/master/mmedit/models/backbones/sr_backbones/basicvsr_net.py + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.video_super_resolution.common import ( + PixelShufflePack, ResidualBlockNoBN, flow_warp, make_layer) + + +class ConvModule(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + act_cfg=dict(type='ReLU'), + inplace=True): + super(ConvModule, self).__init__() + assert act_cfg is None or isinstance(act_cfg, dict) + self.act_cfg = act_cfg + self.inplace = inplace + + self.with_activation = act_cfg is not None + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding) + if self.with_activation: + self.activate = getattr(nn, act_cfg['type'])(self.inplace) + + def forward(self, x): + x = self.conv(x) + if self.with_activation: + x = self.activate(x) + return x + + +class BasicVSRNet(nn.Module): + """BasicVSR network structure for video super-resolution. + Support only x4 upsampling. + Paper: + BasicVSR: The Search for Essential Components in Video Super-Resolution + and Beyond, CVPR, 2021 + Args: + mid_channels (int): Channel number of the intermediate features. + Default: 64. + num_blocks (int): Number of residual blocks in each propagation branch. + Default: 30. + spynet_pretrained (str): Pre-trained model path of SPyNet. + Default: None. + """ + + def __init__(self, mid_channels=64, num_blocks=30, spynet_pretrained=None): + + super().__init__() + + self.mid_channels = mid_channels + + # optical flow network for feature alignment + self.spynet = SPyNet(pretrained=spynet_pretrained) + + # propagation branches + self.backward_resblocks = ResidualBlocksWithInputConv( + mid_channels + 3, mid_channels, num_blocks) + self.forward_resblocks = ResidualBlocksWithInputConv( + mid_channels + 3, mid_channels, num_blocks) + + # upsample + self.fusion = nn.Conv2d( + mid_channels * 2, mid_channels, 1, 1, 0, bias=True) + self.upsample1 = PixelShufflePack( + mid_channels, mid_channels, 2, upsample_kernel=3) + self.upsample2 = PixelShufflePack( + mid_channels, 64, 2, upsample_kernel=3) + self.conv_hr = nn.Conv2d(64, 64, 3, 1, 1) + self.conv_last = nn.Conv2d(64, 3, 3, 1, 1) + self.img_upsample = nn.Upsample( + scale_factor=4, mode='bilinear', align_corners=False) + + # activation function + self.lrelu = nn.LeakyReLU(negative_slope=0.1, inplace=True) + + def check_if_mirror_extended(self, lrs): + """Check whether the input is a mirror-extended sequence. 
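The mirror-extended convention used by check_if_mirror_extended (documented and implemented just below) can be illustrated with a tiny dummy sequence: a clip concatenated with its temporal flip satisfies lrs[:, i] == lrs[:, t-1-i].

    import torch

    f = torch.rand(1, 3, 3, 8, 8)                    # n, t, c, h, w with t = 3 dummy frames
    lrs = torch.cat([f, f.flip(1)], dim=1)           # mirror-extended: f0 f1 f2 f2 f1 f0
    lrs_1, lrs_2 = torch.chunk(lrs, 2, dim=1)
    assert torch.norm(lrs_1 - lrs_2.flip(1)) == 0    # the exact check used by BasicVSRNet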
+ If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the + (t-1-i)-th frame. + Args: + lrs (tensor): Input LR images with shape (n, t, c, h, w) + """ + + self.is_mirror_extended = False + if lrs.size(1) % 2 == 0: + lrs_1, lrs_2 = torch.chunk(lrs, 2, dim=1) + if torch.norm(lrs_1 - lrs_2.flip(1)) == 0: + self.is_mirror_extended = True + + def compute_flow(self, lrs): + """Compute optical flow using SPyNet for feature warping. + Note that if the input is an mirror-extended sequence, 'flows_forward' + is not needed, since it is equal to 'flows_backward.flip(1)'. + Args: + lrs (tensor): Input LR images with shape (n, t, c, h, w) + Return: + tuple(Tensor): Optical flow. 'flows_forward' corresponds to the + flows used for forward-time propagation (current to previous). + 'flows_backward' corresponds to the flows used for + backward-time propagation (current to next). + """ + + n, t, c, h, w = lrs.size() + lrs_1 = lrs[:, :-1, :, :, :].reshape(-1, c, h, w) + lrs_2 = lrs[:, 1:, :, :, :].reshape(-1, c, h, w) + + flows_backward = self.spynet(lrs_1, lrs_2).view(n, t - 1, 2, h, w) + + if self.is_mirror_extended: # flows_forward = flows_backward.flip(1) + flows_forward = None + else: + flows_forward = self.spynet(lrs_2, lrs_1).view(n, t - 1, 2, h, w) + + return flows_forward, flows_backward + + def forward(self, lrs): + """Forward function for BasicVSR. + Args: + lrs (Tensor): Input LR sequence with shape (n, t, c, h, w). + Returns: + Tensor: Output HR sequence with shape (n, t, c, 4h, 4w). + """ + + n, t, c, h, w = lrs.size() + assert h >= 64 and w >= 64, ( + 'The height and width of inputs should be at least 64, ' + f'but got {h} and {w}.') + + # check whether the input is an extended sequence + self.check_if_mirror_extended(lrs) + + # compute optical flow + flows_forward, flows_backward = self.compute_flow(lrs) + + # backward-time propagation + outputs = [] + feat_prop = lrs.new_zeros(n, self.mid_channels, h, w) + for i in range(t - 1, -1, -1): + if i < t - 1: # no warping required for the last timestep + flow = flows_backward[:, i, :, :, :] + feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1)) + + feat_prop = torch.cat([lrs[:, i, :, :, :], feat_prop], dim=1) + feat_prop = self.backward_resblocks(feat_prop) + + outputs.append(feat_prop) + outputs = outputs[::-1] + + # forward-time propagation and upsampling + feat_prop = torch.zeros_like(feat_prop) + for i in range(0, t): + lr_curr = lrs[:, i, :, :, :] + if i > 0: # no warping required for the first timestep + if flows_forward is not None: + flow = flows_forward[:, i - 1, :, :, :] + else: + flow = flows_backward[:, -i, :, :, :] + feat_prop = flow_warp(feat_prop, flow.permute(0, 2, 3, 1)) + + feat_prop = torch.cat([lr_curr, feat_prop], dim=1) + feat_prop = self.forward_resblocks(feat_prop) + + # upsampling given the backward and forward features + out = torch.cat([outputs[i], feat_prop], dim=1) + out = self.lrelu(self.fusion(out)) + out = self.lrelu(self.upsample1(out)) + out = self.lrelu(self.upsample2(out)) + out = self.lrelu(self.conv_hr(out)) + out = self.conv_last(out) + base = self.img_upsample(lr_curr) + out += base + outputs[i] = out + + return torch.stack(outputs, dim=1) + + +class ResidualBlocksWithInputConv(nn.Module): + """Residual blocks with a convolution in front. + Args: + in_channels (int): Number of input channels of the first conv. + out_channels (int): Number of channels of the residual blocks. + Default: 64. + num_blocks (int): Number of residual blocks. Default: 30. 
+ """ + + def __init__(self, in_channels, out_channels=64, num_blocks=30): + super().__init__() + + main = [] + + # a convolution used to match the channels of the residual blocks + main.append(nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=True)) + main.append(nn.LeakyReLU(negative_slope=0.1, inplace=True)) + + # residual blocks + main.append( + make_layer( + ResidualBlockNoBN, num_blocks, mid_channels=out_channels)) + + self.main = nn.Sequential(*main) + + def forward(self, feat): + """Forward function for ResidualBlocksWithInputConv. + Args: + feat (Tensor): Input feature with shape (n, in_channels, h, w) + Returns: + Tensor: Output feature with shape (n, out_channels, h, w) + """ + return self.main(feat) + + +class SPyNet(nn.Module): + """SPyNet network structure. + The difference to the SPyNet in [tof.py] is that + 1. more SPyNetBasicModule is used in this version, and + 2. no batch normalization is used in this version. + Paper: + Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 + Args: + pretrained (str): path for pre-trained SPyNet. Default: None. + """ + + def __init__(self, pretrained=None): + super().__init__() + + self.basic_module = nn.ModuleList( + [SPyNetBasicModule() for _ in range(6)]) + + if isinstance(pretrained, str): + self.load_state_dict(torch.load(pretrained), strict=True) + elif pretrained is not None: + raise TypeError('[pretrained] should be str or None, ' + f'but got {type(pretrained)}.') + + self.register_buffer( + 'mean', + torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) + self.register_buffer( + 'std', + torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) + + def compute_flow(self, ref, supp): + """Compute flow from ref to supp. + Note that in this function, the images are already resized to a + multiple of 32. + Args: + ref (Tensor): Reference image with shape of (n, 3, h, w). + supp (Tensor): Supporting image with shape of (n, 3, h, w). + Returns: + Tensor: Estimated optical flow: (n, 2, h, w). + """ + n, _, h, w = ref.size() + + # normalize the input images + ref = [(ref - self.mean) / self.std] + supp = [(supp - self.mean) / self.std] + + # generate downsampled frames + for level in range(5): + ref.append( + F.avg_pool2d( + input=ref[-1], + kernel_size=2, + stride=2, + count_include_pad=False)) + supp.append( + F.avg_pool2d( + input=supp[-1], + kernel_size=2, + stride=2, + count_include_pad=False)) + ref = ref[::-1] + supp = supp[::-1] + + # flow computation + flow = ref[0].new_zeros(n, 2, h // 32, w // 32) + for level in range(len(ref)): + if level == 0: + flow_up = flow + else: + flow_up = F.interpolate( + input=flow, + scale_factor=2, + mode='bilinear', + align_corners=True) * 2.0 + + # add the residue to the upsampled flow + flow = flow_up + self.basic_module[level]( + torch.cat([ + ref[level], + flow_warp( + supp[level], + flow_up.permute(0, 2, 3, 1), + padding_mode='border'), flow_up + ], 1)) + + return flow + + def forward(self, ref, supp): + """Forward function of SPyNet. + This function computes the optical flow from ref to supp. + Args: + ref (Tensor): Reference image with shape of (n, 3, h, w). + supp (Tensor): Supporting image with shape of (n, 3, h, w). + Returns: + Tensor: Estimated optical flow: (n, 2, h, w). 
+ """ + + # upsize to a multiple of 32 + h, w = ref.shape[2:4] + w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1) + h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1) + ref = F.interpolate( + input=ref, size=(h_up, w_up), mode='bilinear', align_corners=False) + supp = F.interpolate( + input=supp, + size=(h_up, w_up), + mode='bilinear', + align_corners=False) + + # compute flow, and resize back to the original resolution + flow = F.interpolate( + input=self.compute_flow(ref, supp), + size=(h, w), + mode='bilinear', + align_corners=False) + + # adjust the flow values + flow[:, 0, :, :] *= float(w) / float(w_up) + flow[:, 1, :, :] *= float(h) / float(h_up) + + return flow + + +class SPyNetBasicModule(nn.Module): + """Basic Module for SPyNet. + Paper: + Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017 + """ + + def __init__(self): + super().__init__() + + self.basic_module = nn.Sequential( + ConvModule( + in_channels=8, + out_channels=32, + kernel_size=7, + stride=1, + padding=3, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=32, + out_channels=64, + kernel_size=7, + stride=1, + padding=3, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=64, + out_channels=32, + kernel_size=7, + stride=1, + padding=3, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=32, + out_channels=16, + kernel_size=7, + stride=1, + padding=3, + act_cfg=dict(type='ReLU')), + ConvModule( + in_channels=16, + out_channels=2, + kernel_size=7, + stride=1, + padding=3, + act_cfg=None)) + + def forward(self, tensor_input): + """ + Args: + tensor_input (Tensor): Input tensor with shape (b, 8, h, w). + 8 channels contain: + [reference image (3), neighbor image (3), initial flow (2)]. + Returns: + Tensor: Refined flow with shape (b, 2, h, w) + """ + return self.basic_module(tensor_input) diff --git a/modelscope/models/cv/video_super_resolution/common.py b/modelscope/models/cv/video_super_resolution/common.py new file mode 100644 index 00000000..56eb281c --- /dev/null +++ b/modelscope/models/cv/video_super_resolution/common.py @@ -0,0 +1,140 @@ +# The implementation is adopted from mmedit, +# made publicly available under the Apache 2.0 License at +# https://github.com/open-mmlab/mmediting/tree/master/mmedit/models/common + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlockNoBN(nn.Module): + """Residual block without BN. + It has a style of: + :: + ---Conv-ReLU-Conv-+- + |________________| + Args: + mid_channels (int): Channel number of intermediate features. + Default: 64. + res_scale (float): Used to scale the residual before addition. + Default: 1.0. + """ + + def __init__(self, mid_channels=64, res_scale=1.0): + super().__init__() + self.res_scale = res_scale + self.conv1 = nn.Conv2d(mid_channels, mid_channels, 3, 1, 1, bias=True) + self.conv2 = nn.Conv2d(mid_channels, mid_channels, 3, 1, 1, bias=True) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward function. + Args: + x (Tensor): Input tensor with shape (n, c, h, w). + Returns: + Tensor: Forward results. + """ + + identity = x + out = self.conv2(self.relu(self.conv1(x))) + return identity + out * self.res_scale + + +class PixelShufflePack(nn.Module): + """Pixel Shuffle upsample layer. + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scale_factor (int): Upsample ratio. + upsample_kernel (int): Kernel size of Conv layer to expand channels. + Returns: + Upsampled feature map. 
+ """ + + def __init__(self, in_channels, out_channels, scale_factor, + upsample_kernel): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.scale_factor = scale_factor + self.upsample_kernel = upsample_kernel + self.upsample_conv = nn.Conv2d( + self.in_channels, + self.out_channels * scale_factor * scale_factor, + self.upsample_kernel, + padding=(self.upsample_kernel - 1) // 2) + + def forward(self, x): + """Forward function for PixelShufflePack. + Args: + x (Tensor): Input tensor with shape (n, c, h, w). + Returns: + Tensor: Forward results. + """ + x = self.upsample_conv(x) + x = F.pixel_shuffle(x, self.scale_factor) + return x + + +def flow_warp(x, + flow, + interpolation='bilinear', + padding_mode='zeros', + align_corners=True): + """Warp an image or a feature map with optical flow. + Args: + x (Tensor): Tensor with size (n, c, h, w). + flow (Tensor): Tensor with size (n, h, w, 2). The last dimension is + a two-channel, denoting the width and height relative offsets. + Note that the values are not normalized to [-1, 1]. + interpolation (str): Interpolation mode: 'nearest' or 'bilinear'. + Default: 'bilinear'. + padding_mode (str): Padding mode: 'zeros' or 'border' or 'reflection'. + Default: 'zeros'. + align_corners (bool): Whether align corners. Default: True. + Returns: + Tensor: Warped image or feature map. + """ + if x.size()[-2:] != flow.size()[1:3]: + raise ValueError(f'The spatial sizes of input ({x.size()[-2:]}) and ' + f'flow ({flow.size()[1:3]}) are not the same.') + _, _, h, w = x.size() + # create mesh grid + device = flow.device + grid_y, grid_x = torch.meshgrid( + torch.arange(0, h, device=device, dtype=x.dtype), + torch.arange(0, w, device=device, dtype=x.dtype)) + grid = torch.stack((grid_x, grid_y), 2) # h, w, 2 + grid.requires_grad = False + + grid_flow = grid + flow + # scale grid_flow to [-1,1] + grid_flow_x = 2.0 * grid_flow[:, :, :, 0] / max(w - 1, 1) - 1.0 + grid_flow_y = 2.0 * grid_flow[:, :, :, 1] / max(h - 1, 1) - 1.0 + grid_flow = torch.stack((grid_flow_x, grid_flow_y), dim=3) + output = F.grid_sample( + x, + grid_flow, + mode=interpolation, + padding_mode=padding_mode, + align_corners=align_corners) + return output + + +def make_layer(block, num_blocks, **kwarg): + """Make layers by stacking the same blocks. + Args: + block (nn.module): nn.module class for basic block. + num_blocks (int): number of blocks. + Returns: + nn.Sequential: Stacked blocks in nn.Sequential. + """ + layers = [] + for _ in range(num_blocks): + layers.append(block(**kwarg)) + return nn.Sequential(*layers) + + +def charbonnier_loss(pred, target, eps=1e-12): + return torch.sqrt((pred - target)**2 + eps) diff --git a/modelscope/models/cv/video_super_resolution/real_basicvsr_for_video_super_resolution.py b/modelscope/models/cv/video_super_resolution/real_basicvsr_for_video_super_resolution.py new file mode 100644 index 00000000..55d29801 --- /dev/null +++ b/modelscope/models/cv/video_super_resolution/real_basicvsr_for_video_super_resolution.py @@ -0,0 +1,96 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +from typing import Any, Dict, Union + +import torch.cuda +import torch.nn.functional as F + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.video_super_resolution.common import charbonnier_loss +from modelscope.models.cv.video_super_resolution.real_basicvsr_net import \ + RealBasicVSRNet +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() +__all__ = ['RealBasicVSRNetForVideoSR'] + + +@MODELS.register_module( + Tasks.video_super_resolution, module_name=Models.real_basicvsr) +class RealBasicVSRNetForVideoSR(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the video super-resolution model from the `model_dir` path. + + Args: + model_dir (str): the model path. + + """ + super().__init__(model_dir, *args, **kwargs) + self.model_dir = model_dir + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model = RealBasicVSRNet(**self.config.model.generator) + self.loss = charbonnier_loss + self.model = self._load_pretrained(self.model, model_path) + self.max_seq_len = 7 + + def _train_forward(self, input: Tensor, + target: Tensor) -> Dict[str, Tensor]: + preds, lqs = self.model(input, return_lqs=True) + + n, t, c, h, w = target.size() + target_clean = target.view(-1, c, h, w) + target_clean = F.interpolate( + target_clean, scale_factor=0.25, mode='area') + target_clean = target_clean.view(n, t, c, h // 4, w // 4) + + losses = dict() + losses['loss_pix'] = self.loss(preds, target) + losses['loss_clean'] = self.loss(lqs, target_clean) + return losses + + def _inference_forward(self, input: Tensor) -> Dict[str, Tensor]: + return {'output': self.model(input).clamp(0, 1)} + + def _evaluate_postprocess(self, input: Tensor, + target: Tensor) -> Dict[str, list]: + device = input.device + input = input.cpu() + torch.cuda.empty_cache() + with torch.cuda.amp.autocast(): + outputs = [] + for i in range(0, input.size(1), self.max_seq_len): + imgs = input[:, i:i + self.max_seq_len, :, :, :] + imgs = imgs.to(device) + outputs.append(self.model(imgs).float().cpu()) + preds = torch.cat(outputs, dim=1).squeeze(0) # (t, c, h, w) + torch.cuda.empty_cache() + preds = list(torch.split(preds.clamp(0, 1), 1, 0)) # [(c, h, w), ...] + targets = list(torch.split(target.clamp(0, 1), 1, + 0)) # [(t, c, h, w), ...] 
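+        # Chunked inference: the LR sequence is run through the model
+        # max_seq_len frames at a time under autocast, and every chunk is
+        # moved back to the CPU, so peak GPU memory stays bounded for long
+        # clips; 'pred' and 'target' are returned as lists of per-frame
+        # tensors for metric computation.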
+ + return {'pred': preds, 'target': targets} + + def forward(self, inputs: Dict[str, + Tensor]) -> Dict[str, Union[list, Tensor]]: + """return the result by the model + + Args: + inputs (Tensor): the preprocessed data + + Returns: + Dict[str, Tensor]: results + """ + if self.training: + return self._train_forward(**inputs) + elif 'target' in inputs: + return self._evaluate_postprocess(**inputs) + else: + return self._inference_forward(**inputs) diff --git a/modelscope/models/cv/video_super_resolution/real_basicvsr_net.py b/modelscope/models/cv/video_super_resolution/real_basicvsr_net.py new file mode 100644 index 00000000..4898d4ad --- /dev/null +++ b/modelscope/models/cv/video_super_resolution/real_basicvsr_net.py @@ -0,0 +1,89 @@ +# The implementation is adopted from mmedit, +# made publicly available under the Apache 2.0 License at +# https://github.com/open-mmlab/mmediting/blob/master/mmedit/models/backbones/sr_backbones/real_basicvsr_net.py + +import torch +import torch.nn as nn + +from modelscope.models.cv.video_super_resolution.basicvsr_net import ( + BasicVSRNet, ResidualBlocksWithInputConv) + + +class RealBasicVSRNet(nn.Module): + """RealBasicVSR network structure for real-world video super-resolution. + Support only x4 upsampling. + Paper: + Investigating Tradeoffs in Real-World Video Super-Resolution, arXiv + Args: + mid_channels (int, optional): Channel number of the intermediate + features. Default: 64. + num_propagation_blocks (int, optional): Number of residual blocks in + each propagation branch. Default: 20. + num_cleaning_blocks (int, optional): Number of residual blocks in the + image cleaning module. Default: 20. + dynamic_refine_thres (int, optional): Stop cleaning the images when + the residue is smaller than this value. Default: 255. + spynet_pretrained (str, optional): Pre-trained model path of SPyNet. + Default: None. + is_fix_cleaning (bool, optional): Whether to fix the weights of + the image cleaning module during training. Default: False. + is_sequential_cleaning (bool, optional): Whether to clean the images + sequentially. This is used to save GPU memory, but the speed is + slightly slower. Default: False. + """ + + def __init__(self, + mid_channels=64, + num_propagation_blocks=20, + num_cleaning_blocks=20, + dynamic_refine_thres=255, + spynet_pretrained=None, + is_fix_cleaning=False, + is_sequential_cleaning=False): + + super().__init__() + + self.dynamic_refine_thres = dynamic_refine_thres / 255. 
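+        # The threshold is given on the 0-255 pixel scale and normalised to
+        # [0, 1] here; forward() stops re-running the image cleaning module
+        # once the mean absolute residue falls below this value.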
+ self.is_sequential_cleaning = is_sequential_cleaning + + # image cleaning module + self.image_cleaning = nn.Sequential( + ResidualBlocksWithInputConv(3, mid_channels, num_cleaning_blocks), + nn.Conv2d(mid_channels, 3, 3, 1, 1, bias=True), + ) + + if is_fix_cleaning: # keep the weights of the cleaning module fixed + self.image_cleaning.requires_grad_(False) + + # BasicVSR + self.basicvsr = BasicVSRNet(mid_channels, num_propagation_blocks, + spynet_pretrained) + self.basicvsr.spynet.requires_grad_(False) + + def forward(self, lqs, return_lqs=False): + n, t, c, h, w = lqs.size() + + for _ in range(0, 3): # at most 3 cleaning, determined empirically + if self.is_sequential_cleaning: + residues = [] + for i in range(0, t): + residue_i = self.image_cleaning(lqs[:, i, :, :, :]) + lqs[:, i, :, :, :] += residue_i + residues.append(residue_i) + residues = torch.stack(residues, dim=1) + else: # time -> batch, then apply cleaning at once + lqs = lqs.view(-1, c, h, w) + residues = self.image_cleaning(lqs) + lqs = (lqs + residues).view(n, t, c, h, w) + + # determine whether to continue cleaning + if torch.mean(torch.abs(residues)) < self.dynamic_refine_thres: + break + + # Super-resolution (BasicVSR) + outputs = self.basicvsr(lqs) + + if return_lqs: + return outputs, lqs + else: + return outputs diff --git a/modelscope/models/cv/vision_middleware/__init__.py b/modelscope/models/cv/vision_middleware/__init__.py new file mode 100644 index 00000000..f0286149 --- /dev/null +++ b/modelscope/models/cv/vision_middleware/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .model import VisionMiddlewareModel + +else: + _import_structure = { + 'model': ['VisionMiddlewareModel'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/vision_middleware/backbone.py b/modelscope/models/cv/vision_middleware/backbone.py new file mode 100644 index 00000000..70b8188d --- /dev/null +++ b/modelscope/models/cv/vision_middleware/backbone.py @@ -0,0 +1,190 @@ +# The implementation is adopted from CLIP, +# made publicly available under the MIT License at https://github.com/openai/CLIP + +import math +import os +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from .vim import ViM + + +class LayerNorm(nn.LayerNorm): + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + self.vim_att = ViM() + self.vim_mlp = ViM() + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else 
None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor, task_name: str): + x_normed_1 = self.ln_1(x) + x = x + self.attention(x_normed_1) + x = x + self.vim_att(x_normed_1, task_name) + + x_normed_2 = self.ln_2(x) + x = x + self.mlp(x_normed_2) + x = x + self.vim_mlp(x_normed_2, task_name) + + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.ModuleList([ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor, **kwargs): + L, B, D = x.size() + features = [] + for i, blk in enumerate(self.resblocks): + x = blk(x, **kwargs) + features.append(x) + return features + + +class VisionTransformer(nn.Module): + """ + The Vision Transformer (ViT) model + Args: + - input_resolution (int): shape of input image + - patch_width (int): size of patch tokens + - width (int): feature channels + - layers (int): number of transformer layers + - heads (int): number of multi-head attention + - output_dim (int): output feature channels + """ + + def __init__(self, + input_resolution: int, + patch_size: int, + width: int, + layers: int, + heads: int, + output_dim: int = 512): + super().__init__() + self.input_resolution = input_resolution + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.patch_per_side = input_resolution // patch_size + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + self.output_dim = output_dim + + def forward(self, x: torch.Tensor, **kwargs): + x = self.conv1(x) # shape = [*, width, grid, grid] + B = x.size(0) + P = x.size(2) + + x = x.reshape(x.shape[0], x.shape[1], -1) # [*, width, grid ** 2] + x = x.permute(0, 2, 1) # [*, grid ** 2, width] + + cls_token = self.class_embedding.to(x.dtype).reshape(1, 1, -1).repeat( + B, 1, 1) + x = torch.cat([cls_token, x], + dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x_per_layer = self.transformer(x, **kwargs) + + x = x_per_layer[-1] + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_post(x[:, 0, :]) + if self.proj is not None: + x = x @ self.proj + + # outputs: [x_1, ..., x_N, last_cls_token], x_i in 2D + outputs = [] + for output in x_per_layer: + outputs.append(output[1:, :, :].permute(1, 2, + 0).reshape(B, -1, P, P)) + outputs.append(x) + return outputs + + +model_dict = { + 'vit_b16_224': + dict(input_resolution=224, patch_size=16, width=768, layers=12, heads=12), + 'vit_b32_224': + dict(input_resolution=224, patch_size=32, width=768, layers=12, heads=12), +} + + +def build_backbone(arch='vit_b16_224', pretrained=None): + """ build a ViT + ViM model + Args: + arch: name of backbone + pretrained: weights of pretrained model + """ + model_args = model_dict[arch] + model = VisionTransformer(**model_args) + model.load_state_dict(pretrained) + + return model + + +if __name__ == '__main__': + model = build_backbone() 
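Note that build_backbone above expects `pretrained` to be an in-memory state dict rather than a checkpoint path, which is easy to miss from the signature. A hedged sketch of the calling convention (illustrative only; randomly initialised weights stand in for the real CLIP checkpoint, and a full forward pass is omitted because it additionally requires the ViM adapter weights and a registered task name, see vim.py):

from modelscope.models.cv.vision_middleware.backbone import (
    VisionTransformer, build_backbone, model_dict)

# state dict standing in for the pre-trained CLIP ViT-B/16 weights
weights = VisionTransformer(**model_dict['vit_b16_224']).state_dict()
backbone = build_backbone(arch='vit_b16_224', pretrained=weights)

# For a 224x224 input, forward(x, task_name=...) returns twelve per-layer
# spatial maps of shape (B, 768, 14, 14) followed by the projected CLS
# feature of shape (B, 512).
print(isinstance(backbone, VisionTransformer))  # True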
diff --git a/modelscope/models/cv/vision_middleware/head.py b/modelscope/models/cv/vision_middleware/head.py new file mode 100644 index 00000000..39441179 --- /dev/null +++ b/modelscope/models/cv/vision_middleware/head.py @@ -0,0 +1,688 @@ +# The implementation is adopted from mmsegmentation, +# made publicly available under the Apache License, Version 2.0 at https://github.com/open-mmlab/mmsegmentation + +from abc import ABCMeta, abstractmethod + +import mmcv +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, auto_fp16, force_fp32 + + +# classification head +class LinearClassifier(nn.Module): + + def __init__(self, in_channels, num_classes): + super(LinearClassifier, self).__init__() + self.classifier = nn.Linear(in_channels, num_classes) + + def forward(self, x): + return self.classifier(x[-1]) + + +# segmentation head +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + + return F.interpolate(input, size, scale_factor, mode, align_corners) + + +class Upsample(nn.Module): + + def __init__(self, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + super(Upsample, self).__init__() + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + if not self.size: + size = [int(t * self.scale_factor) for t in x.shape[-2:]] + else: + size = self.size + return resize(x, size, None, self.mode, self.align_corners) + + +class FPN(BaseModule): + """Feature Pyramid Network. + This neck is the implementation of `Feature Pyramid Networks for Object + Detection `_. + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs + on the original feature from the backbone. If True, + it is equivalent to `add_extra_convs='on_input'`. If False, it is + equivalent to set `add_extra_convs='on_output'`. Default to True. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. 
+ Default: dict(mode='nearest'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + extra_convs_on_inputs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest'), + init_cfg=dict( + type='Xavier', layer='Conv2d', distribution='uniform')): + super(FPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + if extra_convs_on_inputs: + # For compatibility with previous release + # TODO: deprecate `extra_convs_on_inputs` + self.add_extra_convs = 'on_input' + else: + self.add_extra_convs = 'on_output' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + @auto_fp16() + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + 
self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) + + +class BaseDecodeHead(BaseModule, metaclass=ABCMeta): + """Base class for BaseDecodeHead. + Args: + in_channels (int|Sequence[int]): Input channels. + channels (int): Channels after modules, before conv_seg. + num_classes (int): Number of classes. + out_channels (int): Output channels of conv_seg. + threshold (float): Threshold for binary segmentation in the case of + `out_channels==1`. Default: None. + dropout_ratio (float): Ratio of dropout layer. Default: 0.1. + conv_cfg (dict|None): Config of conv layers. Default: None. + norm_cfg (dict|None): Config of norm layers. Default: None. + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU') + in_index (int|Sequence[int]): Input feature index. Default: -1 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + Default: None. + loss_decode (dict | Sequence[dict]): Config of decode loss. + The `loss_name` is property of corresponding loss function which + could be shown in training log. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_ce'. + e.g. dict(type='CrossEntropyLoss'), + [dict(type='CrossEntropyLoss', loss_name='loss_ce'), + dict(type='DiceLoss', loss_name='loss_dice')] + Default: dict(type='CrossEntropyLoss'). + ignore_index (int | None): The label index to be ignored. When using + masked BCE loss, ignore_index should be set to None. Default: 255. + sampler (dict|None): The config of segmentation map sampler. + Default: None. 
+ align_corners (bool): align_corners argument of F.interpolate. + Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + channels, + *, + num_classes, + out_channels=None, + threshold=None, + dropout_ratio=0.1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + in_index=-1, + input_transform=None, + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + ignore_index=255, + sampler=None, + align_corners=False, + init_cfg=dict( + type='Normal', std=0.01, override=dict(name='conv_seg'))): + super(BaseDecodeHead, self).__init__(init_cfg) + self._init_inputs(in_channels, in_index, input_transform) + self.channels = channels + self.dropout_ratio = dropout_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_index = in_index + + self.ignore_index = ignore_index + self.align_corners = align_corners + + if out_channels is None: + if num_classes == 2: + warnings.warn('For binary segmentation, we suggest using' + '`out_channels = 1` to define the output' + 'channels of segmentor, and use `threshold`' + 'to convert seg_logist into a prediction' + 'applying a threshold') + out_channels = num_classes + + if out_channels != num_classes and out_channels != 1: + raise ValueError( + 'out_channels should be equal to num_classes,' + 'except binary segmentation set out_channels == 1 and' + f'num_classes == 2, but got out_channels={out_channels}' + f'and num_classes={num_classes}') + + if out_channels == 1 and threshold is None: + threshold = 0.3 + warnings.warn('threshold is not defined for binary, and defaults' + 'to 0.3') + self.num_classes = num_classes + self.out_channels = out_channels + self.threshold = threshold + + self.conv_seg = nn.Conv2d(channels, self.out_channels, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + self.fp16_enabled = False + + def extra_repr(self): + """Extra repr.""" + s = f'input_transform={self.input_transform}, ' \ + f'ignore_index={self.ignore_index}, ' \ + f'align_corners={self.align_corners}' + return s + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. 
+ """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + Args: + inputs (list[Tensor]): List of multi-level img features. + Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + @auto_fp16() + @abstractmethod + def forward(self, inputs): + """Placeholder of forward function.""" + pass + + def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + train_cfg (dict): The training config. + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logits = self(inputs) + losses = self.losses(seg_logits, gt_semantic_seg) + return losses + + def forward_test(self, inputs, img_metas, test_cfg): + """Forward function for testing. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + test_cfg (dict): The testing config. + Returns: + Tensor: Output segmentation map. + """ + return self.forward(inputs) + + def cls_seg(self, feat): + """Classify each pixel.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + +class FPNHead(BaseDecodeHead): + """Panoptic Feature Pyramid Networks. + This head is the implementation of `Semantic FPN + `_. + Args: + feature_strides (tuple[int]): The strides for input feature maps. + stack_lateral. All strides suppose to be power of 2. The first + one is of largest resolution. 
+ """ + + def __init__(self, feature_strides, **kwargs): + super(FPNHead, self).__init__( + input_transform='multiple_select', **kwargs) + assert len(feature_strides) == len(self.in_channels) + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + + self.scale_heads = nn.ModuleList() + for i in range(len(feature_strides)): + head_length = max( + 1, + int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) + scale_head = [] + for k in range(head_length): + scale_head.append( + ConvModule( + self.in_channels[i] if k == 0 else self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + if feature_strides[i] != feature_strides[0]: + scale_head.append( + Upsample( + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + def forward(self, inputs): + + x = self._transform_inputs(inputs) + + output = self.scale_heads[0](x[0]) + for i in range(1, len(self.feature_strides)): + # non inplace + output = output + resize( + self.scale_heads[i](x[i]), + size=output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + output = self.cls_seg(output) + return output + + +class FPNSegmentor(nn.Module): + ''' + Packed Sementor Head + Args: + fpn_layer_indices: tuple of the indices of layers + neck_cfg: dict of FPN params + head_cfg: dict of FPNHead params + ''' + + def __init__(self, + fpn_layer_indices=(3, 5, 7, 11), + neck_cfg=dict( + in_channels=[768, 768, 768, 768], + out_channels=256, + num_outs=4), + head_cfg=dict( + in_channels=[256, 256, 256, 256], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=21, + norm_cfg=dict(type='BN', requires_grad=True), + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0))): + super(FPNSegmentor, self).__init__() + + self.fpn_layer_indices = fpn_layer_indices + + width = neck_cfg['in_channels'][0] + self.pre_fpn = nn.ModuleList([ + nn.Sequential( + nn.ConvTranspose2d(width, width, kernel_size=2, stride=2), + nn.BatchNorm2d(width), + nn.GELU(), + nn.ConvTranspose2d(width, width, kernel_size=2, stride=2), + ), + nn.ConvTranspose2d(width, width, kernel_size=2, stride=2), + nn.Identity(), + nn.MaxPool2d(kernel_size=2, stride=2) + ]) + + self.fpn_neck = FPN(**neck_cfg) + self.fpn_head = FPNHead(**head_cfg) + + # for vis + self.NUM_CLASSES = head_cfg['num_classes'] + state = np.random.get_state() + np.random.seed(42) + palette = np.random.randint(0, 255, size=(self.NUM_CLASSES, 3)) + np.random.set_state(state) + self.PALETTE = palette + + def show_result(self, + img, + result, + palette=None, + win_name='', + show=False, + wait_time=0, + out_file=None, + opacity=0.5): + """Draw `result` over `img`. + Args: + img (str or Tensor): The image to be displayed. + result (Tensor): The semantic segmentation results to draw over + `img`. + palette (list[list[int]]] | np.ndarray | None): The palette of + segmentation map. If None is given, random palette will be + generated. Default: None + win_name (str): The window name. + wait_time (int): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. + opacity(float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. 
+ Returns: + img (Tensor): Only if not `show` or `out_file` + """ + img = mmcv.imread(img) + img = img.copy() + # seg = result[0] + seg = result + if palette is None: + if self.PALETTE is None: + # Get random state before set seed, + # and restore random state later. + # It will prevent loss of randomness, as the palette + # may be different in each iteration if not specified. + # See: https://github.com/open-mmlab/mmdetection/issues/5844 + state = np.random.get_state() + np.random.seed(42) + # random palette + palette = np.random.randint(0, 255, size=(self.NUM_CLASSES, 3)) + np.random.set_state(state) + else: + palette = self.PALETTE + palette = np.array(palette) + assert palette.shape[0] == self.NUM_CLASSES + assert palette.shape[1] == 3 + assert len(palette.shape) == 2 + assert 0 < opacity <= 1.0 + + assert seg.shape[1] == self.NUM_CLASSES + if seg.shape[2] != img.shape[0] or seg.shape[3] != img.shape[1]: + seg = resize(seg, (img.shape[0], img.shape[1]), None, 'bilinear', + True) + seg = seg[0] + seg = torch.argmax(seg, dim=0) + + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + # convert to BGR + color_seg = color_seg[..., ::-1] + + img = img * (1 - opacity) + color_seg * opacity + img = img.astype(np.uint8) + # if out_file specified, do not show image in window + if out_file is not None: + show = False + + if show: + mmcv.imshow(img, win_name, wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + + if not (show or out_file): + warnings.warn('show==False and out_file is not specified, only ' + 'result image will be returned') + return + + def forward(self, x): + x = [x[idx] for idx in self.fpn_layer_indices] + x = [self.pre_fpn[i](x[i]) for i in range(len(x))] + + x = self.fpn_neck(x) + x = self.fpn_head(x) + + return x diff --git a/modelscope/models/cv/vision_middleware/model.py b/modelscope/models/cv/vision_middleware/model.py new file mode 100644 index 00000000..91f4a7b7 --- /dev/null +++ b/modelscope/models/cv/vision_middleware/model.py @@ -0,0 +1,168 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +import os.path as osp +from typing import Any, Dict + +import json +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from .backbone import build_backbone +from .head import FPNSegmentor, LinearClassifier + + +@MODELS.register_module( + Tasks.image_segmentation, module_name=Models.vision_middleware) +class VisionMiddlewareModel(TorchModel): + """ + The implementation of 'ViM: Vision Middleware for Unified Downstream Transferring'. + This model is dynamically initialized with the following parts: + - backbone: the upstream pre-trained backbone model (CLIP in this code) + - ViM: the zoo of middlestream trained ViM modules + - ViM-aggregation: the specific aggregation weights for downstream tasks + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """ + Initialize a ViM-based Model + Args: + model_dir: model id or path, + where model_dir/pytorch_model.pt contains: + 'meta_info': basic information of ViM, e.g. 
task_list + 'backbone_weights': parameters of backbone [upstream] + 'ViM_weights': parameters of ViM [midstream] + 'ViM_agg_weights': parameters of ViM-aggregation [downstream] + """ + super(VisionMiddlewareModel, self).__init__() + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + model_dict = torch.load(model_path, map_location='cpu') + + meta_info = model_dict['meta_info'] + self.task_list = meta_info['task_list'] + + # build up backbone + backbone_weights = model_dict['backbone_weights'] + self.backbone = build_backbone( + arch=meta_info['backbone_arch'], pretrained=backbone_weights) + self.backbone.eval() + + # build up ViM + vim_weights = model_dict['ViM_weights'] + num_layers = len(vim_weights) + for layer_i in range(num_layers): + self.backbone.transformer.resblocks[layer_i].vim_att.register_ViM( + vim_weights[layer_i]['vim_att_weights']) + self.backbone.transformer.resblocks[layer_i].vim_mlp.register_ViM( + vim_weights[layer_i]['vim_mlp_weights']) + + # build up each task-related ViM aggregation + agg_weights = model_dict['ViM_agg_weights'] + agg_algo = meta_info['ViM_agg_algo'] + for task_name in meta_info['task_list']: + for layer_i in range(num_layers): + self.backbone.transformer.resblocks[ + layer_i].vim_att.register_task( + task_name, + agg_weights[task_name][layer_i]['vim_att_agg'], + agg_algo) + self.backbone.transformer.resblocks[ + layer_i].vim_mlp.register_task( + task_name, + agg_weights[task_name][layer_i]['vim_mlp_agg'], + agg_algo) + + # build up each task-related head + self.heads = nn.ModuleDict() + self.label_maps = {} + for task_name in meta_info['task_list']: + head_weights = model_dict['head_weights'] + if task_name.startswith('cls'): + self.heads[task_name] = LinearClassifier( + in_channels=self.backbone.output_dim, + num_classes=head_weights[task_name] + ['classifier.bias'].shape[0]) + elif task_name.startswith('seg'): + self.heads[task_name] = FPNSegmentor() + else: + raise NotImplementedError( + 'Task type [{}] is not supported'.format(task_name)) + + self.heads[task_name].load_state_dict(head_weights[task_name]) + self.heads[task_name].eval() + + if task_name in meta_info['label_map'].keys(): + self.label_maps[task_name] = meta_info['label_map'][task_name] + + def __call__(self, inputs, task_name) -> Dict[str, Any]: + return self.postprocess( + self.forward(inputs, task_name), inputs, task_name) + + def forward(self, inputs, task_name): + """ + Dynamic Forward Function of ViM + Args: + x: the input images (B, 3, H, W) + task_name: specified task for forwarding + """ + if task_name not in self.task_list: + raise NotImplementedError( + f'task_name should in {self.task_list}, but got {task_name}') + + features = self.backbone(inputs, task_name=task_name) + outputs = self.heads[task_name](features) + + return outputs + + def postprocess(self, outputs, inputs, task_name): + """ + Post-process of ViM, based on task_name + Args: + inputs: batched input image (B, 3, H, W) + outputs: batched output (format based on task_name) + task_name: str, task name + """ + + _, in_channels, img_height, img_width = inputs.size() + + if 'seg' in task_name: + # outputs in shape of [1, C, H, W] + seg = F.softmax(outputs, dim=1) + seg = F.interpolate(seg, (img_height, img_width), None, 'bilinear', + True) + seg = seg[0].detach().cpu() + pred = torch.argmax(seg, dim=0) + + labels = sorted(list(set(pred.reshape(-1).numpy()))) + + masks, scores = [], [] + for label in labels: + mask = (pred == label) + masks.append(mask.long().numpy()) + scores.append(((mask.float() * 
seg[label]).sum() + / mask.float().sum()).item()) + + label_names = [ + self.label_maps[task_name][label] for label in labels + ] + + return { + OutputKeys.MASKS: masks, + OutputKeys.LABELS: label_names, + OutputKeys.SCORES: scores + } + else: + raise NotImplementedError( + 'Only segmentation task is currently supported in pipeline') + + def get_tasks(self): + """ + Get the supported tasks of current ViM model + """ + return self.task_list diff --git a/modelscope/models/cv/vision_middleware/vim.py b/modelscope/models/cv/vision_middleware/vim.py new file mode 100644 index 00000000..4ab84c08 --- /dev/null +++ b/modelscope/models/cv/vision_middleware/vim.py @@ -0,0 +1,187 @@ +# Part of this code is adopted from PETL-ViT, +# made publicly available under the MIT License at https://github.com/JieShibo/PETL-ViT + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +def _agg_conv1d(weight_list, bias_list, agg, x): + """ + weight list: list of conv1d weight ([out, in] * a) + bias list: list of conv1d bias ([out] * a) + agg: aggreagtion weights (a) + x: input tensor (b, in, n) + + return output in (b, n, out) + """ + + weight_list = torch.cat([w.unsqueeze(0) for w in weight_list], + dim=0) # n_ada, out, in + weight = torch.sum( + weight_list * rearrange(agg, 'a -> a 1 1'), + dim=0).unsqueeze(2) # out, in, 1 + + bias_list = torch.cat([w.unsqueeze(0) for w in bias_list], + dim=0) # n_ada, out + bias = torch.sum(bias_list * rearrange(agg, 'a -> a 1'), dim=0) # out + + x = F.conv1d(x, weight=weight, bias=bias) + + return x + + +def _agg_conv2d(weight_list, bias_list, agg, x): + """ + weight list: list of conv2d weight ([out, in, m, n] * a) + bias list: list of conv2d bias ([out] * a) + agg: aggregation weights (a) + x: input tensor (b, in, p, q) + + return output in (b, out, p, q) + """ + + weight_list = torch.cat([w.unsqueeze(0) for w in weight_list], + dim=0) # n_ada, out, in, m, n + weight = torch.sum( + weight_list * rearrange(agg, 'a -> a 1 1 1 1'), dim=0) # out, in, m, n + + bias_list = torch.cat([w.unsqueeze(0) for w in bias_list], + dim=0) # n_ada, out + bias = torch.sum(bias_list * rearrange(agg, 'a -> a 1'), dim=0) # out + + x = F.conv2d( + x, weight=weight, bias=bias, stride=1, padding=1) # 1 (b out) p q + + return x + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ViM(nn.Module): + + def __init__(self): + super().__init__() + + self.act = QuickGELU() + + self.adapter_conv_weight = nn.ParameterList() + self.adapter_conv_bias = nn.ParameterList() + + self.adapter_up_weight = nn.ParameterList() + self.adapter_up_bias = nn.ParameterList() + + self.adapter_down_weight = nn.ParameterList() + self.adapter_down_bias = nn.ParameterList() + + # agg related + self.num_modules = 0 + self.task_list = [] + self.agg_weights = {} + self.agg_algos = {} + + def register_ViM(self, vim_list): + self.num_modules = len(vim_list) + for state_dict in vim_list: + self.adapter_conv_weight.append( + nn.Parameter(state_dict['adapter_conv.weight'])) + self.adapter_conv_bias.append( + nn.Parameter(state_dict['adapter_conv.bias'])) + + self.adapter_up_weight.append( + nn.Parameter(state_dict['adapter_up.weight'])) + self.adapter_up_bias.append( + nn.Parameter(state_dict['adapter_up.bias'])) + + self.adapter_down_weight.append( + nn.Parameter(state_dict['adapter_down.weight'])) + self.adapter_down_bias.append( + nn.Parameter(state_dict['adapter_down.bias'])) + + def 
register_task(self, task_name, agg_weights, agg_algo): + assert agg_weights.shape[0] == self.num_modules + + self.task_list.append(task_name) + self.agg_weights[task_name] = agg_weights + self.agg_algos[task_name] = agg_algo + + def forward(self, x, task_name): + assert task_name in self.task_list + + agg_algo = self.agg_algos[task_name] + if agg_algo == 'Ens-MoE': + return self.forward_ens_moe(x, self.agg_weights[task_name]) + else: + raise NotImplementedError( + 'Aggregation algorithm [{}] is currently not supported!'. + format(agg_algo)) + + def forward_ens_moe(self, x, agg): + + logits = agg + k = agg.shape[0] # MoE-full (k=N) + + top_logits, top_indices = logits.topk( + min(k + 1, logits.size(0)), dim=0) + top_k_logits = top_logits[:k] + top_k_indices = top_indices[:k] + top_k_gates = F.softmax(top_k_logits, dim=0) + + zeros = torch.zeros_like(logits, requires_grad=True) + gates = zeros.scatter(0, top_k_indices, top_k_gates) + + N, B, C = x.shape + x = x.permute(1, 2, 0) + output = None + for i in range(self.num_modules): + + if gates[i] > 0: + + x_down = F.conv1d( + x, + weight=self.adapter_down_weight[i].unsqueeze(2), + bias=self.adapter_down_bias[i]) # equivalent to 1 * 1 Conv + x_down = self.act(x_down) + + num_patch_side = int(math.sqrt(x_down.size(2) - 1)) + x_patch = x_down[:, :, + 1:].reshape(B, -1, num_patch_side, + num_patch_side) # b, in, p, p + x_patch = F.conv2d( + x_patch, + weight=self.adapter_conv_weight[i], + bias=self.adapter_conv_bias[i], + stride=1, + padding=1) + x_patch = rearrange(x_patch, 'b o p q -> b o (p q)') + + x_cls = x_down[:, :, :1].reshape(B, -1, 1, 1) + x_cls = F.conv2d( + x_cls, + weight=self.adapter_conv_weight[i], + bias=self.adapter_conv_bias[i], + stride=1, + padding=1) + x_cls = rearrange(x_cls, 'b o 1 1 -> b o 1') + + x_down = torch.cat([x_cls, x_patch], dim=2) + + x_down = self.act(x_down) + x_up = F.conv1d( + x_down, + weight=self.adapter_up_weight[i].unsqueeze(2), + bias=self.adapter_up_bias[i]) # equivalent to 1 * 1 Conv + + if output is None: + output = x_up * gates[i] + else: + output += x_up * gates[i] + + return output.permute(2, 0, 1) diff --git a/modelscope/models/cv/vop_retrieval/__init__.py b/modelscope/models/cv/vop_retrieval/__init__.py new file mode 100644 index 00000000..5b3e762c --- /dev/null +++ b/modelscope/models/cv/vop_retrieval/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
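The aggregation helpers at the top of vim.py above blend the per-adapter weights into a single convolution before applying it. A small sketch of _agg_conv1d (illustrative only, not part of this patch): three adapters are mixed with scalar aggregation weights and applied as a 1x1 Conv1d; note that the result comes back channel-first, i.e. (b, out, n).

import torch

from modelscope.models.cv.vision_middleware.vim import _agg_conv1d

weights = [torch.randn(8, 4) for _ in range(3)]   # three adapters, out=8, in=4
biases = [torch.randn(8) for _ in range(3)]
agg = torch.tensor([0.5, 0.3, 0.2])               # per-adapter mixing weights
x = torch.randn(2, 4, 10)                         # (b, in, n) token features

y = _agg_conv1d(weights, biases, agg, x)
print(y.shape)   # torch.Size([2, 8, 10]) -> (b, out, n)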
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .basic_utils import set_seed, get_state_dict, load_data, init_transform_dict, load_frames_from_video + from .model import VoP + from .tokenization_clip import LengthAdaptiveTokenizer +else: + _import_structure = { + 'basic_utils': [ + 'set_seed', 'get_state_dict', 'load_data', 'init_transform_dict', + 'load_frames_from_video' + ], + 'model': ['VoP'], + 'tokenization_clip': ['LengthAdaptiveTokenizer'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/vop_retrieval/backbone.py b/modelscope/models/cv/vop_retrieval/backbone.py new file mode 100644 index 00000000..a2b26e07 --- /dev/null +++ b/modelscope/models/cv/vop_retrieval/backbone.py @@ -0,0 +1,354 @@ +# The implementation here is modified based on HuggingFace, originally Apache 2.0 License +# and publicly avaialbe at https://github.com/huggingface/transformers +# Copyright 2018 The HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +import hashlib +import os +import urllib +import warnings +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from tqdm import tqdm + +from modelscope.models.base.base_torch_model import TorchModel + + +class LayerNorm(nn.LayerNorm): + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(TorchModel): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(TorchModel): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(TorchModel): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisualTransformer(TorchModel): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = 
nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) + x = x.reshape(x.shape[0], x.shape[1], -1) + x = x.permute(0, 2, 1) + x_1 = self.class_embedding.to(x.dtype) + x_2 = torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x_1 = x_1 + x_2 + x = torch.cat([x_1, x], dim=1) + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) + x = self.transformer(x) + x = x.permute(1, 0, 2) + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(TorchModel): + + def __init__(self, embed_dim: int, image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, vision_patch_size: int, + context_length: int, vocab_size: int, transformer_width: int, + transformer_heads: int, transformer_layers: int): + super().__init__() + + self.context_length = context_length + + vision_heads = vision_width // 64 + self.visual = VisualTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + proj_std = (self.transformer.width**-0.5) * ( + (2 * self.transformer.layers)**-0.5) + attn_std = self.transformer.width**-0.5 + fc_std = (2 * self.transformer.width)**-0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_( + self.text_projection, std=self.transformer.width**-0.5) + + def build_attention_mask(self): + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text, return_all_tokens=False): + x = self.token_embedding(text).type(self.dtype) + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) + x = self.transformer(x) + x = x.permute(1, 0, 2) + x = self.ln_final(x).type(self.dtype) + + if return_all_tokens: + return x @ self.text_projection + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = 
self.encode_image(image) + text_features = self.encode_text(text) + image_features = image_features / image_features.norm( + dim=-1, keepdim=True) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + return logits_per_image, logits_per_text + + +def build_model(state_dict: dict): + vit = 'visual.proj' in state_dict + + if vit: + vision_width = state_dict['visual.conv1.weight'].shape[0] + vision_layers = len([ + k for k in state_dict.keys() + if k.startswith('visual.') and k.endswith('.attn.in_proj_weight') + ]) + vision_patch_size = state_dict['visual.conv1.weight'].shape[-1] + grid_size = round( + (state_dict['visual.positional_embedding'].shape[0] - 1)**0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [ + len( + set( + k.split('.')[2] for k in state_dict + if k.startswith(f'visual.layer{b}'))) + for b in [1, 2, 3, 4] + ] + vision_layers = tuple(counts) + vision_width = state_dict['visual.layer1.0.conv1.weight'].shape[0] + output_width = round( + (state_dict['visual.attnpool.positional_embedding'].shape[0] + - 1)**0.5) + vision_patch_size = None + assert output_width**2 + 1 == state_dict[ + 'visual.attnpool.positional_embedding'].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict['text_projection'].shape[1] + context_length = state_dict['positional_embedding'].shape[0] + vocab_size = state_dict['token_embedding.weight'].shape[0] + transformer_width = state_dict['ln_final.weight'].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len( + set( + k.split('.')[2] for k in state_dict + if k.startswith('transformer.resblocks'))) + + model = CLIP(embed_dim, image_resolution, vision_layers, vision_width, + vision_patch_size, context_length, vocab_size, + transformer_width, transformer_heads, transformer_layers) + + for key in ['input_resolution', 'context_length', 'vocab_size']: + if key in state_dict: + del state_dict[key] + + model.load_state_dict(state_dict) + return model.eval() + + +def load_clip(name: str, + device: Union[str, torch.device] = 'cuda' + if torch.cuda.is_available() else 'cpu', + jit=True): + jit = False + model_path = name + try: + model = torch.jit.load( + model_path, map_location=device if jit else 'cpu').eval() + state_dict = None + except RuntimeError: + if jit: + warnings.warn( + f'File {model_path} is not a JIT archive. 
Loading as a state dict instead' + ) + jit = False + state_dict = torch.load(model_path, map_location='cpu') + + if not jit: + model = build_model(state_dict or model.state_dict()).to(device) + if str(device) == 'cpu': + model.float() + return model + + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [ + n for n in device_holder.graph.findAllNodes('prim::Constant') + if 'Device' in repr(n) + ][-1] + + def patch_device(module): + graphs = [module.graph] if hasattr(module, 'graph') else [] + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('prim::Constant'): + if 'value' in node.attributeNames() and str( + node['value']).startswith('cuda'): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + if str(device) == 'cpu': + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode('aten::to').inputs())[1] + float_node = float_input.node() + + def patch_float(module): + graphs = [module.graph] if hasattr(module, 'graph') else [] + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('aten::to'): + inputs = list(node.inputs()) + for i in [1, 2]: + if inputs[i].node()['value'] == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model diff --git a/modelscope/models/cv/vop_retrieval/basic_utils.py b/modelscope/models/cv/vop_retrieval/basic_utils.py new file mode 100644 index 00000000..c38482fc --- /dev/null +++ b/modelscope/models/cv/vop_retrieval/basic_utils.py @@ -0,0 +1,170 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +import os +import pickle +import random +import shutil +import zipfile +from collections import OrderedDict + +import cv2 +import numpy as np +import torch +import ujson as json +from PIL import Image +from torchvision import transforms + + +def init_transform_dict(input_res=224): + """ + The implementation of transforms functions. + The default image resolution is 224. + The normalize parameter follows the mainstream setting. + """ + tsfm_dict = { + 'clip_test': + transforms.Compose([ + transforms.Resize(input_res, interpolation=Image.BICUBIC), + transforms.CenterCrop(input_res), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]), + 'clip_train': + transforms.Compose([ + transforms.RandomResizedCrop(input_res, scale=(0.5, 1.0)), + transforms.RandomHorizontalFlip(), + transforms.ColorJitter(brightness=0, saturation=0, hue=0), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + } + return tsfm_dict + + +def load_data(feature_path, mydevice): + """ + Loading dataset from 'feature_path' as a retrieval docs. + The default dataset is MSRVTT-9K. 
+ + Args: + feature_path: 'VoP_msrvtt9k_features.pkl' + mydevice: device(type='cuda', index=0) + + Returns: + [text_embeds, vid_embeds_pooled, vid_ids, texts] + """ + feature_content = torch.load(feature_path) + text_embeds = feature_content['text_embeds'].to(device=mydevice) + vid_embeds_pooled = feature_content['vid_embeds'].to(device=mydevice) + vid_ids = feature_content['vid_ids'] + texts = feature_content['texts'] + return [text_embeds, vid_embeds_pooled, vid_ids, texts] + + +def load_json(filename): + """ + Load json files. + """ + with open(filename, 'r') as f: + return json.load(f) + + +def set_seed(seed): + """ + Set random seed. + """ + if seed >= 0: + torch.manual_seed(seed) + np.random.seed(seed) + torch.cuda.manual_seed_all(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def get_state_dict(checkpoint_path): + """ + Load pre-train parameters for VoP. + """ + checkpoint = torch.load(checkpoint_path) + state_dict = checkpoint['state_dict'] + + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + new_state_dict[k.replace('module.', '')] = v + state_dict = new_state_dict + + return state_dict + + +def get_valid_frames(cap, num_frames, vlen, sample='rand'): + """ + Get indexes of sampled frames. + + Args: + cap: cv2.VideoCapture + num_frames: int - number of frames to sample + vlen: video length, int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 325 + sample: 'rand' | 'uniform' how to sample + + Returns: + frames: torch.tensor of stacked sampled video frames + of dim (num_frames, C, H, W) + frame_idxs: list(int) indices of where the frames where sampled + """ + acc_samples = min(num_frames, vlen) + intervals = np.linspace( + start=0, stop=vlen, num=acc_samples + 1).astype(int) + ranges = [] + for idx, interv in enumerate(intervals[:-1]): + ranges.append((interv, intervals[idx + 1] - 1)) + if sample == 'rand': + frame_idxs = [random.choice(range(x[0], x[1])) for x in ranges] + else: + frame_idxs = [(x[0] + x[1]) // 2 for x in ranges] + + frames = [] + for index in frame_idxs: + cap.set(cv2.CAP_PROP_POS_FRAMES, index) + ret, frame = cap.read() + if not ret: + n_tries = 5 + for _ in range(n_tries): + ret, frame = cap.read() + if ret: + break + if not ret: + return None, None + + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = torch.from_numpy(frame) + frame = frame.permute(2, 0, 1) + frames.append(frame) + + while len(frames) < num_frames: + frames.append(frames[-1].clone()) + + return frames, frame_idxs + + +def load_frames_from_video(video_path, num_frames, sample='rand'): + """ + Get indexes of sampled frames. + + Args: + video_path: the local video path + num_frames: Frame number, 12 frames for each video + sample: 'rand' | 'uniform' how to sample + + Returns: + frames: torch.tensor of stacked sampled video frames + of dim (num_frames, C, H, W) + frame_idxs: list(int) indices of where the frames where sampled + """ + cap = cv2.VideoCapture(video_path) + assert (cap.isOpened()), video_path + vlen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + frames, frame_idxs = get_valid_frames(cap, num_frames, vlen, sample) + frames = torch.stack(frames).float() / 255 + cap.release() + return frames, frame_idxs diff --git a/modelscope/models/cv/vop_retrieval/model.py b/modelscope/models/cv/vop_retrieval/model.py new file mode 100644 index 00000000..d89fcb7f --- /dev/null +++ b/modelscope/models/cv/vop_retrieval/model.py @@ -0,0 +1,378 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. 
All rights reserved. + +import os +import os.path as osp + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .backbone import load_clip +from .basic_utils import get_state_dict, set_seed + + +@MODELS.register_module( + Tasks.vop_retrieval, module_name=Models.vop_retrieval_model) +class VoP(TorchModel): + """ + The implementation of 'VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval'. + This model is dynamically initialized with the following parts: + - clip: the upstream pre-trained backbone model (CLIP in this code) + - pool_frames: the frames pooling method + - visual_prompt_learner: visual prompt + - ImageEncoder: get image encoder + - TextPromptLearner: text prompt + - TextEncoder: get text encoder + """ + + def __init__(self, model_dir: str, *args, **kwargs): + """ + Initialize a VoP Model + + Args: + model_dir: model id or path, + """ + super(VoP, self).__init__() + model_path = osp.join(model_dir, 'VoP_msrvtt9k.pth') + clip_arch = osp.join(model_dir, 'ViT-B-32.pt') + config_path = osp.join(model_dir, ModelFile.CONFIGURATION) + + self.config = Config.from_file(config_path).hyperparam + self.clip = load_clip(name=clip_arch) + + self.config.vpt_layers = list( + range(self.clip.visual.transformer.layers)) + self.config.tpt_layers = list(range(self.clip.transformer.layers)) + + self.pool_frames = BaselinePooling(self.config.pooling_type, + self.config) + + self.visual_prompt_learner = VisualPromptLearner( + self.clip, self.config) + self.image_encoder = ImageEncoder(self.clip, self.config) + + self.text_prompt_learner = TextPromptLearner(self.clip, self.config) + self.text_encoder = TextEncoder(self.clip, self.config) + + # load param from pre-train model + self.load_state_dict(get_state_dict(model_path)) + self.eval() + + # set seed + os.environ['TOKENIZERS_PARALLELISM'] = 'false' + set_seed(self.config.seed) + + def get_video_features(self, videos, return_all_frames=False): + """ + Get video Features + + Args: + videos: the dim is [1, 12, 3, 224, 224] + return_all_frames: default False + """ + batch_size = videos.shape[0] + video_data = videos.reshape(-1, 3, self.config.input_res, + self.config.input_res) + + visual_prompts = self.visual_prompt_learner() + video_features = self.image_encoder(visual_prompts, video_data) + + video_features = video_features / video_features.norm( + dim=-1, keepdim=True) + video_features = video_features.reshape(batch_size, + self.config.num_frames, -1) + + video_features_pooled = self.pool_frames(None, video_features) + + if return_all_frames: + return video_features, video_features_pooled + + return video_features_pooled + + def get_text_features(self, text_data): + """ + Get Text Features + + Args: + text_data: the dim is [1, 69] + """ + text_prompts = self.text_prompt_learner() + text_features = self.text_encoder(text_prompts, text_data) + + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + return text_features + + def forward(self, data, return_all_frames=False): + """ + Dynamic Forward Function of VoP + + Args: + data: the input data + return_all_frames: default False + """ + batch_size = data['video'].shape[0] + text_data = data['text'] + video_data = data['video'] + video_data = video_data.reshape(-1, 3, self.config.input_res, + 
self.config.input_res) + + visual_prompts = self.visual_prompt_learner() + video_features = self.image_encoder(visual_prompts, video_data) + + text_prompts = self.text_prompt_learner() + text_features = self.text_encoder(text_prompts, text_data) + + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + video_features = video_features / video_features.norm( + dim=-1, keepdim=True) + video_features = video_features.reshape(batch_size, + self.config.num_frames, -1) + + video_features_pooled = self.pool_frames(text_features, video_features) + + if return_all_frames: + return text_features, video_features, video_features_pooled + + return text_features, video_features_pooled + + +class BaselinePooling(TorchModel): + """ + Redefined Pooling Function + """ + + def __init__(self, pooling_type, config): + super(BaselinePooling, self).__init__() + if pooling_type == 'avg': + self.pooling_func = self._avg_pooling + else: + raise NotImplementedError + + def _avg_pooling(self, text_embeds, video_embeds): + """ + Pooling mean of frames + + Args: + text_embeds: the input text embedding which is None here. + video_embeds: the input video embedding with [1, 12, 512]. + + Returns: + video_embeds_pooled: num_vids x embed_dim + """ + video_embeds_pooled = video_embeds.mean(dim=1) + return video_embeds_pooled + + def forward(self, text_embeds, video_embeds): + return self.pooling_func(text_embeds, video_embeds) + + +class VisualPromptLearner(TorchModel): + """ + The implementation of visual prompt. + This module is used to define the learnable prompt parameters: + the number of tokens is 8, + the prompt dimension is 768, + and the initialization weight std used is 0.02. + """ + + def __init__(self, clip_model, config): + super(VisualPromptLearner, self).__init__() + + vp_token_num = config.vp_token_num + vp_dim = clip_model.visual.ln_post.weight.shape[0] + dtype = clip_model.dtype + + visual_prompts = torch.empty( + len(config.vpt_layers), 1, vp_token_num, vp_dim, dtype=dtype) + nn.init.normal_(visual_prompts, std=0.02) + self.visual_prompts = nn.Parameter(visual_prompts) + + def forward(self): + vp = self.visual_prompts + return vp + + +class TextPromptLearner(TorchModel): + """ + The implementation of visual prompt. + This module is used to define the learnable prompt parameters: + the number of tokens is 4, + the prompt dimension is 512, + and the initialization weight std used is 0.02. + """ + + def __init__(self, clip_model, config): + super(TextPromptLearner, self).__init__() + + tp_prefix_token_num = config.tp_prefix_token_num + tp_suffix_token_num = config.tp_suffix_token_num + assert tp_prefix_token_num >= 0 and tp_suffix_token_num >= 0 + tp_dim = clip_model.ln_final.weight.shape[0] + dtype = clip_model.dtype + + text_prompts = torch.empty( + len(config.tpt_layers), + tp_prefix_token_num + tp_suffix_token_num, + tp_dim, + dtype=dtype) + nn.init.normal_(text_prompts, std=0.02) + + self.text_prompts = nn.Parameter(text_prompts) + self.tp_prefix_token_num = tp_prefix_token_num + self.tp_suffix_token_num = tp_suffix_token_num + + def forward(self): + return (self.text_prompts[:, :self.tp_prefix_token_num, :], + self.text_prompts[:, self.tp_prefix_token_num:, :]) + + +class ImageEncoder(TorchModel): + """ + The implementation of image encoder. + This module is used to obtain the features of each frame of the video. 
+ """ + + def __init__(self, clip_model, config): + super(ImageEncoder, self).__init__() + + self.config = config + self.vpt_layers = config.vpt_layers + self.vp_token_num = config.vp_token_num + self.num_frames = config.num_frames + + self.conv1 = clip_model.visual.conv1 + self.class_embedding = clip_model.visual.class_embedding + self.positional_embedding = clip_model.visual.positional_embedding + self.ln_pre = clip_model.visual.ln_pre + + self.transformer = clip_model.visual.transformer + + self.ln_post = clip_model.visual.ln_post + self.proj = clip_model.visual.proj + + def forward(self, visual_prompts, x): + """ + The forward function of image encoder. + + Args: + visual_prompts: the visual prompt, dim is [12, 1, 8, 768] + x: the input data, dim is [12, 3, 224, 224] + + Returns: + x: the output data, dim is [12, 512] + """ + batch_size = x.shape[0] + x = self.conv1(x) + x = x.reshape(batch_size, x.shape[1], -1) + x = x.permute(0, 2, 1) + x_1 = self.class_embedding.to(x.dtype) + x_2 = torch.zeros( + batch_size, 1, x.shape[-1], dtype=x.dtype, device=x.device) + x_1 = x_1 + x_2 + x = torch.cat([x_1, x], dim=1) + x = x + self.positional_embedding.to(x.dtype) + + for i_layer in range(self.transformer.layers): + if i_layer in self.vpt_layers: + i_prompt = self.vpt_layers.index(i_layer) + cur_layer_vp = visual_prompts[i_prompt, :, :, :].repeat( + batch_size, 1, 1) + x = torch.cat([x[:, :1, :], cur_layer_vp, x[:, 1:, :]], dim=1) + + if i_layer == 0: + x = self.ln_pre(x) + x = x.permute(1, 0, 2) + x = self.transformer.resblocks[i_layer](x) + x = x.permute(1, 0, 2) + + if i_layer + 1 in self.vpt_layers: + x = torch.cat([x[:, :1, :], x[:, 1 + self.vp_token_num:, :]], + dim=1) + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class TextEncoder(TorchModel): + """ + The implementation of text encoder. + This module is used to obtain the features of each word of the sentence. + """ + + def __init__(self, clip_model, config): + super(TextEncoder, self).__init__() + self.transformer = clip_model.transformer + self.token_embedding = clip_model.token_embedding + self.positional_embedding = clip_model.positional_embedding + self.ln_final = clip_model.ln_final + self.text_projection = clip_model.text_projection + self.dtype = clip_model.dtype + + self.tpt_layers = config.tpt_layers + assert 0 in self.tpt_layers + self.tp_prefix_token_num = config.tp_prefix_token_num + self.tp_suffix_token_num = config.tp_suffix_token_num + self.tp_token_num = config.tp_prefix_token_num + config.tp_suffix_token_num + + def forward(self, text_prompts, text): + """ + The forward function of text encoder. 
+ + Args: + text_prompts: the text prompt, dim is 2 x [12, 4, 512] + text: the input data, dim is [1, 69] + + Returns: + x: the output data, dim is [1, 512] + """ + x = self.token_embedding(text).type(self.dtype) + batch_size = x.shape[0] + prompt_prefix, prompt_suffix = text_prompts + + for i_layer in range(self.transformer.layers): + if i_layer in self.tpt_layers: + i_prompt = self.tpt_layers.index(i_layer) + if self.tp_prefix_token_num > 0: + cur_layer_tp_prefix = prompt_prefix[i_prompt:i_prompt + + 1, :, :].expand( + batch_size, -1, -1) + x = torch.cat( + [x[:, :1, :], cur_layer_tp_prefix, x[:, 1:, :]], dim=1) + if self.tp_suffix_token_num > 0: + cur_layer_tp_suffix = prompt_suffix[i_prompt:i_prompt + + 1, :, :].expand( + batch_size, -1, -1) + x = torch.cat( + [x[:, :-1, :], cur_layer_tp_suffix, x[:, -1:, :]], + dim=1) + + if i_layer == 0: + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) + x = self.transformer.resblocks[i_layer](x) + x = x.permute(1, 0, 2) + + if i_layer + 1 in self.tpt_layers: + temp_1 = x[:, :1, :] + temp_2 = x[:, 1 + self.tp_prefix_token_num:-1 + - self.tp_suffix_token_num, :] + temp_3 = x[:, -1:, :] + temp = torch.cat([temp_1, temp_2, temp_3], dim=1) + x = temp + + x = self.ln_final(x).type(self.dtype) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1) + self.tp_token_num] @ self.text_projection + + return x diff --git a/modelscope/models/cv/vop_retrieval/tokenization_clip.py b/modelscope/models/cv/vop_retrieval/tokenization_clip.py new file mode 100644 index 00000000..07bad10c --- /dev/null +++ b/modelscope/models/cv/vop_retrieval/tokenization_clip.py @@ -0,0 +1,159 @@ +# The implementation here is modified based on HuggingFace, originally Apache 2.0 License +# and publicly avaialbe at https://github.com/huggingface/transformers +# Copyright 2018 The HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
+ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re +import torch + + +@lru_cache() +def bytes_to_unicode(): + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class LengthAdaptiveTokenizer(object): + + def __init__(self, config, bpe_path): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = bpe_path + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + self.vocab = self.encoder + + self.tp_token_num = config.tp_prefix_token_num + config.tp_suffix_token_num + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except ValueError: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def __call__(self, + texts, + return_tensors='pt', + padding=True, + truncation=True): + context_length = 77 - self.tp_token_num + if isinstance(texts, str): + texts = [texts] + + sot_token = self.encoder['<|startoftext|>'] + eot_token = self.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + self.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > 
context_length: + new_tokens = [sot_token + ] + tokens[1:context_length - 1] + [eot_token] + result[i, :len(tokens)] = torch.tensor(new_tokens) + else: + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py index 0053da43..4edf6212 100644 --- a/modelscope/models/multi_modal/__init__.py +++ b/modelscope/models/multi_modal/__init__.py @@ -10,12 +10,13 @@ if TYPE_CHECKING: from .team import TEAMForMultiModalSimilarity from .diffusion import DiffusionForTextToImageSynthesis from .mmr import VideoCLIPForMultiModalEmbedding - from .mplug_for_all_tasks import MPlugForAllTasks + from .mplug_for_all_tasks import MPlugForAllTasks, HiTeAForAllTasks from .ofa_for_all_tasks import OfaForAllTasks from .ofa_for_text_to_image_synthesis_model import \ OfaForTextToImageSynthesis from .multi_stage_diffusion import \ MultiStageDiffusionForTextToImageSynthesis + from .vldoc import VLDocForDocVLEmbedding else: _import_structure = { @@ -24,12 +25,13 @@ else: 'gemm': ['GEMMForMultiModalEmbedding'], 'team': ['TEAMForMultiModalSimilarity'], 'mmr': ['VideoCLIPForMultiModalEmbedding'], - 'mplug_for_all_tasks': ['MPlugForAllTasks'], + 'mplug_for_all_tasks': ['MPlugForAllTasks', 'HiTeAForAllTasks'], 'ofa_for_all_tasks': ['OfaForAllTasks'], 'ofa_for_text_to_image_synthesis_model': ['OfaForTextToImageSynthesis'], 'multi_stage_diffusion': - ['MultiStageDiffusionForTextToImageSynthesis'] + ['MultiStageDiffusionForTextToImageSynthesis'], + 'vldoc': ['VLDocForDocVLEmbedding'], } import sys diff --git a/modelscope/models/multi_modal/diffusion/model.py b/modelscope/models/multi_modal/diffusion/model.py index 32956324..d979cc7f 100644 --- a/modelscope/models/multi_modal/diffusion/model.py +++ b/modelscope/models/multi_modal/diffusion/model.py @@ -23,6 +23,7 @@ from modelscope.models.multi_modal.diffusion.unet_upsampler_256 import \ from modelscope.models.multi_modal.diffusion.unet_upsampler_1024 import \ SuperResUNet1024 from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.device import create_device from modelscope.utils.logger import get_logger logger = get_logger() @@ -113,22 +114,17 @@ class DiffusionModel(nn.Module): Tasks.text_to_image_synthesis, module_name=Models.diffusion) class DiffusionForTextToImageSynthesis(Model): - def __init__(self, model_dir, device_id=-1): - super().__init__(model_dir=model_dir, device_id=device_id) + def __init__(self, model_dir, device='gpu'): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + super().__init__(model_dir=model_dir, device=device) diffusion_model = DiffusionModel(model_dir=model_dir) pretrained_params = torch.load( osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), 'cpu') diffusion_model.load_state_dict(pretrained_params) - diffusion_model.eval() + diffusion_model.eval().to() - self.device_id = device_id - if self.device_id >= 0: - self.device = torch.device(f'cuda:{self.device_id}') - diffusion_model.to('cuda:{}'.format(self.device_id)) - logger.info('Use GPU: {}'.format(self.device_id)) - else: - self.device = torch.device('cpu') - logger.info('Use CPU for inference') + self.device = create_device(device) + diffusion_model.to(self.device) # modules self.text_encoder = diffusion_model.text_encoder diff --git a/modelscope/models/multi_modal/mgeo/__init__.py b/modelscope/models/multi_modal/mgeo/__init__.py new file mode 100644 index 00000000..c311e8a6 --- /dev/null +++ b/modelscope/models/multi_modal/mgeo/__init__.py @@ -0,0 +1,41 @@ 
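`LengthAdaptiveTokenizer.__call__` above reserves room for the prompt tokens by shrinking CLIP's usual 77-token context to `77 - tp_token_num`, and re-appends `<|endoftext|>` when a caption has to be truncated. A small sketch of that padding/truncation step, assuming a context of 73 and the standard CLIP start/end token ids (illustrative only):

```python
import torch


def pad_or_truncate(tokens, sot, eot, context_length):
    # `tokens` already starts with sot and ends with eot, as in __call__ above.
    if len(tokens) > context_length:
        tokens = [sot] + tokens[1:context_length - 1] + [eot]
    result = torch.zeros(context_length, dtype=torch.long)
    result[:len(tokens)] = torch.tensor(tokens)
    return result


row = pad_or_truncate([49406] + list(range(100)) + [49407],
                      sot=49406, eot=49407, context_length=73)
# `row` has exactly 73 entries and still ends with the end-of-text token.
```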
+# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .backbone import (MGeo, MGeoPreTrainedModel) + from .text_classification import MGeoForSequenceClassification + from .token_classification import MGeoForTokenClassification + from .text_ranking import MGeoForTextRanking +else: + _import_structure = { + 'backbone': ['MGeo', 'MGeoPreTrainedModel'], + 'text_classification': ['MGeoForSequenceClassification'], + 'token_classification': ['MGeoForTokenClassification'], + 'text_ranking': ['MGeoForTextRanking'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/multi_modal/mgeo/backbone.py b/modelscope/models/multi_modal/mgeo/backbone.py new file mode 100644 index 00000000..f5e4fb6d --- /dev/null +++ b/modelscope/models/multi_modal/mgeo/backbone.py @@ -0,0 +1,2503 @@ +import math +import os +import random +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from torch import Tensor, device, dtype, nn +from torch.nn import CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, NextSentencePredictorOutput, + QuestionAnsweringModelOutput, SequenceClassifierOutput, + TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) +from transformers.models.bert.configuration_bert import BertConfig +from transformers.utils import logging + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.nlp.utils import parse_labels_in_order + +transformers.logging.set_verbosity_error() + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = 'BertConfig' +_TOKENIZER_FOR_DOC = 'BertTokenizer' + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + 'Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. 
Please see ' + 'https://www.tensorflow.org/install/ for installation instructions.' + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info('Converting TensorFlow checkpoint from {}'.format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info('Loading TF weight {} with shape {}'.format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to + # calculated m and v which are not required for using pretrained model + if any(n in [ + 'adam_v', 'adam_m', 'AdamWeightDecayOptimizer', + 'AdamWeightDecayOptimizer_1', 'global_step' + ] for n in name): + logger.info('Skipping {}'.format('/'.join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + scope_names = re.split(r'_(\d+)', m_name) + else: + scope_names = [m_name] + if scope_names[0] == 'kernel' or scope_names[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'output_bias' or scope_names[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif scope_names[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif scope_names[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info('Skipping {}'.format('/'.join(name))) + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f'Pointer shape {pointer.shape} and array shape {array.shape} mismatched' + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info('Initialize PyTorch weight {}'.format(name)) + pointer.data = torch.from_numpy(array) + return model + + +class GisEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding( + config.type_vocab_size, config.hidden_size, padding_idx=0) + self.rel_type_embeddings = nn.Embedding( + config.rel_type_vocab_size, config.hidden_size, padding_idx=0) + self.absolute_x_embeddings = nn.Embedding( + config.absolute_x_vocab_size, config.hidden_size, padding_idx=0) + self.absolute_y_embeddings = nn.Embedding( + config.absolute_y_vocab_size, config.hidden_size, padding_idx=0) + self.relative_x_embeddings = nn.Embedding( + config.relative_x_vocab_size, config.hidden_size, padding_idx=0) + self.relative_y_embeddings = nn.Embedding( + config.relative_y_vocab_size, config.hidden_size, padding_idx=0) + if hasattr(config, 'prov_vocab_size'): + self.prov_embeddings = nn.Embedding( + config.prov_vocab_size, config.hidden_size, padding_idx=0) + self.city_embeddings = nn.Embedding( + config.city_vocab_size, config.hidden_size, padding_idx=0) + self.dist_embeddings = nn.Embedding( + config.dist_vocab_size, config.hidden_size, padding_idx=0) + + # self.LayerNorm is not snake-cased to stick with 
TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + + self.config = config + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + rel_type_ids=None, + absolute_position_ids=None, + relative_position_ids=None, + prov_ids=None, + city_ids=None, + dist_ids=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings += self.rel_type_embeddings(rel_type_ids) + embeddings += self.absolute_x_embeddings(absolute_position_ids[:, :, + 0]) + embeddings += self.absolute_y_embeddings(absolute_position_ids[:, :, + 1]) + embeddings += self.absolute_x_embeddings(absolute_position_ids[:, :, + 2]) + embeddings += self.absolute_y_embeddings(absolute_position_ids[:, :, + 3]) + embeddings += self.relative_x_embeddings(relative_position_ids[:, :, + 0]) + embeddings += self.relative_y_embeddings(relative_position_ids[:, :, + 1]) + embeddings += self.relative_x_embeddings(relative_position_ids[:, :, + 2]) + embeddings += self.relative_y_embeddings(relative_position_ids[:, :, + 3]) + if prov_ids is not None: + embeddings += self.prov_embeddings(prov_ids) + embeddings += self.city_embeddings(city_ids) + embeddings += self.dist_embeddings(dist_ids) + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type + embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') 
+ + self.config = config + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + rel_type_ids=None, + absolute_position_ids=None, + relative_position_ids=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + 'The hidden size (%d) is not a multiple of the number of attention ' + 'heads (%d)' % + (config.hidden_size, config.num_attention_heads)) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + outputs = outputs + (past_key_value, ) + return outputs + + +class BertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = 
self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + + self.has_cross_attention = (layer_num >= config.fusion_layer) + if self.has_cross_attention: + self.layer_num = layer_num + self.crossattention = BertAttention( + config, is_cross_attention=True) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at + # positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if self.has_cross_attention: + assert encoder_hidden_states is not None, 'encoder_hidden_states must be given for cross-attention layers' + + if type(encoder_hidden_states) == list: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states[(self.layer_num + - self.config.fusion_layer) + % len(encoder_hidden_states)], + encoder_attention_mask[(self.layer_num + - self.config.fusion_layer) + % len(encoder_hidden_states)], + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] + + else: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1: + -1] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [BertLayer(config, i) for i in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + mode='multi_modal', + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + 
next_decoder_cache = () if use_cache else None + + if mode == 'text': + start_layer = 0 + output_layer = self.config.fusion_layer + + elif mode == 'query': + start_layer = 0 + output_layer = self.config.num_hidden_layers + + elif mode == 'fusion': + start_layer = self.config.fusion_layer + output_layer = self.config.num_hidden_layers + + elif mode == 'multi_modal': + start_layer = 0 + output_layer = self.config.num_hidden_layers + + for i in range(start_layer, output_layer): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if getattr(self.config, 'gradient_checkpointing', + False) and self.training: + + if use_cache: + logger.warn( + '`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting ' + '`use_cache=False`...') + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. + """ + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = 'bert' + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.BertForPreTraining`. 
Args: + loss (`optional`, returned when ``labels`` is provided, + ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the + next sequence prediction (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each + vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape + :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) + head (scores of True/False continuation before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned + when ``output_hidden_states=True`` is passed or when + ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the + embeddings + one for the output of each layer) of shape + :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of + the model at the output of each layer plus the initial embedding + outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when + ``output_attentions=True`` is passed or when + ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + Attention weights after the attention softmax, used to compute the + weighted average in the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class BertModel(BertPreTrainedModel): + """ + Note that the BERT model here is slightly updated from the original BERT, so we + maintain the code here independently. The Bert Model transformer outputting + raw hidden-states without any specific head on top. + + This model inherits from [`PreTrainedModel`]. Check the superclass + documentation for the generic methods the library implements for all its + models (such as downloading or saving, resizing the input embeddings, pruning + heads etc.) + + This model is also a PyTorch + [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) + subclass. Use it as a regular PyTorch Module and refer to the PyTorch + documentation for all matters related to general usage and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the + parameters of the model. + Initializing with a config file does not load the weights associated + with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + + The model can behave as an encoder (with only self-attention) as well as a + decoder, in which case a layer of cross-attention is added between the + self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam + Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the + `is_decoder` argument of the configuration set to `True`.
To be used in a + Seq2Seq model, the model needs to initialized with both `is_decoder` + argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` + is then expected as an input to the forward pass. + + + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + if config.gis_embedding == 0: + self.embeddings = BertEmbeddings(config) + else: + self.embeddings = GisEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask(self, attention_mask: Tensor, + input_shape: Tuple[int], device: device, + is_decoder: bool) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked + tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens + to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same + dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, + # from_seq_length, to_seq_length] ourselves in which case we just need + # to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to + # the padding mask + # - if the model is an encoder, make the mask broadcastable to + # [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat( + batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones + # mask to the causal mask causal and attention masks must have + # same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[ + 1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), + device=device, + dtype=causal_mask.dtype), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, + None, :, :] * attention_mask[:, + None, + None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + 'Wrong shape for input_ids (shape {}) or attention_mask (shape {})' + .format(input_shape, attention_mask.shape)) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 + # for masked positions, this operation will create a tensor which is 0.0 + # for positions we want to attend and -10000.0 for masked positions. 
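+        # e.g. a padding mask [1, 1, 0] becomes additive biases [0.0, 0.0, -10000.0].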
+ # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multi_modal', + rel_type_ids=None, + absolute_position_ids=None, + relative_position_ids=None, + prov_ids=None, + city_ids=None, + dist_ids=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `((batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. 
Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + Others (**kwargs): + some additional parameters might be passed in from the upstream + pipeline; they do not influence the results. + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds or encoder_embeds' + ) + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, + # from_seq_length, to_seq_length] ourselves in which case we just need + # to make it broadcastable to all heads.
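+        # For a 2D padding mask the result has shape (batch_size, 1, 1, seq_length);
+        # with is_decoder=True a causal constraint widens it to
+        # (batch_size, 1, seq_length, seq_length). 0.0 marks visible positions and
+        # -10000.0 masked ones.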
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention we + # need to make broadcastable to [batch_size, num_heads, seq_length, + # seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ + 0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) + for mask in encoder_attention_mask + ] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + rel_type_ids=rel_type_ids, + absolute_position_ids=absolute_position_ids, + relative_position_ids=relative_position_ids, + prov_ids=prov_ids, + city_ids=city_ids, + dist_ids=dist_ids, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mode=mode, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertForPreTraining(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + 
output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + Returns: + Example:: + >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls( + sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + next_sentence_loss = loss_fct( + seq_relationship_score.view(-1, 2), + next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + 
output_hidden_states=None, + return_dict=None, + is_decoder=True, + reduction='mean', + mode='multi_modal', + soft_labels=None, + alpha=0, + return_logits=False, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape + :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape + :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in ``[0, + 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that + are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next + word prediction). Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with + indices set to ``-100`` are ignored (masked), the loss is only + computed for the tokens with labels in ``[0, ..., + config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length + :obj:`config.n_layers` with each tuple having 4 tensors of shape + :obj:`(batch_size, num_heads, sequence_length - 1, + embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. If :obj:`past_key_values` + are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value + states given to this model) of shape :obj:`(batch_size, 1)` instead + of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, + sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are + returned and can be used to speed up decoding (see + :obj:`past_key_values`).
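+        soft_labels (:obj:`torch.FloatTensor`, `optional`):
+            Teacher probability distributions over the vocabulary, aligned with
+            the shifted prediction scores. When provided, a distillation term is
+            mixed into the language modeling loss with weight ``alpha`` (see the
+            loss computation in the method body).
+        alpha (:obj:`float`, `optional`, defaults to 0):
+            Interpolation weight between the language modeling loss and the
+            distillation term.
+        return_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If :obj:`True`, return the shifted prediction logits directly instead
+            of a loss/output object.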
+ Returns: Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, : + -1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(shifted_prediction_scores, dim=-1) * soft_labels, + dim=-1) + loss_distill = (loss_distill * (labels != -100)).sum(1) + lm_loss = (1 - alpha) * lm_loss + alpha * loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((lm_loss, ) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + past=None, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + 'input_ids': + input_ids, + 'attention_mask': + attention_mask, + 'past_key_values': + past, + 'encoder_hidden_states': + model_kwargs.get('encoder_hidden_states', None), + 'encoder_attention_mask': + model_kwargs.get('encoder_attention_mask', None), + 'is_decoder': + True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple( + past_state.index_select(0, beam_idx) + for past_state in layer_past), ) + return reordered_past + + +class GisBertLMPredictionHead(nn.Module): + + def __init__(self, config, vocab_size): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an 
output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertForGisMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + self.bert = BertModel(config, add_pooling_layer=False) + self.cls_geom_id = GisBertLMPredictionHead(config, config.vocab_size) + self.cls_geom_type = GisBertLMPredictionHead(config, + config.type_vocab_size) + self.cls_rel_type = GisBertLMPredictionHead(config, + config.rel_type_vocab_size) + self.cls_absolute_position_x1 = GisBertLMPredictionHead( + config, config.absolute_x_vocab_size) + self.cls_absolute_position_x2 = GisBertLMPredictionHead( + config, config.absolute_x_vocab_size) + self.cls_absolute_position_y1 = GisBertLMPredictionHead( + config, config.absolute_y_vocab_size) + self.cls_absolute_position_y2 = GisBertLMPredictionHead( + config, config.absolute_y_vocab_size) + self.cls_relative_position_x1 = GisBertLMPredictionHead( + config, config.relative_x_vocab_size) + self.cls_relative_position_x2 = GisBertLMPredictionHead( + config, config.relative_x_vocab_size) + self.cls_relative_position_y1 = GisBertLMPredictionHead( + config, config.relative_y_vocab_size) + self.cls_relative_position_y2 = GisBertLMPredictionHead( + config, config.relative_y_vocab_size) + self.config = config + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multi_modal', + soft_labels=None, + alpha=0, + return_logits=False, + rel_type_ids=None, + absolute_position_ids=None, + relative_position_ids=None, + token_type_ids_label=None, + rel_type_ids_label=None, + absolute_position_ids_label=None, + relative_position_ids_label=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices + should be in ``[-100, 0, ..., config.vocab_size]`` (see + ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with + labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_embeds=encoder_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + rel_type_ids=rel_type_ids, + absolute_position_ids=absolute_position_ids, + relative_position_ids=relative_position_ids, + ) + + sequence_output = outputs[0] + + prediction_scores = self.cls_geom_id(sequence_output) + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + positions_cls = [ + self.cls_geom_type, self.cls_rel_type, + self.cls_absolute_position_x1, self.cls_absolute_position_x2, + self.cls_absolute_position_y1, self.cls_absolute_position_y2, + self.cls_relative_position_x1, self.cls_relative_position_x2, + self.cls_relative_position_y1, self.cls_relative_position_y2 + ] + positions_label = [ + token_type_ids_label, rel_type_ids_label, + absolute_position_ids_label[:, :, + 0], absolute_position_ids_label[:, :, + 2], + absolute_position_ids_label[:, :, + 1], absolute_position_ids_label[:, :, + 3], + relative_position_ids_label[:, :, + 0], relative_position_ids_label[:, :, + 2], + relative_position_ids_label[:, :, + 1], relative_position_ids_label[:, :, + 3] + ] + positions_size = [ + self.config.type_vocab_size, self.config.rel_type_vocab_size, + self.config.absolute_x_vocab_size, + self.config.absolute_x_vocab_size, + self.config.absolute_y_vocab_size, + self.config.absolute_y_vocab_size, + self.config.relative_x_vocab_size, + self.config.relative_x_vocab_size, + self.config.relative_y_vocab_size, + self.config.relative_y_vocab_size + ] + for mycls, mylabels, mysize in zip(positions_cls, positions_label, + positions_size): + if mylabels is not None: + myprediction_scores = mycls(sequence_output) + masked_lm_loss += loss_fct( + myprediction_scores.view(-1, mysize), mylabels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +class 
BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multi_modal', + soft_labels=None, + alpha=0, + return_logits=False, + rel_type_ids=None, + absolute_position_ids=None, + relative_position_ids=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_embeds=encoder_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + rel_type_ids=rel_type_ids, + absolute_position_ids=absolute_position_ids, + relative_position_ids=relative_position_ids, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if soft_labels is not None: + loss_distill = -torch.sum( + F.log_softmax(prediction_scores, dim=-1) * soft_labels, dim=-1) + loss_distill = loss_distill[labels != -100].mean() + masked_lm_loss = (1 + - alpha) * masked_lm_loss + alpha * loss_distill + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, 'The PAD token should be defined for generation' + + padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, padding_mask], dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + 
dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} + + +class BertForNextSentencePrediction(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + self.init_weights() + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + Returns: + Example:: + >>> from transformers import BertTokenizer, BertForNextSentencePrediction + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct( + seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores, ) + outputs[2:] + return ((next_sentence_loss, ) + + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertForSequenceClassification(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`.
If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertForMultipleChoice(BertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, + `optional`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices-1]`` where + :obj:`num_choices` is the size of the second dimension of the input + tensors. 
(See :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[ + 1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view( + -1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view( + -1, + attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view( + -1, + token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view( + -1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), + inputs_embeds.size(-1)) + if inputs_embeds is not None else None) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should + be in ``[0, ..., config.num_labels - 1]``. 
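+
+        Example (a minimal construction sketch; ``fusion_layer`` and
+        ``gis_embedding`` are not standard ``BertConfig`` fields but assumptions
+        matching the attributes read by the modules defined above)::
+            >>> import torch
+            >>> config = BertConfig(num_hidden_layers=3, fusion_layer=3,
+            ...                     gis_embedding=0, num_labels=9)
+            >>> model = BertForTokenClassification(config)
+            >>> input_ids = torch.randint(0, config.vocab_size, (1, 12))
+            >>> logits = model(input_ids).logits  # (1, 12, num_labels)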
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, + `optional`): + Labels for position (index) of the start of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (:obj:`sequence_length`). Position + outside of the sequence are not taken into account for computing the + loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, + `optional`): + Labels for position (index) of the end of the labelled span for + computing the token classification loss. Positions are clamped to + the length of the sequence (:obj:`sequence_length`). Position + outside of the sequence are not taken into account for computing the + loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss, ) + + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class MGeoPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface + for downloading and loading pretrained models. + """ + + config_class = BertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the + label information. num_labels: An optional arg to tell the + model how many classes to initialize. + Method will call utils.parse_label_mapping + if num_labels not supplied. If num_labels is + not found, the model will use the default + setting (2 classes). 
+ + Returns: + The loaded model, which is initialized by + transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: + config = BertConfig(**model_args) + model = cls(config) + else: + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_args) + model.model_dir = model_dir + return model + + +@MODELS.register_module(Tasks.backbone, module_name=Models.mgeo) +class MGeo(MGeoPreTrainedModel): + + def __init__(self, + config: BertConfig, + finetune_mode: str = 'single-modal', + gis_num: int = 1, + add_pooling_layer=False, + **kwargs): + super().__init__(config) + + self.finetune_mode = finetune_mode + + self.config = config + self.text_encoder = BertModel( + config, add_pooling_layer=add_pooling_layer) + + if self.finetune_mode == 'multi-modal': + gis_config = BertConfig.from_dict(config.gis_encoder) + self.gis_encoder = BertModel(gis_config, add_pooling_layer=False) + for param in self.gis_encoder.parameters(): + param.requires_grad = False + self.gis2text = nn.Linear(gis_config.hidden_size, + self.config.hidden_size) + self.gis_type = nn.Embedding(gis_num, gis_config.hidden_size) + + self.init_weights() + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='single-modal', + gis_list=None, + gis_tp=None, + use_token_type=False): + if self.finetune_mode == 'multi-modal' and gis_list is not None and len( + gis_list) > 0: + gis_embs = [] + gis_atts = [] + for gis in gis_list: + gis_embs.append( + self.gis_encoder(return_dict=True, mode='text', + **gis).last_hidden_state) + gis_atts.append(gis['attention_mask']) + if use_token_type: + embedding_output = self.text_encoder.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + ) + else: + embedding_output = self.text_encoder.embeddings( + input_ids=input_ids, ) + + if self.finetune_mode == 'multi-modal' and gis_list is not None and len( + gis_list) > 0: + embs = [embedding_output] + atts = [attention_mask] + tp_emb = [self.gis_type(gtp) for gtp in gis_tp] + for ge, ga, gt in zip(gis_embs, gis_atts, tp_emb): + embs.append(self.gis2text(ge + gt)) + atts.append(ga) + merge_emb = torch.cat(embs, dim=1) + merge_attention = torch.cat(atts, dim=-1) + else: + merge_emb = embedding_output + merge_attention = attention_mask + encoder_outputs = self.text_encoder( + attention_mask=merge_attention, + encoder_embeds=merge_emb, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mode='text') + + if not return_dict: + return encoder_outputs + + return AttentionBackboneModelOutput( + last_hidden_state=encoder_outputs.last_hidden_state, + pooler_output=encoder_outputs.pooler_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + return output + + def extract_sequence_outputs(self, outputs): + return outputs['last_hidden_state'] + + def extract_pooled_outputs(self, outputs): + return 
outputs['pooler_output'] diff --git a/modelscope/models/multi_modal/mgeo/text_classification.py b/modelscope/models/multi_modal/mgeo/text_classification.py new file mode 100644 index 00000000..d5b4f1b6 --- /dev/null +++ b/modelscope/models/multi_modal/mgeo/text_classification.py @@ -0,0 +1,187 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import MGeo, MGeoPreTrainedModel + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.text_classification, module_name=Models.mgeo) +@MODELS.register_module(Tasks.nli, module_name=Models.mgeo) +@MODELS.register_module( + Tasks.sentiment_classification, module_name=Models.mgeo) +@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.mgeo) +@MODELS.register_module( + Tasks.zero_shot_classification, module_name=Models.mgeo) +class MGeoForSequenceClassification(MGeoPreTrainedModel): + + def __init__(self, config, finetune_mode: str = 'single-modal', **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + setattr(self, self.base_model_prefix, + MGeo(config, finetune_mode, add_pooling_layer=True)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + gis_list=None, + gis_tp=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ + Returns: + Returns `modelscope.outputs.AttentionTextClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.base_model.forward( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + gis_list=gis_list, + gis_tp=gis_tp, + use_token_type=True, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long + or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/modelscope/models/multi_modal/mgeo/text_ranking.py b/modelscope/models/multi_modal/mgeo/text_ranking.py new file mode 100644 index 00000000..b0d75aa1 --- /dev/null +++ b/modelscope/models/multi_modal/mgeo/text_ranking.py @@ -0,0 +1,88 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
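Before the text-ranking head that follows, here is a minimal, self-contained sketch (not part of the diff) of the problem_type-driven loss selection implemented in MGeoForSequenceClassification.forward above. The helper name and the toy tensors are illustrative only; the dispatch mirrors the code shown, not a definitive API.

import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

def sequence_classification_loss(logits, labels, num_labels, problem_type=None):
    # Infer the problem type the same way the model does when config.problem_type is unset.
    if problem_type is None:
        if num_labels == 1:
            problem_type = 'regression'
        elif num_labels > 1 and labels.dtype in (torch.long, torch.int):
            problem_type = 'single_label_classification'
        else:
            problem_type = 'multi_label_classification'

    if problem_type == 'regression':
        loss_fct = MSELoss()
        if num_labels == 1:
            return loss_fct(logits.squeeze(), labels.squeeze())
        return loss_fct(logits, labels)
    if problem_type == 'single_label_classification':
        return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
    # multi_label_classification: labels are expected as float multi-hot vectors.
    return BCEWithLogitsLoss()(logits, labels)

# Toy usage: batch of 4 examples, 3 classes, integer labels -> cross-entropy loss.
logits = torch.randn(4, 3)
labels = torch.tensor([0, 2, 1, 1])
print(sequence_classification_loss(logits, labels, num_labels=3))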
+ +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from modelscope.metainfo import Models +from modelscope.models import Model +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTextClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import MGeo, MGeoPreTrainedModel + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.text_ranking, module_name=Models.mgeo) +class MGeoForTextRanking(MGeoPreTrainedModel): + + def __init__(self, + config, + finetune_mode: str = 'single-modal', + gis_num: int = 1, + *args, + **kwargs): + super().__init__(config) + neg_sample = kwargs.get('neg_sample', 8) + eval_neg_sample = kwargs.get('eval_neg_sample', 8) + self.neg_sample = neg_sample + self.eval_neg_sample = eval_neg_sample + setattr( + self, self.base_model_prefix, + MGeo(self.config, finetune_mode, gis_num, add_pooling_layer=True)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + gis_list=None, + gis_tp=None, + *args, + **kwargs) -> AttentionTextClassificationModelOutput: + outputs = self.base_model.forward( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + gis_list=gis_list, + gis_tp=gis_tp, + ) + + # backbone model should return pooled_output as its second output + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + if self.base_model.training: + scores = logits.view(-1, self.neg_sample + 1) + batch_size = scores.size(0) + loss_fct = torch.nn.CrossEntropyLoss() + target_label = torch.zeros( + batch_size, dtype=torch.long, device=scores.device) + loss = loss_fct(scores, target_label) + return AttentionTextClassificationModelOutput( + loss=loss, + logits=logits, + ) + return AttentionTextClassificationModelOutput(logits=logits, ) diff --git a/modelscope/models/multi_modal/mgeo/token_classification.py b/modelscope/models/multi_modal/mgeo/token_classification.py new file mode 100644 index 00000000..5f49f863 --- /dev/null +++ b/modelscope/models/multi_modal/mgeo/token_classification.py @@ -0,0 +1,230 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionTokenClassificationModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import MGeo, MGeoPreTrainedModel + +logger = logging.get_logger(__name__) + + +@MODELS.register_module(Tasks.token_classification, module_name=Models.mgeo) +@MODELS.register_module(Tasks.part_of_speech, module_name=Models.mgeo) +@MODELS.register_module(Tasks.word_segmentation, module_name=Models.mgeo) +class MGeoForTokenClassification(MGeoPreTrainedModel): + r"""Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks, word-segmentation. + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Preprocessor: + This is the fill_mask model of Bert, the preprocessor of this model + is `modelscope.preprocessors.TokenClassificationTransformersPreprocessor`. + + Trainer: + This model is a normal PyTorch model, and can be trained by variable trainers, like EpochBasedTrainer, + NlpEpochBasedTrainer, or trainers from other frameworks. + The preferred trainer in ModelScope is NlpEpochBasedTrainer. + + Parameters: + config (:class:`~modelscope.models.nlp.structbert.SbertConfig`): Model configuration class with + all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. + """ + _keys_to_ignore_on_load_unexpected = [r'pooler'] + + def __init__(self, config, finetune_mode: str = 'single-modal', **kwargs): + super().__init__(config) + self.num_labels = config.num_labels + + setattr(self, self.base_model_prefix, + MGeo(config, finetune_mode, add_pooling_layer=False)) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None + else config.hidden_dropout_prob) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + offset_mapping=None, + label_mask=None, + gis_list=None, + gis_tp=None, + ): + r""" + Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using + :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and + :meth:`transformers.PreTrainedTokenizer.__call__` for details. 
+ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in ``[0, 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or + :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert :obj:`input_ids` indices into + associated vectors than the model's internal embedding lookup + matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention + layers. See ``attentions`` under returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See + ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` + instead of a plain tuple. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, + `optional`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If + :obj:`config.num_labels == 1` a regression loss is computed + (Mean-Square loss), If :obj:`config.num_labels > 1` a classification + loss is computed (Cross-Entropy). + offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the sentence. + Selected in the range ``[0, sequence_length - 1]``. + label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, + sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask + values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + Returns `modelscope.outputs.AttentionTokenClassificationModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + gis_list=gis_list, + gis_tp=gis_tp, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels)) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits, ) + outputs[2:] + return ((loss, ) + output) if loss is not None else output + + return AttentionTokenClassificationModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + offset_mapping=offset_mapping, + label_mask=label_mask, + ) diff --git a/modelscope/models/multi_modal/mplug/__init__.py b/modelscope/models/multi_modal/mplug/__init__.py index 955c87e2..67b0a426 100644 --- a/modelscope/models/multi_modal/mplug/__init__.py +++ b/modelscope/models/multi_modal/mplug/__init__.py @@ -13,5 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
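Before the __init__.py export changes below, a minimal, self-contained sketch (not part of the diff) of the "active loss" masking that MGeoForTokenClassification.forward applies above: padding positions are swapped for CrossEntropyLoss's ignore_index so they do not contribute to the loss. Tensor shapes and label values are made up.

import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)                    # (batch, seq_len, num_labels)
labels = torch.tensor([[0, 1, 2, 2], [1, 0, 2, 1]])
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, num_labels)
active_labels = torch.where(active_loss, labels.view(-1),
                            torch.tensor(loss_fct.ignore_index).type_as(labels))
print(loss_fct(active_logits, active_labels))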
-from .configuration_mplug import MPlugConfig -from .modeling_mplug import CONFIG_NAME, MPlug +from .configuration_mplug import HiTeAConfig, MPlugConfig +from .modeling_mplug import CONFIG_NAME, HiTeA, MPlug diff --git a/modelscope/models/multi_modal/mplug/configuration_mplug.py b/modelscope/models/multi_modal/mplug/configuration_mplug.py index 9900ff7c..dcbb0270 100644 --- a/modelscope/models/multi_modal/mplug/configuration_mplug.py +++ b/modelscope/models/multi_modal/mplug/configuration_mplug.py @@ -114,3 +114,67 @@ class MPlugConfig(PretrainedConfig): with open(yaml_file, 'r', encoding='utf-8') as reader: config_dict = yaml.load(reader, Loader=yaml.Loader) return cls(**config_dict) + + +class HiTeAConfig(PretrainedConfig): + + model_type = 'hitea' + + def __init__( + self, + task=Tasks.video_question_answering, + bert_config='config_bert.json', + image_res=224, + num_frames=16, + batch_size_train=32, + vision_width=768, + distill=True, + batch_size_test=64, + k_test=128, + alpha=0.4, + warm_up=True, + eos='[SEP]', + optimizer=None, + schedular=None, + min_length=1, + max_length=10, + beam_size=5, + text_encoder='bert-base-uncased', + text_decoder='bert-base-uncased', + # retrieval + queue_size=65536, + embed_dim=256, + temp=0.07, + **kwargs): + + super().__init__(**kwargs) + self.task = task + self.bert_config = bert_config + self.image_res = image_res + self.num_frames = num_frames + self.batch_size_train = batch_size_train + self.vision_width = vision_width + self.distill = distill + self.batch_size_test = batch_size_test + self.k_test = k_test + self.alpha = alpha + self.warm_up = warm_up + self.eos = eos + self.optimizer = optimizer + self.schedular = schedular + self.min_length = min_length + self.max_length = max_length + self.beam_size = beam_size + self.text_encoder = text_encoder + self.text_decoder = text_decoder + # retrieval + self.queue_size = queue_size + self.embed_dim = embed_dim + self.temp = temp + + @classmethod + def from_yaml_file(cls, yaml_file: Union[str, + os.PathLike]) -> Dict[str, Any]: + with open(yaml_file, 'r', encoding='utf-8') as reader: + config_dict = yaml.load(reader, Loader=yaml.Loader) + return cls(**config_dict) diff --git a/modelscope/models/multi_modal/mplug/modeling_mplug.py b/modelscope/models/multi_modal/mplug/modeling_mplug.py index 4b393439..98edd898 100755 --- a/modelscope/models/multi_modal/mplug/modeling_mplug.py +++ b/modelscope/models/multi_modal/mplug/modeling_mplug.py @@ -40,7 +40,9 @@ from transformers.modeling_utils import (PreTrainedModel, prune_linear_layer) from transformers.utils import logging -from modelscope.models.multi_modal.mplug.configuration_mplug import MPlugConfig +from modelscope.models.multi_modal.mplug.configuration_mplug import ( + HiTeAConfig, MPlugConfig) +from modelscope.models.multi_modal.mplug.mvit import MViTv2, MViTv2_Base_config from modelscope.models.multi_modal.mplug.predictor import TextGenerator from modelscope.utils.constant import ModelFile @@ -2483,3 +2485,322 @@ class MPlugForImageTextRetrieval(MPlug): scores = F.softmax(scores, dim=-1) return scores + + +class HiTeA(PreTrainedModel): + config_class = HiTeAConfig + + def __init__(self, config): + super().__init__(config) + self.config = config + self.tokenizer = BertTokenizer.from_pretrained( + os.path.join(config.model_dir, ModelFile.VOCAB_FILE)) + self.module_setting(config) + self.visual_encoder = MViTv2( + img_size=config.image_res, + config=MViTv2_Base_config, + num_frames=config.num_frames) + self.text_encoder = BertModel( + self.config_encoder, 
add_pooling_layer=False) + self.fusion_encoder = FusionModel( + self.config_fusion, add_pooling_layer=False) + + @classmethod + def from_pretrained(cls, model_dir, load_checkpoint=True): + from modelscope.utils.constant import Tasks + + task_mapping = { + Tasks.video_question_answering: HiTeAForVideoQuestionAnswering, + Tasks.video_captioning: HiTeAForVideoCaption, + } + config = cls.config_class.from_yaml_file( + os.path.join(model_dir, CONFIG_NAME)) + config.model_dir = model_dir + model = task_mapping[config.task](config) + if load_checkpoint: + checkpoint_path = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + checkpoint = torch.load(checkpoint_path, map_location='cpu') + if 'model' in checkpoint: + checkpoint = checkpoint['model'] + if 'module' in checkpoint: + checkpoint = checkpoint['module'] + checkpoint = { + k.replace('model.', ''): v + for k, v in checkpoint.items() + } + + model.load_state_dict(checkpoint, strict=False) + return model + + def init_distill(self, config): + self.distill = config.distill + if self.distill: + self.visual_encoder_m = MViTv2( + img_size=config.image_res, + config=MViTv2_Base_config, + num_frames=config.num_frames) + self.text_encoder_m = BertModel( + self.config_encoder, add_pooling_layer=False) + self.fusion_encoder_m = FusionModel( + self.config_fusion, add_pooling_layer=False) + self.text_decoder_m = BertLMHeadModel(self.config_decoder) + self.model_pairs = [ + [self.visual_encoder, self.visual_encoder_m], + [self.text_encoder, self.text_encoder_m], + [self.text_decoder, self.text_decoder_m], + ] + self.copy_params() + self.momentum = 0.995 + + def forward(self, *args, **kwargs): + raise NotImplementedError + + def module_setting(self, config): + bert_config_path = os.path.join(config.model_dir, config.bert_config) + self.config_encoder = BertConfig.from_json_file(bert_config_path) + self.config_encoder.num_hidden_layers = self.config_encoder.text_encoder_layers + self.config_fusion = BertConfig.from_json_file(bert_config_path) + self.config_decoder = BertConfig.from_json_file(bert_config_path) + self.config_decoder.add_cross_attention = True + self.config_decoder.num_hidden_layers = self.config_decoder.text_decode_layers + + @torch.no_grad() + def copy_params(self): + for model_pair in self.model_pairs: + for param, param_m in zip(model_pair[0].parameters(), + model_pair[1].parameters()): + param_m.data.copy_(param.data) # initialize + param_m.requires_grad = False # not update by gradient + + @torch.no_grad() + def _momentum_update(self): + for model_pair in self.model_pairs: + for param, param_m in zip(model_pair[0].parameters(), + model_pair[1].parameters()): + param_m.data = param_m.data * self.momentum + param.data * ( + 1. 
- self.momentum) + + def generation(self, question_states, question_atts, out_size=1): + encoder_inputs = [question_states, question_atts] + topk_ids, topk_scores = self.beam_generator.translate_batch( + encoder_inputs, out_size=out_size) + return topk_ids, topk_scores + + @staticmethod + def _tile(x, dim, n_tile): + import numpy as np + init_dim = x.size(dim) + repeat_idx = [1] * x.dim() + repeat_idx[dim] = n_tile + x = x.repeat(*(repeat_idx)) + order_index = torch.LongTensor( + np.concatenate( + [init_dim * np.arange(n_tile) + i for i in range(init_dim)])) + return torch.index_select(x, dim, order_index.to(x.device)) + + +class HiTeAForVideoQuestionAnswering(HiTeA): + + def __init__(self, config): + super().__init__(config) + self.text_decoder = BertLMHeadModel(self.config_decoder) + self.beam_generator = TextGenerator(config, self.text_decoder) + self.init_distill(config) + + def forward(self, + video, + question, + answer=None, + alpha=0, + k=None, + weights=None, + train=True): + video = video.to(dtype=next(self.parameters()).dtype) + video_embeds = self.visual_encoder(video) + video_atts = torch.ones( + video_embeds.size()[:-1], dtype=torch.long).to(video.device) + + if train: + ''' + k: number of answers for each question + weights: weight for each answer + ''' + answer_targets = answer.input_ids.masked_fill( + answer.input_ids == self.tokenizer.pad_token_id, -100) + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=video_embeds, + encoder_attention_mask=video_atts, + return_dict=False) + + video_output, question_output = fusion_output + + question_output = torch.cat([video_output, question_output], 1) + merge_text_attention = torch.cat( + [video_atts, question.attention_mask], 1) + + if k is None: + k = [1] * question_output.shape[0] + question_states = [] + question_atts = [] + for b, n in enumerate(k): + question_states += [question_output[b]] * n + question_atts += [merge_text_attention[b]] * n + question_states = torch.stack(question_states, 0) + question_atts = torch.stack(question_atts, 0) + + if self.distill: + with torch.no_grad(): + self._momentum_update() + video_embeds_m = self.visual_encoder_m(video) + text_output_m = self.text_encoder_m( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds_m = text_output_m.last_hidden_state + fusion_output_m = self.fusion_encoder_m( + encoder_embeds=text_embeds_m, + attention_mask=question.attention_mask, + encoder_hidden_states=video_embeds_m, + encoder_attention_mask=video_atts, + return_dict=False) + + image_output_m, question_output_m = fusion_output_m + question_output_m = torch.cat( + [image_output_m, question_output_m], 1) + + question_states_m = [] + for b, n in enumerate(k): + question_states_m += [question_output_m[b]] * n + question_states_m = torch.stack(question_states_m, 0) + + logits_m = self.text_decoder_m( + answer.input_ids, + attention_mask=answer.attention_mask, + encoder_hidden_states=question_states_m, + encoder_attention_mask=question_atts, + return_logits=True, + ) + + answer_output = self.text_decoder( + answer.input_ids, + attention_mask=answer.attention_mask, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=answer_targets, + return_dict=True, + soft_labels=F.softmax(logits_m, 
dim=-1), + reduction='none', + ) + else: + answer_output = self.text_decoder( + answer.input_ids, + attention_mask=answer.attention_mask, + encoder_hidden_states=question_states, + encoder_attention_mask=question_atts, + labels=answer_targets, + return_dict=True, + reduction='none', + ) + if weights is None: + weights = 1 + loss = weights * answer_output.loss + loss = loss.sum() / video.size(0) + + return loss + + else: + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=video_embeds, + encoder_attention_mask=video_atts, + return_dict=False) + video_output, question_output = fusion_output + question_output = torch.cat([video_output, question_output], 1) + merge_text_attention = torch.cat( + [video_atts, question.attention_mask], 1) + topk_ids, topk_probs = self.generation(question_output, + merge_text_attention) + return topk_ids, topk_probs + + +class HiTeAForVideoCaption(HiTeA): + + def __init__(self, config): + super().__init__(config) + self.text_decoder = BertPrefixModel(self.config_decoder) + self.beam_generator = TextGenerator(config, self.text_decoder) + + def beam_search(self, + video, + question, + answer=None, + train=True, + out_size=5): + video_embeds = self.visual_encoder(video) + video_atts = torch.ones( + video_embeds.size()[:-1], dtype=torch.long).to(video.device) + text_output = self.text_encoder( + question.input_ids, + attention_mask=question.attention_mask, + return_dict=True) + text_embeds = text_output.last_hidden_state + fusion_output = self.fusion_encoder( + encoder_embeds=text_embeds, + attention_mask=question.attention_mask, + encoder_hidden_states=video_embeds, + encoder_attention_mask=video_atts, + return_dict=False) + video_output, question_output = fusion_output + question_output = torch.cat([video_output, question_output], 1) + merge_text_attention = torch.cat([video_atts, question.attention_mask], + 1) + topk_ids, topk_probs = self.generation( + question_output, merge_text_attention, out_size=out_size) + return topk_ids, topk_probs + + def forward(self, + video, + question, + answer=None, + train=True, + out_size=5, + scst=False): + if (scst): + return self.beam_search( + video, question, answer, train=True, out_size=out_size) + video = video.to(dtype=next(self.parameters()).dtype) + video_embeds = self.visual_encoder(video) + video_atts = torch.ones( + video_embeds.size()[:-1], dtype=torch.long).to(video.device) + + if train: + answer_targets = answer.input_ids.masked_fill( + answer.input_ids == self.tokenizer.pad_token_id, -100) + answer_output = self.text_decoder( + answer.input_ids, + attention_mask=answer.attention_mask, + encoder_hidden_states=video_embeds, + encoder_attention_mask=video_atts, + labels=answer_targets, + return_dict=True, + reduction='none') + loss = answer_output.loss + + return loss + else: + topk_ids, topk_probs = self.generation(video_embeds, video_atts) + return topk_ids, topk_probs diff --git a/modelscope/models/multi_modal/mplug/mvit.py b/modelscope/models/multi_modal/mplug/mvit.py new file mode 100644 index 00000000..f3140ce4 --- /dev/null +++ b/modelscope/models/multi_modal/mplug/mvit.py @@ -0,0 +1,1007 @@ +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. +# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. All Rights Reserved. 
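Before the MViTv2 backbone below, a minimal sketch (not part of the diff) of the copy_params / _momentum_update pattern used by HiTeA's distillation branch earlier in modeling_mplug.py. The two Linear layers stand in for a (student, momentum-teacher) encoder pair; the 0.995 momentum follows the code above.

import torch
import torch.nn as nn

student = nn.Linear(4, 4)
teacher = nn.Linear(4, 4)
teacher.load_state_dict(student.state_dict())   # copy_params(): teacher starts identical
for p in teacher.parameters():
    p.requires_grad = False                      # teacher is never updated by gradients

momentum = 0.995
with torch.no_grad():                            # _momentum_update(): EMA of student weights
    for param, param_m in zip(student.parameters(), teacher.parameters()):
        param_m.data = param_m.data * momentum + param.data * (1. - momentum)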
+ +from collections import OrderedDict +from functools import partial + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import trunc_normal_ + +try: + from fairscale.nn.checkpoint import checkpoint_wrapper +except ImportError: + checkpoint_wrapper = None + +MViTv2_Base_config = { + 'depth': + 24, + 'dim_mul': [[2, 2.0], [5, 2.0], [21, 2.0]], + 'head_mul': [[2, 2.0], [5, 2.0], [21, 2.0]], + 'pool_q_stride': + [[0, 1, 1, 1], [1, 1, 1, 1], [2, 1, 2, 2], [3, 1, 1, 1], [4, 1, 1, 1], + [5, 1, 2, 2], [6, 1, 1, 1], [7, 1, 1, 1], [8, 1, 1, 1], [9, 1, 1, 1], + [10, 1, 1, 1], [11, 1, 1, 1], [12, 1, 1, 1], [13, 1, 1, 1], [14, 1, 1, 1], + [15, 1, 1, 1], [16, 1, 1, 1], [17, 1, 1, 1], [18, 1, 1, 1], [19, 1, 1, 1], + [20, 1, 1, 1], [21, 1, 2, 2], [22, 1, 1, 1], [23, 1, 1, 1]], + 'pool_kvq_kernel': [3, 3, 3], + 'pool_kv_stride_adaptive': [1, 4, 4], +} + + +def interpolate_rel_pos_embed(state_dict_origin, + state_dict_model, + temporal=True, + verbose=False): + rel_pos_embed_types = ['rel_pos_h', 'rel_pos_w'] + if temporal: + rel_pos_embed_types += ['rel_pos_t'] + + state_dict_inflated = state_dict_origin.copy() + for k, v2d in state_dict_origin.items(): + if any([x in k for x in rel_pos_embed_types]): + v3d = state_dict_model[k] + if v2d.shape[0] != v3d.shape[0]: + rel_pos_resized = F.interpolate( + v2d.reshape(1, v2d.shape[0], -1).permute(0, 2, 1), + size=v3d.shape[0], + mode='linear', + ) + v3d = rel_pos_resized.reshape(-1, v3d.shape[0]).permute(1, 0) + if verbose: + print('Inflate {}: {} -> {}: {}'.format( + k, v2d.shape, k, v3d.shape)) + else: + v3d = v2d + state_dict_inflated[k] = v3d.clone() + return state_dict_inflated + + +def _prepare_mvit_configs(cfg): + depth = cfg['depth'] + dim_mul, head_mul = torch.ones(depth + 1), torch.ones(depth + 1) + for i in range(len(cfg['dim_mul'])): + dim_mul[cfg['dim_mul'][i][0]] = cfg['dim_mul'][i][1] + for i in range(len(cfg['head_mul'])): + head_mul[cfg['head_mul'][i][0]] = cfg['head_mul'][i][1] + + pool_q = [[] for i in range(depth)] + pool_kv = [[] for i in range(depth)] + stride_q = [[] for i in range(depth)] + stride_kv = [[] for i in range(depth)] + + for i in range(len(cfg['pool_q_stride'])): + stride_q[cfg['pool_q_stride'][i][0]] = cfg['pool_q_stride'][i][1:] + pool_q[cfg['pool_q_stride'][i][0]] = cfg['pool_kvq_kernel'] + + if cfg['pool_kv_stride_adaptive'] is not None: + _stride_kv = cfg['pool_kv_stride_adaptive'] + cfg['pool_kv_stride'] = [] + for i in range(cfg['depth']): + if len(stride_q[i]) > 0: + _stride_kv = [ + max(_stride_kv[d] // stride_q[i][d], 1) + for d in range(len(_stride_kv)) + ] + cfg['pool_kv_stride'].append([i] + _stride_kv) + + for i in range(len(cfg['pool_kv_stride'])): + stride_kv[cfg['pool_kv_stride'][i][0]] = cfg['pool_kv_stride'][i][1:] + pool_kv[cfg['pool_kv_stride'][i][0]] = cfg['pool_kvq_kernel'] + + return dim_mul, head_mul, pool_q, pool_kv, stride_q, stride_kv + + +class Mlp(nn.Module): + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop_rate=0.0, + ): + super().__init__() + self.drop_rate = drop_rate + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + if self.drop_rate > 0.0: + self.drop = nn.Dropout(drop_rate) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + if 
self.drop_rate > 0.0: + x = self.drop(x) + x = self.fc2(x) + if self.drop_rate > 0.0: + x = self.drop(x) + return x + + +class Permute(nn.Module): + + def __init__(self, dims): + super().__init__() + self.dims = dims + + def forward(self, x): + return x.permute(*self.dims) + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + """ + Stochastic Depth per sample. + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + mask = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + mask.floor_() # binarize + output = x.div(keep_prob) * mask + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +def round_width(width, multiplier, min_width=1, divisor=1, verbose=False): + if not multiplier: + return width + width *= multiplier + min_width = min_width or divisor + if verbose: + print(f'min width {min_width}') + print(f'width {width} divisor {divisor}') + print(f'other {int(width + divisor / 2) // divisor * divisor}') + + width_out = max(min_width, int(width + divisor / 2) // divisor * divisor) + if width_out < 0.9 * width: + width_out += divisor + return int(width_out) + + +class PatchEmbed(nn.Module): + """ + PatchEmbed. + """ + + def __init__( + self, + dim_in=3, + dim_out=768, + kernel=(7, 7), + stride=(4, 4), + padding=(3, 3), + conv2d=False, + ): + super().__init__() + + if conv2d: + conv_function = nn.Conv2d + else: + conv_function = nn.Conv3d + + self.proj = conv_function( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward(self, x): + x = self.proj(x) + # B C H W -> B HW C + return x.flatten(2).transpose(1, 2), x.shape + + +def attention_pool(tensor, pool, thw_shape, has_cls_embed=True, norm=None): + if pool is None: + return tensor, thw_shape + tensor_dim = tensor.ndim + if tensor_dim == 4: + pass + elif tensor_dim == 3: + tensor = tensor.unsqueeze(1) + else: + raise NotImplementedError( + f'Unsupported input dimension {tensor.shape}') + + if has_cls_embed: + cls_tok, tensor = tensor[:, :, :1, :], tensor[:, :, 1:, :] + + B, N, L, C = tensor.shape + T, H, W = thw_shape + tensor = ( + tensor.reshape(B * N, T, H, W, C).permute(0, 4, 1, 2, 3).contiguous()) + + tensor = pool(tensor) + + thw_shape = [tensor.shape[2], tensor.shape[3], tensor.shape[4]] + L_pooled = tensor.shape[2] * tensor.shape[3] * tensor.shape[4] + tensor = tensor.reshape(B, N, C, L_pooled).transpose(2, 3) + if has_cls_embed: + tensor = torch.cat((cls_tok, tensor), dim=2) + if norm is not None: + tensor = norm(tensor) + # Assert tensor_dim in [3, 4] + if tensor_dim == 4: + pass + else: # tensor_dim == 3: + tensor = tensor.squeeze(1) + return tensor, thw_shape + + +def get_rel_pos(rel_pos, d): + if isinstance(d, int): + ori_d = rel_pos.shape[0] + if ori_d == d: + return rel_pos + else: + # Interpolate rel pos. + new_pos_embed = F.interpolate( + rel_pos.reshape(1, ori_d, -1).permute(0, 2, 1), + size=d, + mode='linear', + ) + + return new_pos_embed.reshape(-1, d).permute(1, 0) + + +def cal_rel_pos_spatial(attn, q, k, has_cls_embed, q_shape, k_shape, rel_pos_h, + rel_pos_w): + """ + Decomposed Spatial Relative Positional Embeddings. 
+ """ + sp_idx = 1 if has_cls_embed else 0 + q_t, q_h, q_w = q_shape + k_t, k_h, k_w = k_shape + dh = int(2 * max(q_h, k_h) - 1) + dw = int(2 * max(q_w, k_w) - 1) + + # Scale up rel pos if shapes for q and k are different. + q_h_ratio = max(k_h / q_h, 1.0) + k_h_ratio = max(q_h / k_h, 1.0) + dist_h = ( + torch.arange(q_h)[:, None] * q_h_ratio + - torch.arange(k_h)[None, :] * k_h_ratio) + dist_h += (k_h - 1) * k_h_ratio + q_w_ratio = max(k_w / q_w, 1.0) + k_w_ratio = max(q_w / k_w, 1.0) + dist_w = ( + torch.arange(q_w)[:, None] * q_w_ratio + - torch.arange(k_w)[None, :] * k_w_ratio) + dist_w += (k_w - 1) * k_w_ratio + + # Intepolate rel pos if needed. + rel_pos_h = get_rel_pos(rel_pos_h, dh) + rel_pos_w = get_rel_pos(rel_pos_w, dw) + Rh = rel_pos_h[dist_h.long()] + Rw = rel_pos_w[dist_w.long()] + + B, n_head, q_N, dim = q.shape + + r_q = q[:, :, sp_idx:].reshape(B, n_head, q_t, q_h, q_w, dim) + rel_h_q = torch.einsum('bythwc,hkc->bythwk', r_q, + Rh) # [B, H, q_t, qh, qw, k_h] + rel_w_q = torch.einsum('bythwc,wkc->bythwk', r_q, + Rw) # [B, H, q_t, qh, qw, k_w] + + attn[:, :, sp_idx:, sp_idx:] = ( + attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_t, q_h, q_w, k_t, k_h, k_w) + + rel_h_q[:, :, :, :, :, None, :, None] + + rel_w_q[:, :, :, :, :, None, None, :]).view(B, -1, q_t * q_h * q_w, + k_t * k_h * k_w) + + return attn + + +def cal_rel_pos_temporal(attn, q, has_cls_embed, q_shape, k_shape, rel_pos_t): + """ + Temporal Relative Positional Embeddings. + """ + sp_idx = 1 if has_cls_embed else 0 + q_t, q_h, q_w = q_shape + k_t, k_h, k_w = k_shape + dt = int(2 * max(q_t, k_t) - 1) + # Intepolate rel pos if needed. + rel_pos_t = get_rel_pos(rel_pos_t, dt) + + # Scale up rel pos if shapes for q and k are different. + q_t_ratio = max(k_t / q_t, 1.0) + k_t_ratio = max(q_t / k_t, 1.0) + dist_t = ( + torch.arange(q_t)[:, None] * q_t_ratio + - torch.arange(k_t)[None, :] * k_t_ratio) + dist_t += (k_t - 1) * k_t_ratio + Rt = rel_pos_t[dist_t.long()] + + B, n_head, q_N, dim = q.shape + + r_q = q[:, :, sp_idx:].reshape(B, n_head, q_t, q_h, q_w, dim) + # [B, H, q_t, q_h, q_w, dim] -> [q_t, B, H, q_h, q_w, dim] -> [q_t, B*H*q_h*q_w, dim] + r_q = r_q.permute(2, 0, 1, 3, 4, 5).reshape(q_t, B * n_head * q_h * q_w, + dim) + + # [q_t, B*H*q_h*q_w, dim] * [q_t, dim, k_t] = [q_t, B*H*q_h*q_w, k_t] -> [B*H*q_h*q_w, q_t, k_t] + rel = torch.matmul(r_q, Rt.transpose(1, 2)).transpose(0, 1) + # [B*H*q_h*q_w, q_t, k_t] -> [B, H, q_t, q_h, q_w, k_t] + rel = rel.view(B, n_head, q_h, q_w, q_t, k_t).permute(0, 1, 4, 2, 3, 5) + + attn[:, :, sp_idx:, sp_idx:] = ( + attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_t, q_h, q_w, k_t, k_h, k_w) + + rel[:, :, :, :, :, :, None, None]).view(B, -1, q_t * q_h * q_w, + k_t * k_h * k_w) + + return attn + + +class MultiScaleAttention(nn.Module): + + def __init__( + self, + dim, + dim_out, + input_size, + num_heads=8, + qkv_bias=False, + drop_rate=0.0, + kernel_q=(1, 1, 1), + kernel_kv=(1, 1, 1), + stride_q=(1, 1, 1), + stride_kv=(1, 1, 1), + norm_layer=nn.LayerNorm, + has_cls_embed=True, + # Options include `conv`, `avg`, and `max`. + mode='conv', + # If True, perform pool before projection. 
+ pool_first=False, + rel_pos_spatial=False, + rel_pos_temporal=False, + rel_pos_zero_init=False, + residual_pooling=True, + separate_qkv=False, + ): + super().__init__() + self.pool_first = pool_first + self.separate_qkv = separate_qkv + self.drop_rate = drop_rate + self.num_heads = num_heads + self.dim_out = dim_out + head_dim = dim_out // num_heads + self.scale = head_dim**-0.5 + self.has_cls_embed = has_cls_embed + padding_q = [int(q // 2) for q in kernel_q] + padding_kv = [int(kv // 2) for kv in kernel_kv] + + if pool_first or separate_qkv: + self.q = nn.Linear(dim, dim_out, bias=qkv_bias) + self.k = nn.Linear(dim, dim_out, bias=qkv_bias) + self.v = nn.Linear(dim, dim_out, bias=qkv_bias) + else: + self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias) + + self.proj = nn.Linear(dim_out, dim_out) + if drop_rate > 0.0: + self.proj_drop = nn.Dropout(drop_rate) + + # Skip pooling with kernel and stride size of (1, 1, 1). + if np.prod(kernel_q) == 1 and np.prod(stride_q) == 1: + kernel_q = () + if np.prod(kernel_kv) == 1 and np.prod(stride_kv) == 1: + kernel_kv = () + self.mode = mode + + if mode in ('avg', 'max'): + pool_op = nn.MaxPool3d if mode == 'max' else nn.AvgPool3d + self.pool_q = ( + pool_op(kernel_q, stride_q, padding_q, ceil_mode=False) + if len(kernel_q) > 0 else None) + self.pool_k = ( + pool_op(kernel_kv, stride_kv, padding_kv, ceil_mode=False) + if len(kernel_kv) > 0 else None) + self.pool_v = ( + pool_op(kernel_kv, stride_kv, padding_kv, ceil_mode=False) + if len(kernel_kv) > 0 else None) + elif mode == 'conv' or mode == 'conv_unshared': + if pool_first: + dim_conv = dim // num_heads if mode == 'conv' else dim + else: + dim_conv = dim_out // num_heads if mode == 'conv' else dim_out + self.pool_q = ( + nn.Conv3d( + dim_conv, + dim_conv, + kernel_q, + stride=stride_q, + padding=padding_q, + groups=dim_conv, + bias=False, + ) if len(kernel_q) > 0 else None) + self.norm_q = norm_layer(dim_conv) if len(kernel_q) > 0 else None + self.pool_k = ( + nn.Conv3d( + dim_conv, + dim_conv, + kernel_kv, + stride=stride_kv, + padding=padding_kv, + groups=dim_conv, + bias=False, + ) if len(kernel_kv) > 0 else None) + self.norm_k = norm_layer(dim_conv) if len(kernel_kv) > 0 else None + self.pool_v = ( + nn.Conv3d( + dim_conv, + dim_conv, + kernel_kv, + stride=stride_kv, + padding=padding_kv, + groups=dim_conv, + bias=False, + ) if len(kernel_kv) > 0 else None) + self.norm_v = norm_layer(dim_conv) if len(kernel_kv) > 0 else None + else: + raise NotImplementedError(f'Unsupported model {mode}') + + self.rel_pos_spatial = rel_pos_spatial + self.rel_pos_temporal = rel_pos_temporal + if self.rel_pos_spatial: + assert input_size[1] == input_size[2] + size = input_size[1] + q_size = size // stride_q[1] if len(stride_q) > 0 else size + kv_size = size // stride_kv[1] if len(stride_kv) > 0 else size + rel_sp_dim = 2 * max(q_size, kv_size) - 1 + + self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, head_dim)) + if not rel_pos_zero_init: + trunc_normal_(self.rel_pos_h, std=0.02) + trunc_normal_(self.rel_pos_w, std=0.02) + if self.rel_pos_temporal: + self.rel_pos_t = nn.Parameter( + torch.zeros(2 * input_size[0] - 1, head_dim)) + # if not rel_pos_zero_init: + # trunc_normal_(self.rel_pos_t, std=0.02) + + self.residual_pooling = residual_pooling + + def forward(self, x, thw_shape): + B, N, _ = x.shape + + if self.pool_first: + if self.mode == 'conv_unshared': + fold_dim = 1 + else: + fold_dim = self.num_heads + x = x.reshape(B, N, fold_dim, 
-1).permute(0, 2, 1, 3) + q = k = v = x + else: + assert self.mode != 'conv_unshared' + if not self.separate_qkv: + qkv = ( + self.qkv(x).reshape(B, N, 3, self.num_heads, + -1).permute(2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + else: + q = k = v = x + q = ( + self.q(q).reshape(B, N, self.num_heads, + -1).permute(0, 2, 1, 3)) + k = ( + self.k(k).reshape(B, N, self.num_heads, + -1).permute(0, 2, 1, 3)) + v = ( + self.v(v).reshape(B, N, self.num_heads, + -1).permute(0, 2, 1, 3)) + + q, q_shape = attention_pool( + q, + self.pool_q, + thw_shape, + has_cls_embed=self.has_cls_embed, + norm=self.norm_q if hasattr(self, 'norm_q') else None, + ) + k, k_shape = attention_pool( + k, + self.pool_k, + thw_shape, + has_cls_embed=self.has_cls_embed, + norm=self.norm_k if hasattr(self, 'norm_k') else None, + ) + v, v_shape = attention_pool( + v, + self.pool_v, + thw_shape, + has_cls_embed=self.has_cls_embed, + norm=self.norm_v if hasattr(self, 'norm_v') else None, + ) + + if self.pool_first: + q_N = ( + np.prod(q_shape) + + 1 if self.has_cls_embed else np.prod(q_shape)) + k_N = ( + np.prod(k_shape) + + 1 if self.has_cls_embed else np.prod(k_shape)) + v_N = ( + np.prod(v_shape) + + 1 if self.has_cls_embed else np.prod(v_shape)) + + q = q.permute(0, 2, 1, 3).reshape(B, q_N, -1) + q = ( + self.q(q).reshape(B, q_N, self.num_heads, + -1).permute(0, 2, 1, 3)) + + v = v.permute(0, 2, 1, 3).reshape(B, v_N, -1) + v = ( + self.v(v).reshape(B, v_N, self.num_heads, + -1).permute(0, 2, 1, 3)) + + k = k.permute(0, 2, 1, 3).reshape(B, k_N, -1) + k = ( + self.k(k).reshape(B, k_N, self.num_heads, + -1).permute(0, 2, 1, 3)) + + N = q.shape[2] + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_spatial: + attn = cal_rel_pos_spatial( + attn, + q, + k, + self.has_cls_embed, + q_shape, + k_shape, + self.rel_pos_h, + self.rel_pos_w, + ) + + if self.rel_pos_temporal: + attn = cal_rel_pos_temporal( + attn, + q, + self.has_cls_embed, + q_shape, + k_shape, + self.rel_pos_t, + ) + attn = attn.softmax(dim=-1) + + x = attn @ v + + if self.residual_pooling: + # Minor Difference + if self.has_cls_embed: + x[:, :, 1:, :] += q[:, :, 1:, :] + else: + x = x + q + + x = x.transpose(1, 2).reshape(B, -1, self.dim_out) + x = self.proj(x) + + if self.drop_rate > 0.0: + x = self.proj_drop(x) + return x, q_shape + + +class MultiScaleBlock(nn.Module): + + def __init__( + self, + dim, + dim_out, + num_heads, + input_size, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop_rate=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + up_rate=None, + kernel_q=(1, 1, 1), + kernel_kv=(1, 1, 1), + stride_q=(1, 1, 1), + stride_kv=(1, 1, 1), + mode='conv', + has_cls_embed=True, + pool_first=False, + rel_pos_spatial=False, + rel_pos_temporal=False, + rel_pos_zero_init=False, + residual_pooling=True, + dim_mul_in_att=False, + separate_qkv=False, + use_grad_checkpoint=False, + ): + super().__init__() + self.dim = dim + self.dim_out = dim_out + self.norm1 = norm_layer(dim) + self.dim_mul_in_att = dim_mul_in_att + kernel_skip = [s + 1 if s > 1 else s for s in stride_q] + stride_skip = stride_q + padding_skip = [int(skip // 2) for skip in kernel_skip] + att_dim = dim_out if dim_mul_in_att else dim + + self.use_grad_checkpoint = use_grad_checkpoint + + self.attn = MultiScaleAttention( + dim, + att_dim, + num_heads=num_heads, + input_size=input_size, + qkv_bias=qkv_bias, + drop_rate=drop_rate, + kernel_q=kernel_q, + kernel_kv=kernel_kv, + stride_q=stride_q, + stride_kv=stride_kv, + norm_layer=norm_layer, + 
has_cls_embed=has_cls_embed, + mode=mode, + pool_first=pool_first, + rel_pos_spatial=rel_pos_spatial, + rel_pos_temporal=rel_pos_temporal, + rel_pos_zero_init=rel_pos_zero_init, + residual_pooling=residual_pooling, + separate_qkv=separate_qkv, + ) + self.drop_path = ( + DropPath(drop_path) if drop_path > 0.0 else nn.Identity()) + self.norm2 = norm_layer(att_dim) + mlp_hidden_dim = int(att_dim * mlp_ratio) + self.has_cls_embed = has_cls_embed + # TODO: check the use case for up_rate, and merge the following lines + if up_rate is not None and up_rate > 1: + mlp_dim_out = dim * up_rate + else: + mlp_dim_out = dim_out + self.mlp = Mlp( + in_features=att_dim, + hidden_features=mlp_hidden_dim, + out_features=mlp_dim_out, + act_layer=act_layer, + drop_rate=drop_rate, + ) + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + self.pool_skip = ( + nn.MaxPool3d( + kernel_skip, stride_skip, padding_skip, ceil_mode=False) + if len(kernel_skip) > 0 else None) + + def forward(self, x, thw_shape): + x_norm = self.norm1(x) + if self.use_grad_checkpoint: + x_block, thw_shape_new = checkpoint.checkpoint( + self.attn, x_norm, thw_shape) + else: + x_block, thw_shape_new = self.attn(x_norm, thw_shape) + + if self.dim_mul_in_att and self.dim != self.dim_out: + x = self.proj(x_norm) + x_res, _ = attention_pool( + x, self.pool_skip, thw_shape, has_cls_embed=self.has_cls_embed) + x = x_res + self.drop_path(x_block) + x_norm = self.norm2(x) + if self.use_grad_checkpoint: + x_mlp = checkpoint.checkpoint(self.mlp, x_norm) + else: + x_mlp = self.mlp(x_norm) + + if not self.dim_mul_in_att and self.dim != self.dim_out: + x = self.proj(x_norm) + x = x + self.drop_path(x_mlp) + return x, thw_shape_new + + +class MViTv2(nn.Module): + """ + Improved Multiscale Vision Transformers for Classification and Detection + Yanghao Li*, Chao-Yuan Wu*, Haoqi Fan, Karttikeya Mangalam, Bo Xiong, Jitendra Malik, + Christoph Feichtenhofer* + https://arxiv.org/abs/2112.01526 + Multiscale Vision Transformers + Haoqi Fan*, Bo Xiong*, Karttikeya Mangalam*, Yanghao Li*, Zhicheng Yan, Jitendra Malik, + Christoph Feichtenhofer* + https://arxiv.org/abs/2104.11227 + """ + + def __init__( + self, + img_size=224, + embed_dim=96, + num_classes=1000, + num_frames=4, + num_heads=1, + depth=24, + patch_kernel=[3, 7, 7], + patch_stride=[2, 4, 4], + patch_padding=[1, 3, 3], + config=None, + dropout_rate=0., + drop_path_rate=0., + mlp_ratio=4., + qkv_bias=True, + mode='conv', + cls_embed_on=True, + use_abs_pos=False, + rel_pos_spatial=True, + rel_pos_temporal=True, + rel_pos_zero_init=False, + residual_pooling=True, + dim_mul_in_att=True, + pool_first=False, + zero_decay_pos_cls=False, + separate_qkv=False, + norm_stem=False, + sep_pos_embed=False, + use_grad_checkpoint=True, + ): + super().__init__() + # Prepare input. + in_chans = 3 + self.img_size = img_size + # Prepare output. + self.num_classes = num_classes + self.embed_dim = embed_dim + # MViT params. 
+ self.num_heads = num_heads + self.depth = depth + self.cls_embed_on = cls_embed_on + self.use_abs_pos = use_abs_pos + self.zero_decay_pos_cls = zero_decay_pos_cls + self.use_grad_checkpoint = use_grad_checkpoint + self.sep_pos_embed = sep_pos_embed + self.drop_rate = dropout_rate + + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + if use_grad_checkpoint: + self.patch_embed = checkpoint_wrapper( + PatchEmbed( + dim_in=in_chans, + dim_out=embed_dim, + kernel=patch_kernel, + stride=patch_stride, + padding=patch_padding, + )) + else: + self.patch_embed = PatchEmbed( + dim_in=in_chans, + dim_out=embed_dim, + kernel=patch_kernel, + stride=patch_stride, + padding=patch_padding, + ) + + patch_dims = [ + num_frames // patch_stride[0], + img_size // patch_stride[1], + img_size // patch_stride[2], + ] + num_patches = np.prod(patch_dims) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + if self.cls_embed_on: + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + pos_embed_dim = num_patches + 1 + else: + pos_embed_dim = num_patches + + if self.use_abs_pos: + self.pos_embed = nn.Parameter( + torch.zeros(1, pos_embed_dim, embed_dim)) + + if self.use_abs_pos: + if self.sep_pos_embed: + self.pos_embed_spatial = nn.Parameter( + torch.zeros(1, self.patch_dims[1] * self.patch_dims[2], + embed_dim)) + self.pos_embed_temporal = nn.Parameter( + torch.zeros(1, self.patch_dims[0], embed_dim)) + if self.cls_embed_on: + self.pos_embed_class = nn.Parameter( + torch.zeros(1, 1, embed_dim)) + else: + self.pos_embed = nn.Parameter( + torch.zeros(1, pos_embed_dim, embed_dim)) + + assert config is not None + # MViT backbone configs + dim_mul, head_mul, pool_q, pool_kv, stride_q, stride_kv = _prepare_mvit_configs( + config) + input_size = patch_dims + + self.norm_stem = norm_layer(embed_dim) if norm_stem else None + + self.blocks = nn.ModuleList() + for i in range(depth): + num_heads = round_width(num_heads, head_mul[i]) + if dim_mul_in_att: + dim_out = round_width( + embed_dim, + dim_mul[i], + divisor=round_width(num_heads, head_mul[i]), + ) + else: + dim_out = round_width( + embed_dim, + dim_mul[i + 1], + divisor=round_width(num_heads, head_mul[i + 1]), + ) + attention_block = MultiScaleBlock( + dim=embed_dim, + dim_out=dim_out, + num_heads=num_heads, + input_size=input_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop_rate=self.drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + kernel_q=pool_q[i] if len(pool_q) > i else [], + kernel_kv=pool_kv[i] if len(pool_kv) > i else [], + stride_q=stride_q[i] if len(stride_q) > i else [], + stride_kv=stride_kv[i] if len(stride_kv) > i else [], + mode=mode, + has_cls_embed=self.cls_embed_on, + pool_first=pool_first, + rel_pos_spatial=rel_pos_spatial, + rel_pos_temporal=rel_pos_temporal, + rel_pos_zero_init=rel_pos_zero_init, + residual_pooling=residual_pooling, + dim_mul_in_att=dim_mul_in_att, + separate_qkv=separate_qkv, + use_grad_checkpoint=False) + if use_grad_checkpoint: + attention_block = checkpoint_wrapper( + attention_block, offload_to_cpu=False) + self.blocks.append(attention_block) + + if len(stride_q[i]) > 0: + input_size = [ + size // stride + for size, stride in zip(input_size, stride_q[i]) + ] + embed_dim = dim_out + + self.norm = norm_layer(embed_dim) + + self.head = nn.Identity() + + if self.use_abs_pos: + if self.sep_pos_embed: + trunc_normal_(self.pos_embed_spatial, std=0.02) + trunc_normal_(self.pos_embed_temporal, std=0.02) + if self.cls_embed_on: + 
trunc_normal_(self.pos_embed_class, std=0.02) + else: + trunc_normal_(self.pos_embed, std=0.02) + if self.cls_embed_on: + trunc_normal_(self.cls_token, std=0.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + names = [] + if self.zero_decay_pos_cls: + if self.use_abs_pos: + if self.sep_pos_embed: + names.extend([ + 'pos_embed_spatial', + 'pos_embed_temporal', + 'pos_embed_class', + ]) + else: + names.append(['pos_embed']) + if self.rel_pos_spatial: + names.extend(['rel_pos_h', 'rel_pos_w', 'rel_pos_hw']) + if self.rel_pos_temporal: + names.extend(['rel_pos_t']) + if self.cls_embed_on: + names.append('cls_token') + + return names + + def _get_pos_embed(self, pos_embed, bcthw): + t, h, w = bcthw[-3], bcthw[-2], bcthw[-1] + if self.cls_embed_on: + cls_pos_embed = pos_embed[:, 0:1, :] + pos_embed = pos_embed[:, 1:] + txy_num = pos_embed.shape[1] + p_t, p_h, p_w = self.patch_dims + assert p_t * p_h * p_w == txy_num + + if (p_t, p_h, p_w) != (t, h, w): + new_pos_embed = F.interpolate( + pos_embed[:, :, :].reshape(1, p_t, p_h, p_w, + -1).permute(0, 4, 1, 2, 3), + size=(t, h, w), + mode='trilinear', + ) + pos_embed = new_pos_embed.reshape(1, -1, + t * h * w).permute(0, 2, 1) + + if self.cls_embed_on: + pos_embed = torch.cat((cls_pos_embed, pos_embed), dim=1) + + return pos_embed + + def forward_features(self, x): + x = x.permute(0, 2, 1, 3, 4) + x, bcthw = self.patch_embed(x) + + T, H, W = bcthw[-3], bcthw[-2], bcthw[-1] + B, N, C = x.shape + + if self.cls_embed_on: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + if self.use_abs_pos: + if self.sep_pos_embed: + pos_embed = self.pos_embed_spatial.repeat( + 1, self.patch_dims[0], 1) + torch.repeat_interleave( + self.pos_embed_temporal, + self.patch_dims[1] * self.patch_dims[2], + dim=1) + if self.cls_embed_on: + pos_embed = torch.cat([self.pos_embed_class, pos_embed], 1) + pos_embed = self._get_pos_embed(pos_embed, bcthw) + x = x + pos_embed + else: + pos_embed = self._get_pos_embed(self.pos_embed, bcthw) + x = x + pos_embed + + if self.drop_rate: + x = self.pos_drop(x) + + if self.norm_stem: + x = self.norm_stem(x) + + thw = [T, H, W] + for blk in self.blocks: + x, thw = blk(x, thw) + + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x diff --git a/modelscope/models/multi_modal/mplug_for_all_tasks.py b/modelscope/models/multi_modal/mplug_for_all_tasks.py index 4d2a6ac2..f15b69d2 100644 --- a/modelscope/models/multi_modal/mplug_for_all_tasks.py +++ b/modelscope/models/multi_modal/mplug_for_all_tasks.py @@ -11,7 +11,7 @@ from modelscope.outputs import OutputKeys from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks -__all__ = ['MPlugForAllTasks'] +__all__ = ['MPlugForAllTasks', 'HiTeAForAllTasks'] @MODELS.register_module( @@ -81,3 +81,69 @@ class MPlugForAllTasks(TorchModel): # evaluate topk_ids, _ = output return {'sequences': [list_tensor[0] for list_tensor in topk_ids]} + + +@MODELS.register_module( + Tasks.video_question_answering, module_name=Models.hitea) +@MODELS.register_module(Tasks.video_captioning, 
module_name=Models.hitea) +class HiTeAForAllTasks(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the hitea model from the `model_dir` path. + Args: + model_dir (str): the model path. + """ + + super().__init__(model_dir, *args, **kwargs) + from modelscope.models.multi_modal.mplug import HiTeA + self.model = HiTeA.from_pretrained(model_dir) + self.tokenizer = self.model.tokenizer + + def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + """return the result by the model + + Args: + input (Dict[str, Tensor]): the preprocessed data + + Returns: + Dict[str, Tensor]: results + Example: + { + 'predictions': Tensor([[1377, 4959, 2785, 6392...])]), + } + """ + + # get task from config file + task = Config.from_file( + osp.join(self.model_dir, ModelFile.CONFIGURATION)).task + + # inference + if not self.training and 'question' in input: + output = self.model(input['video'], input['question'], train=False) + topk_ids, _ = output + pred_string: List[str] = \ + self.tokenizer.decode(topk_ids[0][0], skip_special_tokens=True) + output_key = OutputKeys.CAPTION \ + if task == Tasks.video_captioning else OutputKeys.TEXT + return {output_key: pred_string} + + # train and evaluate + import addict + video = input['video'] + answer = addict.Dict( + input_ids=input['answer_input_ids'], + attention_mask=input['answer_attention_mask']) + if 'index' not in input: + question = addict.Dict( + input_ids=input['question_input_ids'], + attention_mask=input['question_attention_mask']) + output = self.model(video, question, answer, train=self.training) + else: + index = input['index'] + output = self.model(video, answer, index, train=self.training) + if self.training: + return {OutputKeys.LOSS: output} + + # evaluate + topk_ids, _ = output + return {'sequences': [list_tensor[0] for list_tensor in topk_ids]} diff --git a/modelscope/models/multi_modal/multi_stage_diffusion/model.py b/modelscope/models/multi_modal/multi_stage_diffusion/model.py index 05ddc6a5..5912df7b 100644 --- a/modelscope/models/multi_modal/multi_stage_diffusion/model.py +++ b/modelscope/models/multi_modal/multi_stage_diffusion/model.py @@ -26,6 +26,7 @@ from modelscope.models.multi_modal.multi_stage_diffusion.upsampler import ( Upsampler256, Upsampler1024) from modelscope.models.multi_modal.multi_stage_diffusion.xglm import XGLM from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.device import create_device from modelscope.utils.logger import get_logger logger = get_logger() @@ -309,23 +310,17 @@ class UnCLIP(nn.Module): Tasks.text_to_image_synthesis, module_name=Models.multi_stage_diffusion) class MultiStageDiffusionForTextToImageSynthesis(TorchModel): - def __init__(self, model_dir, device_id=-1): - super().__init__(model_dir=model_dir, device_id=device_id) + def __init__(self, model_dir, device='gpu'): + device = 'gpu' if torch.cuda.is_available() else 'cpu' + super().__init__(model_dir=model_dir, device=device) model = UnCLIP(model_dir=model_dir) pretrained_params = torch.load( osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE), 'cpu') model.load_state_dict(pretrained_params) model.eval() - self.device_id = device_id - if self.device_id >= 0: - self.device = torch.device(f'cuda:{self.device_id}') - model.to('cuda:{}'.format(self.device_id)) - logger.info('Use GPU: {}'.format(self.device_id)) - else: - self.device = torch.device('cpu') - logger.info('Use CPU for inference') - self.model = model + self.device = create_device(device) + self.model = model.to(self.device) 
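# The rewritten constructor above delegates device resolution to modelscope's create_device
# helper. A minimal sketch of the equivalent fallback logic (pick_device is a hypothetical
# stand-in, not part of this diff):
import torch

def pick_device(device: str = 'gpu') -> torch.device:
    # ask for the GPU but quietly fall back to CPU when CUDA is unavailable
    if device == 'gpu' and torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')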
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: if not isinstance(input, dict): diff --git a/modelscope/models/multi_modal/ofa/configuration_mmspeech.py b/modelscope/models/multi_modal/ofa/configuration_mmspeech.py index 4793ee7f..48240877 100644 --- a/modelscope/models/multi_modal/ofa/configuration_mmspeech.py +++ b/modelscope/models/multi_modal/ofa/configuration_mmspeech.py @@ -73,7 +73,81 @@ class MMSpeechConfig(PretrainedConfig): for more details. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - """ + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether or not the model is used as an encoder/decoder. + scale_embedding (`bool`, *optional*, defaults to `False`): + Whether or not to scale the embedding. If True, embedding = Math.qrt(d_model) * embedding + pad_token_id (`int`, *optional*, defaults to `1`): + The id of the _padding_ token. + bos_token_id (`int`, *optional*, defaults to `0`): + The id of the _beginning-of-stream_ token. + decoder_start_token_id (`int`, *optional*, defaults to `0`): + If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. + eos_token_id (`int`, *optional*, defaults to `2`): + The id of the _end-of-stream_ token. + forced_eos_token_id (`int`, *optional*, defaults to `2`): + The id of the token to force as the last generated token when `max_length` is reached. + encoder_normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization before the self attention and fc layer within encoder layer + decoder_normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization before the self attention and fc layer within decoder layer + normformer (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization between the self attention layers and fc layer within + encoder&decoder layer + encoder_drop_path_rate (`float`, *optional*, defaults to `0.0`): + The drop path rate using in the encoder. see more about drop path [drop path](https://arxiv.org/abs/1605.07648) + decoder_drop_path_rate (`float`, *optional*, defaults to `0.0`): + The drop path rate using in the decoder. see more about drop path [drop path](https://arxiv.org/abs/1605.07648) + layernorm_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization for text input embedding in encoder and decoder. + patch_layernorm_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization for image patch input embedding in encoder and decoder. + entangle_position_embedding (`bool`, *optional*, defaults to `False`): + Whether or not to entangle position embedding to input embedding. + resnet_type (`str`, *optional*, defaults to `"resnet101"`): + The image encoder's type in OFA, only works when use_ofasys=False. `"resnet18"`, `"resnet34"`, + `"resnet50"`, `"resnet101"` and `"resnet152"` are supported. + resnet_model_path (`str`, *optional*, defaults to `None`): + The path where can load resnet model. If None, will use random initialized weights. + resnet_drop_path_rate + The drop path rate using in resnet for image encoding, see more about drop path + [drop path](https://arxiv.org/abs/1605.07648) + token_bucket_size (`int`, *optional*, defaults to `256`): + The number of token buckets to use for each attention layer. 
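# Sketch of the scale_embedding option documented in this hunk ("Math.qrt" in the docstring is
# presumably math.sqrt); the values and the embed_tokens module below are illustrative only:
import math
import torch
from torch import nn

d_model, vocab_size, scale_embedding = 768, 1000, True
embed_tokens = nn.Embedding(vocab_size, d_model)
embed_scale = math.sqrt(d_model) if scale_embedding else 1.0
token_embedding = embed_scale * embed_tokens(torch.tensor([[0, 5, 2]]))  # scaled input embeddings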
+ image_bucket_size (`int`, *optional*, defaults to `42`): + The number of image buckets to use for each attention layer. + add_type_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to add type embedding to the input while encoding. So far, type means the type of modality, + and only Text&Image modalities is supported, `0`=Text, `1`=Image + share_decoder_input_output_embed (`bool`, *optional*, defaults to `True`): + Whether or not to share the input embedding table as the weights the output projection in decoder. If False, + using a new linear projection. + attn_scale_factor (`float`, *optional*, defaults to `2.0`): + The position embedding scaling factor. If it works, + position_embedding = position_embedding * float(d_model / num_attention_heads * attn_scale_factor)**-0.5 + code_layernorm_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to user layer normalization for code generation + code_image_size (`int`, *optional*, defaults to `128`): + Image size of generated images. Also used in calculating the image's position id for attention bias. + interpolate_position (`bool`, *optional*, defaults to `False`): + Deprecated now, will be deleted in next version. + orig_patch_image_size (`int`, *optional*, defaults to `224`): + Deprecated now, will be deleted in next version. + share_attn_bias (`bool`, *optional*, defaults to `False`): + Whether or not to share attn_bias cross transformer layers + use_image_feature (`bool`, *optional*, defaults to `True`): + Whether or not the model have image modality. + disable_entangle (`bool`, *optional*, defaults to `False`): + Whether or not to disable the entangle relative configs. + use_ofasys (`bool`, *optional*, defaults to `False`): + Whether or not the model is come from OFA-Sys. If True, the model structure will be some differences from OFA + vit_type (`str`, *optional*, defaults to `"vit_base"`): + The image encoder's type in OFA-Sys, only works when use_ofasys=True. `"vit_base"`, `"vit_large"`, + `"vit_large_336"` and `"vit_huge"` are supported. + vit_drop_path_rate + The drop path rate using the image encoder vit. see more about drop path + [drop path](https://arxiv.org/abs/1605.07648) + """ # noqa model_type = 'ofa' keys_to_ignore_at_inference = ['past_key_values'] @@ -135,6 +209,12 @@ class MMSpeechConfig(PretrainedConfig): use_ofasys=False, vit_type='vit_base', vit_drop_path_rate=0.0, + use_gamma_feature=False, + gamma=1.0, + exclude_mlp=True, + temperature_init_value=None, + remove_decoder_type_embedding=False, + mlp_dim=512, required_seq_len_multiple=2, encoder_pos_conv_depth=5, encoder_conv_pos=95, @@ -205,6 +285,15 @@ class MMSpeechConfig(PretrainedConfig): self.use_ofasys = use_ofasys self.vit_type = vit_type self.vit_drop_path_rate = vit_drop_path_rate + self.use_gamma_feature = use_gamma_feature + + # add some new features from ofa + self.use_gamma_feature = use_gamma_feature + self.gamma = gamma + self.exclude_mlp = exclude_mlp + self.temperature_init_value = temperature_init_value + self.remove_decoder_type_embedding = remove_decoder_type_embedding + self.mlp_dim = mlp_dim # FP16 optimization self.required_seq_len_multiple = required_seq_len_multiple diff --git a/modelscope/models/multi_modal/ofa/configuration_ofa.py b/modelscope/models/multi_modal/ofa/configuration_ofa.py index e82b542e..c520db34 100644 --- a/modelscope/models/multi_modal/ofa/configuration_ofa.py +++ b/modelscope/models/multi_modal/ofa/configuration_ofa.py @@ -80,7 +80,81 @@ class OFAConfig(PretrainedConfig): for more details. 
use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - """ + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Whether or not the model is used as an encoder/decoder. + scale_embedding (`bool`, *optional*, defaults to `False`): + Whether or not to scale the embedding. If True, embedding = Math.qrt(d_model) * embedding + pad_token_id (`int`, *optional*, defaults to `1`): + The id of the _padding_ token. + bos_token_id (`int`, *optional*, defaults to `0`): + The id of the _beginning-of-stream_ token. + decoder_start_token_id (`int`, *optional*, defaults to `0`): + If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. + eos_token_id (`int`, *optional*, defaults to `2`): + The id of the _end-of-stream_ token. + forced_eos_token_id (`int`, *optional*, defaults to `2`): + The id of the token to force as the last generated token when `max_length` is reached. + encoder_normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization before the self attention and fc layer within encoder layer + decoder_normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization before the self attention and fc layer within decoder layer + normformer (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization between the self attention layers and fc layer within + encoder&decoder layer + encoder_drop_path_rate (`float`, *optional*, defaults to `0.0`): + The drop path rate using in the encoder. see more about drop path [drop path](https://arxiv.org/abs/1605.07648) + decoder_drop_path_rate (`float`, *optional*, defaults to `0.0`): + The drop path rate using in the decoder. see more about drop path [drop path](https://arxiv.org/abs/1605.07648) + layernorm_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization for text input embedding in encoder and decoder. + patch_layernorm_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization for image patch input embedding in encoder and decoder. + entangle_position_embedding (`bool`, *optional*, defaults to `False`): + Whether or not to entangle position embedding to input embedding. + resnet_type (`str`, *optional*, defaults to `"resnet101"`): + The image encoder's type in OFA, only works when use_ofasys=False. `"resnet18"`, `"resnet34"`, + `"resnet50"`, `"resnet101"` and `"resnet152"` are supported. + resnet_model_path (`str`, *optional*, defaults to `None`): + The path where can load resnet model. If None, will use random initialized weights. + resnet_drop_path_rate + The drop path rate using in resnet for image encoding, see more about drop path + [drop path](https://arxiv.org/abs/1605.07648) + token_bucket_size (`int`, *optional*, defaults to `256`): + The number of token buckets to use for each attention layer. + image_bucket_size (`int`, *optional*, defaults to `42`): + The number of image buckets to use for each attention layer. + add_type_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to add type embedding to the input while encoding. 
So far, type means the type of modality, + and only the Text & Image modalities are supported, `0`=Text, `1`=Image + share_decoder_input_output_embed (`bool`, *optional*, defaults to `True`): + Whether or not to share the input embedding table as the weights of the output projection in the decoder. + If False, a new linear projection is used. + attn_scale_factor (`float`, *optional*, defaults to `2.0`): + The position embedding scaling factor. When applied, + position_embedding = position_embedding * float(d_model / num_attention_heads * attn_scale_factor)**-0.5 + code_layernorm_embedding (`bool`, *optional*, defaults to `True`): + Whether or not to use layer normalization for code generation. + code_image_size (`int`, *optional*, defaults to `128`): + Image size of generated images. Also used in calculating the image's position id for attention bias. + interpolate_position (`bool`, *optional*, defaults to `False`): + Deprecated now, will be deleted in the next version. + orig_patch_image_size (`int`, *optional*, defaults to `224`): + Deprecated now, will be deleted in the next version. + share_attn_bias (`bool`, *optional*, defaults to `False`): + Whether or not to share attn_bias across transformer layers. + use_image_feature (`bool`, *optional*, defaults to `True`): + Whether or not the model has an image modality. + disable_entangle (`bool`, *optional*, defaults to `False`): + Whether or not to disable the entangle-related configs. + use_ofasys (`bool`, *optional*, defaults to `False`): + Whether or not the model comes from OFA-Sys. If True, the model structure differs somewhat from OFA. + vit_type (`str`, *optional*, defaults to `"vit_base"`): + The image encoder's type in OFA-Sys, only works when use_ofasys=True. `"vit_base"`, `"vit_large"`, + `"vit_large_336"` and `"vit_huge"` are supported. + vit_drop_path_rate + The drop path rate used in the image encoder (ViT);
see more about drop path + [drop path](https://arxiv.org/abs/1605.07648) + """ # noqa model_type = 'ofa' keys_to_ignore_at_inference = ['past_key_values'] @@ -142,6 +216,12 @@ class OFAConfig(PretrainedConfig): use_ofasys=False, vit_type='vit_base', vit_drop_path_rate=0.0, + use_gamma_feature=False, + gamma=1.0, + exclude_mlp=True, + temperature_init_value=None, + remove_decoder_type_embedding=False, + mlp_dim=512, **kwargs): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -191,6 +271,14 @@ class OFAConfig(PretrainedConfig): self.vit_type = vit_type self.vit_drop_path_rate = vit_drop_path_rate + # add some new features from ofa + self.use_gamma_feature = use_gamma_feature + self.gamma = gamma + self.exclude_mlp = exclude_mlp + self.temperature_init_value = temperature_init_value + self.remove_decoder_type_embedding = remove_decoder_type_embedding + self.mlp_dim = mlp_dim + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/modelscope/models/multi_modal/ofa/generate/search.py b/modelscope/models/multi_modal/ofa/generate/search.py index 0dcaf6b3..f37585a1 100644 --- a/modelscope/models/multi_modal/ofa/generate/search.py +++ b/modelscope/models/multi_modal/ofa/generate/search.py @@ -52,7 +52,7 @@ class Search(nn.Module): original_batch_idxs: (bsz) the tensor with the batch indices, in the range [0, bsz) this is useful in case there has been applied a re-ordering - and we need to know the orignal indices + and we need to know the original indices Return: A tuple of (scores, indices, beams) where: scores: (bsz x output_beam_size) @@ -111,6 +111,13 @@ class Search(nn.Module): class BeamSearch(Search): + r""" + Beam search strategy. + + step 1. Calculate top k candidates in model's log-probability under descending order. While k is the minor of + `beam_size * 2` and `beam_size * vocabulary_size`. + step 2. Modify hypothesis score, relative indices, beam indices for the final result. + """ def __init__(self, tgt_dict): super().__init__(tgt_dict) @@ -125,6 +132,32 @@ class BeamSearch(Search): prev_output_tokens: Optional[Tensor] = None, original_batch_idxs: Optional[Tensor] = None, ): + r""" + Take a single search step. + + Args: + step (`int`): Current step, start with 0. + lprobs (`Tensor` with size `(bsz, input_beam_size, vocab_size)`): + the model's log-probabilities over the vocabulary at the current step. + scores (`Tensor` with size `(bsz, input_beam_size, step - 1)`): + Previous sampling scores for each beam. + prev_output_tokens (`Tensor`, **optional**. default to `None`): + Previous output tokens, no usage in this function, will be deprecated in next version. + original_batch_idxs (`Tensor`, **optional**, default to `None`): + the tensor with the batch indices, in the range [0, bsz) + this is useful in case there has been applied a re-ordering + and we need to know the original indices + + Returns: A tuple of (scores_buf, indices_buf, beams_buf), where: + scores_buf (`Tensor` with size `(bsz, output_beam_size)`): + The model's log-probabilities over the elements selected to sample from. + `output_beam_size` is the minor of `2 * input_beam_size` and `vocab_size - 1`. + which cumulates the score before. + indices_buf (`Tensor` with size `(bsz, output_beam_size)`): + The indices of chosen elements. + beams_buf (`Tensor` with size `(bsz, output_beam_size)`): + The indices of each beam. 
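# A minimal, self-contained sketch of the top-k selection the BeamSearch docstring above
# describes (this mirrors the common fairseq formulation, not necessarily the exact code here):
import torch

bsz, beam_size, vocab_size = 2, 5, 100
lprobs = torch.randn(bsz, beam_size, vocab_size).log_softmax(dim=-1)    # stand-in cumulative scores
k = min(beam_size * 2, beam_size * vocab_size - 1)                      # keep 2*beam_size candidates
scores_buf, flat_indices = torch.topk(lprobs.view(bsz, -1), k=k)        # beams and vocab compete jointly
beams_buf = torch.div(flat_indices, vocab_size, rounding_mode='floor')  # originating beam of each candidate
indices_buf = flat_indices.fmod(vocab_size)                             # token id within the vocabulary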
+ """ bsz, beam_size, vocab_size = lprobs.size() if step == 0: @@ -156,6 +189,15 @@ class BeamSearch(Search): class PrefixConstrainedBeamSearch(Search): + r""" + Prefix constrained beam search. + + step 1. Calculate a mask according to a `prefix_allowed_tokens_fn` + function with input of previous hypothesis tokens and indices. + step 2. Calculate a candidate set of `lprobs` with `lprobs` and mask produced in step 1. + step 3. Just like beam search strategy to generate the hypothesis token. + And the difference is the k in top k function is the minor of `beam_size` and `vocab_size -1` + """ def __init__(self, tgt_dict, prefix_allowed_tokens_fn): super().__init__(tgt_dict) @@ -185,6 +227,31 @@ class PrefixConstrainedBeamSearch(Search): prev_output_tokens: Tensor, original_batch_idxs: Tensor, ): + r""" + Take a single search step. + + Args: + step (`int`): Current step, start with 0. + lprobs (`Tensor` with size `(bsz, input_beam_size, vocab_size)`): + the model's log-probabilities over the vocabulary at the current step. + scores (`Tensor` with size `(bsz, input_beam_size, step - 1)`): + Previous sampling scores for each beam. + prev_output_tokens (`Tensor`, **optional**. default to `None`): + Previous output tokens, no usage in this function, will be deprecated in next version. + original_batch_idxs (`Tensor`, **optional**, default to `None`): + the tensor with the batch indices, in the range [0, bsz) + this is useful in case there has been applied a re-ordering + and we need to know the original indices + + Returns: A tuple of (scores_buf, indices_buf, beams_buf), where: + scores_buf (`Tensor` with size `(bsz, input_beam_size)`): + The model's log-probabilities over the elements selected to sample from. + which cumulates the score before. + indices_buf (`Tensor` with size `(bsz, input_beam_size)`): + The indices of chosen elements. + beams_buf (`Tensor` with size `(bsz, input_beam_size)`): + The indices of each beam. + """ bsz, beam_size, vocab_size = lprobs.size() lprobs += self.apply_mask( @@ -553,6 +620,15 @@ class LexicallyConstrainedBeamSearch(Search): class LengthConstrainedBeamSearch(Search): + r""" + Length constrained beam search for generation. + + step 1. Build length constraints in model's log-probability. If `min_lens` > `step`, + set eos token's score to `-math.inf`, so the generation will not be easily stopped. + Otherwise, `max_lens` <= `step`, set eos token's score to `0`, so the generation will + be easily stopped. + step 2. Using beam search to generate the hypothesis tokens with scores. + """ def __init__(self, tgt_dict, min_len_a, min_len_b, max_len_a, max_len_b): super().__init__(tgt_dict) @@ -571,8 +647,37 @@ class LengthConstrainedBeamSearch(Search): prev_output_tokens: Optional[Tensor] = None, original_batch_idxs: Optional[Tensor] = None, ): + r""" + Take a single search step. + + Args: + step (`int`): Current step, start with 0. + lprobs (`Tensor` with size `(bsz, input_beam_size, vocab_size)`): + the model's log-probabilities over the vocabulary at the current step. + scores (`Tensor` with size `(bsz, input_beam_size, step - 1)`): + Previous sampling scores for each beam. + prev_output_tokens (`Tensor`, **optional**. default to `None`): + Previous output tokens, no usage in this function, will be deprecated in next version. 
+ original_batch_idxs (`Tensor`, **optional**, default to `None`): + the tensor with the batch indices, in the range [0, bsz) + this is useful in case there has been applied a re-ordering + and we need to know the original indices + + Returns: A tuple of (scores_buf, indices_buf, beams_buf), where: + scores_buf (`Tensor` with size `(bsz, output_beam_size)`): + The model's log-probabilities over the elements selected to sample from. + `output_beam_size` is the minor of `2 * input_beam_size` and `vocab_size - 1`. + which cumulates the score before. + indices_buf (`Tensor` with size `(bsz, output_beam_size)`): + The indices of chosen elements. + beams_buf (`Tensor` with size `(bsz, output_beam_size)`): + The indices of each beam. + """ min_lens = self.min_len_a * self.src_lengths + self.min_len_b max_lens = self.max_len_a * self.src_lengths + self.max_len_b + # There seems to be a bug here. Should be right like: + # lprobs[[step < min_lens] * len(lprobs), :, self.eos] = -math.inf + # lprobs[[step >= max_lens] * len(lprobs), :, self.eos] = 0 lprobs[step < min_lens, :, self.eos] = -math.inf lprobs[step >= max_lens, :, self.eos] = 0 return self.beam.step(step, lprobs, scores) @@ -603,6 +708,31 @@ class DiverseBeamSearch(Search): prev_output_tokens: Optional[Tensor] = None, original_batch_idxs: Optional[Tensor] = None, ): + r""" + Take a single search step. + + Args: + step (`int`): Current step, start with 0. + lprobs (`Tensor` with size `(bsz, input_beam_size, vocab_size)`): + the model's log-probabilities over the vocabulary at the current step. + scores (`Tensor` with size `(bsz, input_beam_size, step - 1)`): + Previous sampling scores for each beam. + prev_output_tokens (`Tensor`, **optional**. default to `None`): + Previous output tokens, no usage in this function, will be deprecated in next version. + original_batch_idxs (`Tensor`, **optional**, default to `None`): + the tensor with the batch indices, in the range [0, bsz) + this is useful in case there has been applied a re-ordering + and we need to know the original indices + + Returns: A tuple of (scores_buf, indices_buf, beams_buf), where: + scores_buf (`Tensor` with size `(bsz, input_beam_size)`): + The model's log-probabilities over the elements selected to sample from, + which cumulates the score before. + indices_buf (`Tensor` with size `(bsz, input_beam_size)`): + The indices of chosen elements. + beams_buf (`Tensor` with size `(bsz, input_beam_size)`): + The indices of each beam. + """ bsz, beam_size, vocab_size = lprobs.size() if beam_size % self.num_groups != 0: raise ValueError( @@ -648,6 +778,24 @@ class DiverseBeamSearch(Search): class Sampling(Search): + r""" + Sampling search for generation. + + 1. Calculate the sample set. + 1.1 If `sampling_topk` is not None, chose the candidates which cumulative sum of model's + log-probability under descending order is less than `sampling_topk`. + 1.2 If `sampling_topp` is not None, chose the top k candidates by model's log-probability under + the descending order. + 1.3 Chose the whole input set as sampling set. + 2. Using multinomial sample strategy to sample candidates from sample set as hypothesis. + 3. Modify hypothesis score, relative indices, beam indices for the final result. + + Attributes: + sampling_topk (`int`, **optional**, default to `-1`): + The value of k in the sampling strategy of top k. + sampling_topp (`float`, **optional**, default to '-1.0'): + The value of p The sampling strategy of top p. 
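# In the conventional formulation the cumulative-probability cutoff belongs to top-p (nucleus)
# sampling and the fixed candidate count to top-k; a minimal top-p truncation sketch with a toy
# distribution (the sampling_topp value is illustrative):
import torch

sampling_topp = 0.9
probs = torch.tensor([[0.5, 0.2, 0.15, 0.1, 0.05]])
sorted_probs, sorted_indices = probs.sort(dim=-1, descending=True)
exclusive_cumsum = sorted_probs.cumsum(dim=-1) - sorted_probs
keep = exclusive_cumsum < sampling_topp                       # keep tokens until the retained mass reaches p
truncated = torch.where(keep, sorted_probs, torch.zeros_like(sorted_probs))
truncated = truncated / truncated.sum(dim=-1, keepdim=True)   # renormalise before torch.multinomial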
+ """ sampling_topk: int sampling_topp: float @@ -710,6 +858,31 @@ class Sampling(Search): prev_output_tokens: Optional[Tensor] = None, original_batch_idxs: Optional[Tensor] = None, ): + r""" + Take a single search step. + + Args: + step (`int`): Current step, start with 0. + lprobs (`Tensor` with size `(bsz, input_beam_size, vocab_size)`): + the model's log-probabilities over the vocabulary at the current step. + scores (`Tensor` with size `(bsz, input_beam_size, step - 1)`): + Previous sampling scores for each beam. + prev_output_tokens (`Tensor`, **optional**. default to `None`): + Previous output tokens, no usage in this function, will be deprecated in next version. + original_batch_idxs (`Tensor`, **optional**, default to `None`): + the tensor with the batch indices, in the range [0, bsz) + this is useful in case there has been applied a re-ordering + and we need to know the original indices + + Returns: A tuple of (scores_buf, indices_buf, beams_buf), where: + scores_buf (`Tensor` with size `(bsz, input_beam_size)`): + The model's log-probabilities over the elements selected to sample from. + which cumulates the score before. + indices_buf (`Tensor` with size `(bsz, input_beam_size)`): + The indices of chosen elements. + beams_buf (`Tensor` with size `(bsz, input_beam_size)`): + The indices of each beam. + """ bsz, beam_size, vocab_size = lprobs.size() if step == 0: @@ -800,6 +973,32 @@ class DiverseSiblingsSearch(Search): prev_output_tokens: Optional[Tensor] = None, original_batch_idxs: Optional[Tensor] = None, ): + r""" + Take a single search step. + + Args: + step (`int`): Current step, start with 0. + lprobs (`Tensor` with size `(bsz, input_beam_size, vocab_size)`): + the model's log-probabilities over the vocabulary at the current step. + scores (`Tensor` with size `(bsz, input_beam_size, step - 1)`): + Previous sampling scores for each beam. + prev_output_tokens (`Tensor`, **optional**. default to `None`): + Previous output tokens, no usage in this function, will be deprecated in next version. + original_batch_idxs (`Tensor`, **optional**, default to `None`): + the tensor with the batch indices, in the range [0, bsz) + this is useful in case there has been applied a re-ordering + and we need to know the original indices + + Returns: A tuple of (scores_buf, indices_buf, beams_buf), where: + final_scores (`Tensor` with size `(bsz, output_beam_size)`): + The model's log-probabilities over the elements selected to sample from, + which cumulates the score before. `output_beam_size` is the minor of + `2 * input_beam_size` and `vocab_size - 1`. + final_indices (`Tensor` with size `(bsz, output_beam_size)`): + The indices of chosen elements. + final_beams (`Tensor` with size `(bsz, ourput_beam_size)`): + The indices of each beam. + """ bsz, beam_size, vocab_size = lprobs.size() k = min( # Take the best 2 x beam_size predictions. We'll choose the first diff --git a/modelscope/models/multi_modal/ofa/generate/utils.py b/modelscope/models/multi_modal/ofa/generate/utils.py index 8c8abf99..5a5a3eb5 100644 --- a/modelscope/models/multi_modal/ofa/generate/utils.py +++ b/modelscope/models/multi_modal/ofa/generate/utils.py @@ -26,6 +26,9 @@ MANIFOLD_PATH_SEP = '|' def apply_to_sample(f, sample): + r""" + Apply some function to the sample. The f function will effect on the `Tensor` object, otherwise do nothing. 
+ """ if hasattr(sample, '__len__') and len(sample) == 0: return {} @@ -53,7 +56,9 @@ def apply_to_sample(f, sample): def move_to_device(batch, device): - r"""Puts each data field to the device""" + r""" + Puts each data field to the device + """ if isinstance(batch, torch.Tensor): return batch.to(device) elif isinstance(batch, (list, tuple)): @@ -68,10 +73,22 @@ def move_to_device(batch, device): def strip_pad(tensor, pad): + r""" + Get the non pad value from input tensor + """ return tensor[tensor.ne(pad)] def get_token_to_word_mapping(tokens, exclude_list): + r""" + Get the token to word mapping. The token indicates the original token index, while word indicates the token index + excluding the `exclude_list`. + + >>> import torch + >>> all_tokens = torch.arange(4) + >>> exclude_tokens = [1] + >>> get_token_to_word_mapping(all_tokens, exclude_tokens) # {0: 1, 1: 1, 2: 2, 3: 3} + """ n = len(tokens) word_start = [int(token not in exclude_list) for token in tokens] word_idx = list(accumulate(word_start)) @@ -80,6 +97,10 @@ def get_token_to_word_mapping(tokens, exclude_list): def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos): + r""" + @deprecated + There is no usage in this project, should be removed. + """ tgt_valid = (((tgt_sent != pad) & # noqa (tgt_sent != eos)).nonzero(as_tuple=False).squeeze(dim=-1)) src_invalid = (((src_sent == pad) | # noqa @@ -100,6 +121,9 @@ def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos): def softmax(x, dim: int, onnx_trace: bool = False): + r""" + softmax function. Using `torch.nn.functional.softmax` + """ if onnx_trace: return F.softmax(x.float(), dim=dim) else: @@ -107,6 +131,9 @@ def softmax(x, dim: int, onnx_trace: bool = False): def log_softmax(x, dim: int, onnx_trace: bool = False): + r""" + log softmax function. Using `torch.nn.functional.log_softmax` + """ if onnx_trace: return F.log_softmax(x.float(), dim=dim) else: @@ -114,6 +141,10 @@ def log_softmax(x, dim: int, onnx_trace: bool = False): def extract_soft_alignment(attn, src_sent, tgt_sent, pad, eos): + r""" + @deprecated + There is no usage in this project, should be removed. + """ tgt_valid = (tgt_sent != pad).nonzero(as_tuple=False) src_valid = (src_sent != pad).nonzero(as_tuple=False).squeeze(dim=-1) alignment = [] diff --git a/modelscope/models/multi_modal/ofa/modeling_mmspeech.py b/modelscope/models/multi_modal/ofa/modeling_mmspeech.py index 7c76f0bc..c71e23d0 100644 --- a/modelscope/models/multi_modal/ofa/modeling_mmspeech.py +++ b/modelscope/models/multi_modal/ofa/modeling_mmspeech.py @@ -11,12 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
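# Doctest-style illustration of the strip_pad and move_to_device helpers documented above
# (pad id 1 matches the configs' default pad_token_id; expected outputs shown inline):
# >>> strip_pad(torch.tensor([5, 7, 1, 1]), pad=1)
# tensor([5, 7])
# >>> move_to_device({'ids': torch.tensor([5, 7])}, 'cpu')['ids']
# tensor([5, 7])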
-""" PyTorch OFA model.""" +""" PyTorch OFA-MMSpeech model.""" import math -import random from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import Optional, Tuple import numpy as np import torch @@ -27,22 +26,17 @@ from fairseq.modules import LayerNorm, SamePad, TransposeLast from fairseq.modules.transformer_sentence_encoder import init_bert_params from fairseq.utils import index_put from packaging import version -from torch import Tensor, nn +from torch import nn from torch.nn import functional as F -from transformers.activations import ACT2FN from transformers.file_utils import (ModelOutput, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward) -from transformers.modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput, - Seq2SeqModelOutput) -from transformers.modeling_utils import PreTrainedModel from transformers.utils import logging from .configuration_mmspeech import MMSpeechConfig from .generate import utils from .modeling_ofa import (Embedding, OFADecoder, OFAModel, OFAPreTrainedModel, - _expand_mask, shift_tokens_right) + _expand_mask) logger = logging.get_logger() diff --git a/modelscope/models/multi_modal/ofa/modeling_ofa.py b/modelscope/models/multi_modal/ofa/modeling_ofa.py index 25e866bc..14aba8ca 100644 --- a/modelscope/models/multi_modal/ofa/modeling_ofa.py +++ b/modelscope/models/multi_modal/ofa/modeling_ofa.py @@ -262,6 +262,9 @@ class OFAAttention(nn.Module): is_decoder (`bool`): whether or not decoder attention. bias (`bool`): whether to add bias. scale_heads (`bool`): whether to learn scaling heads, only for Normformer. + scale_factor (`float32`, *optional*, defaults to `2.0`): + The position embedding scaling factor. If it works, + self.scaling = float(self.head_dim * scale_factor)**-0.5 """ def __init__( @@ -272,6 +275,7 @@ class OFAAttention(nn.Module): is_decoder: bool = False, bias: bool = True, scale_heads: bool = True, + scale_factor: float = 2.0, ): super().__init__() self.embed_dim = embed_dim @@ -283,7 +287,6 @@ class OFAAttention(nn.Module): ), f'embed_dim must be divisible by num_heads ' \ f'(got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads}).' # 1. difference - scale_factor = 2 self.scaling = float(self.head_dim * scale_factor)**-0.5 self.is_decoder = is_decoder @@ -441,6 +444,7 @@ class OFAEncoderLayer(nn.Module): embed_dim=self.embed_dim, num_heads=config.encoder_attention_heads, dropout=config.attention_dropout, + scale_factor=config.attn_scale_factor, ) self.self_attn_layer_norm = LayerNorm(self.embed_dim) self.self_attn_mid_layer_norm = LayerNorm( @@ -457,6 +461,18 @@ class OFAEncoderLayer(nn.Module): self.drop_path = DropPath( drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.use_gamma_feature = config.use_gamma_feature + if self.use_gamma_feature: + gamma = getattr(config, 'gamma', 1.) + + # `OFA.from_pretrain()` method will replace the `gamma` to `weight` + # in the model key. Here, change the parameters like `xxx_gamma_xxx` + # to `xxx_weight_xxx` to adapt this transformation. + self.weight_self_attn = nn.Parameter( + torch.ones(self.embed_dim) * gamma, requires_grad=True) + self.weight_ffn = nn.Parameter( + torch.ones(self.embed_dim) * gamma, requires_grad=True) + def residual_connection(self, x, residual): r""" Residual connection with drop path. 
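# Quick numeric check of the scale_factor wiring above (the previously hard-coded value 2 is
# now read from config.attn_scale_factor); assuming embed_dim=768 and 12 attention heads:
head_dim = 768 // 12                      # 64
scaling = float(head_dim * 2.0) ** -0.5   # 1 / sqrt(128) ~= 0.0884, the attention scaling term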
@@ -495,6 +511,8 @@ class OFAEncoderLayer(nn.Module): if self.self_attn_mid_layer_norm: hidden_states = self.self_attn_mid_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) + if self.use_gamma_feature: + hidden_states = self.weight_self_attn * hidden_states hidden_states = self.residual_connection(hidden_states, residual) if not self.normalize_before: hidden_states = self.self_attn_layer_norm(hidden_states) @@ -509,6 +527,8 @@ class OFAEncoderLayer(nn.Module): hidden_states = self.ffn_layer_norm(hidden_states) hidden_states = self.fc2(hidden_states) hidden_states = self.dropout(hidden_states) + if self.use_gamma_feature: + hidden_states = self.weight_ffn * hidden_states hidden_states = self.residual_connection(hidden_states, residual) if not self.normalize_before: hidden_states = self.final_layer_norm(hidden_states) @@ -546,6 +566,7 @@ class OFADecoderLayer(nn.Module): num_heads=config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, + scale_factor=config.attn_scale_factor, ) self.dropout = nn.Dropout(p=config.dropout) self.activation_fn = ACT2FN[config.activation_function] @@ -559,6 +580,7 @@ class OFADecoderLayer(nn.Module): config.decoder_attention_heads, dropout=config.attention_dropout, is_decoder=True, + scale_factor=config.attn_scale_factor, ) self.cross_attn_layer_norm = LayerNorm(self.embed_dim) self.cross_attn_mid_layer_norm = LayerNorm( @@ -572,6 +594,20 @@ class OFADecoderLayer(nn.Module): self.drop_path = DropPath( drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.use_gamma_feature = config.use_gamma_feature + if self.use_gamma_feature: + gamma = getattr(config, 'gamma', 1.) + + # `OFA.from_pretrain()` method will replace the `gamma` to `weight` + # in the model key. Here, change the parameters like `xxx_gamma_xxx` + # to `xxx_weight_xxx` to adapt this transformation. + self.weight_self_attn = nn.Parameter( + torch.ones(self.embed_dim) * gamma, requires_grad=True) + self.weight_cross_attn = nn.Parameter( + torch.ones(self.embed_dim) * gamma, requires_grad=True) + self.weight_ffn = nn.Parameter( + torch.ones(self.embed_dim) * gamma, requires_grad=True) + def residual_connection(self, x, residual): r""" Residual connection with drop path. 
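# A minimal sketch of the use_gamma_feature scaling added in this hunk: each sub-layer output
# is rescaled per channel by a learnable vector (initialised to `gamma`, in the spirit of
# LayerScale) before the residual addition. Names below are illustrative:
import torch
from torch import nn

embed_dim, gamma = 768, 1.0
weight_self_attn = nn.Parameter(torch.ones(embed_dim) * gamma, requires_grad=True)

def add_scaled_residual(residual: torch.Tensor, sub_layer_out: torch.Tensor) -> torch.Tensor:
    # per-channel rescaling of the branch output, then the usual skip connection
    return residual + weight_self_attn * sub_layer_out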
@@ -623,6 +659,8 @@ class OFADecoderLayer(nn.Module): if self.self_attn_mid_layer_norm: hidden_states = self.self_attn_mid_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) + if self.use_gamma_feature: + hidden_states = self.weight_self_attn * hidden_states hidden_states = self.residual_connection(hidden_states, residual) if not self.normalize_before: hidden_states = self.self_attn_layer_norm(hidden_states) @@ -648,6 +686,8 @@ class OFADecoderLayer(nn.Module): if self.cross_attn_mid_layer_norm: hidden_states = self.cross_attn_mid_layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) + if self.use_gamma_feature: + hidden_states = self.weight_cross_attn * hidden_states hidden_states = self.residual_connection(hidden_states, residual) if not self.normalize_before: hidden_states = self.cross_attn_layer_norm(hidden_states) @@ -665,6 +705,8 @@ class OFADecoderLayer(nn.Module): hidden_states = self.ffn_layer_norm(hidden_states) hidden_states = self.fc2(hidden_states) hidden_states = self.dropout(hidden_states) + if self.use_gamma_feature: + hidden_states = self.weight_ffn * hidden_states hidden_states = self.residual_connection(hidden_states, residual) if not self.normalize_before: hidden_states = self.final_layer_norm(hidden_states) @@ -1955,6 +1997,14 @@ class OFAModel(OFAPreTrainedModel): self.decoder = OFADecoder(config, shared) self.use_ofasys = config.use_ofasys + # exclude mlp head as default + if not getattr(config, 'exclude_mlp', True): + self.mlp_head = Linear(config.d_model, config.mlp_dim) + # None temperature_init_value as default + if config.temperature_init_value: + self.temp = nn.Parameter(config.temperature_init_value + * torch.ones([])) + # Initialize weights and apply final processing self.post_init() diff --git a/modelscope/models/multi_modal/ofa/resnet.py b/modelscope/models/multi_modal/ofa/resnet.py index aad0f002..c863aad0 100644 --- a/modelscope/models/multi_modal/ofa/resnet.py +++ b/modelscope/models/multi_modal/ofa/resnet.py @@ -177,6 +177,15 @@ class Bottleneck(nn.Module): class ResNet(nn.Module): + r""" + Deep residual network, copy from https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py. + + You can see more details from https://arxiv.org/abs/1512.03385 + + step 1. Get image embedding with `7` as the patch image size, `2` as stride. + step 2. Do layer normalization, relu activation and max pooling. + step 3. Go through three times residual branch. + """ def __init__(self, layers, @@ -186,6 +195,25 @@ class ResNet(nn.Module): replace_stride_with_dilation=None, norm_layer=None, drop_path_rate=0.0): + r""" + Args: + layers (`Tuple[int]`): There are three layers in resnet, so the length + of layers should greater then three. And each element in `layers` is + the number of `Bottleneck` in relative residual branch. + zero_init_residual (`bool`, **optional**, default to `False`): + Whether or not to zero-initialize the last BN in each residual branch. + groups (`int`, **optional**, default to `1`): + The number of groups. So far, only the value of `1` is supported. + width_per_group (`int`, **optional**, default to `64`): + The width in each group. So far, only the value of `64` is supported. + replace_stride_with_dilation (`Tuple[bool]`, **optional**, default to `None`): + Whether or not to replace stride with dilation in each residual branch. + norm_layer (`torch.nn.Module`, **optional**, default to `None`): + The normalization module. If `None`, will use `torch.nn.BatchNorm2d`. 
+ drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + """ super(ResNet, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d @@ -251,6 +279,29 @@ class ResNet(nn.Module): stride=1, dilate=False, drop_path_rate=0.0): + r""" + Making a single residual branch. + + step 1. If dilate==`True`, switch the value of dilate and stride. + step 2. If the input dimension doesn't equal to th output output dimension + in `block`, initialize a down sample module. + step 3. Build a sequential of `blocks` number of `block`. + + Args: + block (`torch.nn.Module`): The basic block in residual branch. + planes (`int`): The output dimension of each basic block. + blocks (`int`): The number of `block` in residual branch. + stride (`int`, **optional**, default to `1`): + The stride using in conv. + dilate (`bool`, **optional**, default to `False`): + Whether or not to replace dilate with stride. + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + + Returns: + A sequential of basic layer with type `torch.nn.Sequential[block]` + """ norm_layer = self._norm_layer downsample = None previous_dilation = self.dilation diff --git a/modelscope/models/multi_modal/ofa/utils/constant.py b/modelscope/models/multi_modal/ofa/utils/constant.py index 48e90336..f455e41a 100644 --- a/modelscope/models/multi_modal/ofa/utils/constant.py +++ b/modelscope/models/multi_modal/ofa/utils/constant.py @@ -11,5 +11,7 @@ OFA_TASK_KEY_MAPPING = { Tasks.text_classification: OutputKeys.LABELS, Tasks.image_classification: OutputKeys.LABELS, Tasks.visual_entailment: OutputKeys.LABELS, - Tasks.auto_speech_recognition: OutputKeys.TEXT + Tasks.auto_speech_recognition: OutputKeys.TEXT, + Tasks.sudoku: OutputKeys.TEXT, + Tasks.text2sql: OutputKeys.TEXT, } diff --git a/modelscope/models/multi_modal/ofa/utils/utils.py b/modelscope/models/multi_modal/ofa/utils/utils.py index c5aa8483..b874aaa1 100644 --- a/modelscope/models/multi_modal/ofa/utils/utils.py +++ b/modelscope/models/multi_modal/ofa/utils/utils.py @@ -43,6 +43,7 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False): class DropPath(nn.Module): r""" Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + See more details about drop path from https://arxiv.org/pdf/1605.07648v4.pdf. Args: drop_prob: drop path ratio. diff --git a/modelscope/models/multi_modal/ofa/vit.py b/modelscope/models/multi_modal/ofa/vit.py index b6bba7ee..36d1707e 100644 --- a/modelscope/models/multi_modal/ofa/vit.py +++ b/modelscope/models/multi_modal/ofa/vit.py @@ -1,3 +1,7 @@ +# Copyright (c) 2021 OpenAI +# +# This source code is licensed under the MIT license which can be found at +# https://github.com/openai/CLIP/blob/main/LICENSE from collections import OrderedDict import torch @@ -16,18 +20,39 @@ __all__ = [ class QuickGELU(nn.Module): + r""" + An activation function module. + """ def forward(self, x: torch.Tensor): return x * torch.sigmoid(1.702 * x) class ResidualAttentionBlock(nn.Module): + r""" + A residual attention block module. + + step 1. Calculate the self attention in input with layer normalization. + step 2. Add input to the result of self attention's result as I. + step 3. Calculate the mlp of input I with layer normalization. + step 4. Add I to the result of mlp. 
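# The four steps above correspond to the usual CLIP-style pre-norm block; its forward
# presumably reads roughly as follows (ln_1, ln_2 and mlp are attribute names assumed from the
# upstream CLIP implementation and are not shown in this hunk):
#
#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         x = x + self.drop_path(self.attention(self.ln_1(x)))  # steps 1-2
#         x = x + self.drop_path(self.mlp(self.ln_2(x)))        # steps 3-4
#         return x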
+ """ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop_path_rate=0.0): + r""" + Args: + d_model (`int`): The embedding dimensions. + n_head (`int`): The number of heads in self attention block. + attn_mask (`Tensor`, **optional**, default to None): + Attention mask using in self attention. + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + """ super().__init__() self.attn = nn.MultiheadAttention(d_model, n_head) @@ -43,6 +68,9 @@ class ResidualAttentionBlock(nn.Module): self.drop_path = DropPath(drop_path_rate) def attention(self, x: torch.Tensor): + r""" + A wrapper of self attention . + """ self.attn_mask = ( self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None) @@ -56,6 +84,11 @@ class ResidualAttentionBlock(nn.Module): class Transformer(nn.Module): + r""" + A transformer module using in `VisionTransformer`. + + Execute a sequential of `ResidualAttentionBlock`. + """ def __init__( self, @@ -65,6 +98,17 @@ class Transformer(nn.Module): attn_mask: torch.Tensor = None, drop_path_rate: float = 0.0, ): + r""" + Args: + width (`int`): The width of input image. + layers (`int`): The number of `ResidualAttentionBlock` layers. + heads (int): The number of self attention heads. + attn_mask (`Tensor`, **optional**, default to None): + Attention mask using in self attention. + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + """ super().__init__() self.width = width self.layers = layers @@ -78,6 +122,15 @@ class Transformer(nn.Module): class VisionTransformer(nn.Module): + r""" + Vision transformer module. + + step 1. Using conv2d to get the image embedding. + step 2. If the resolution of input image doesn't equal to the initialized one + do `bilinear` interpolate to get new patch position embedding. + step 3. Add position embedding to image embedding to generate final image representation. + step 4. Do `Transformer` to the image representation. + """ def __init__( self, @@ -88,6 +141,17 @@ class VisionTransformer(nn.Module): heads: int, drop_path_rate: float = 0.0, ): + r""" + Args: + input_resolution (`int`): The resolution of input image. + patch_size (`int`): The resolution of each patch image. + width (`int`): The dimension of each patch image. + layers (`int`): The number of `ResidualAttentionBlock` in `Transformer`. + heads (`int`): The number of heads in self attention block. + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + """ super().__init__() self.input_resolution = input_resolution self.patch_size = patch_size @@ -140,16 +204,48 @@ class VisionTransformer(nn.Module): def vit_base(drop_path_rate: float = 0.0): + r""" + An instance of base vision transformer model. + + Args: + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + """ return VisionTransformer(224, 16, 768, 9, 12, drop_path_rate) def vit_large(drop_path_rate: float = 0.0): + r""" + An instance of large vision transformer model. + + Args: + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. 
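# For reference, the constructor arguments used by the vit_* helpers imply these patch-token
# counts (plain integer arithmetic, not taken from the source):
patches_vit_base      = (224 // 16) ** 2   # 196 tokens
patches_vit_large     = (224 // 14) ** 2   # 256 tokens (vit_huge uses the same grid)
patches_vit_large_336 = (336 // 14) ** 2   # 576 tokens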
+ """ return VisionTransformer(224, 14, 1024, 18, 16, drop_path_rate) def vit_large_336(drop_path_rate: float = 0.0): + r""" + An instance of large vision transformer model with 336 as input image width . + + Args: + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + """ return VisionTransformer(336, 14, 1024, 18, 16, drop_path_rate) def vit_huge(drop_path_rate: float = 0.0): + r""" + An instance of huge vision transformer model. + + Args: + drop_path_rate (`float`, **optional**, default to 0.0): + Drop path rate. See more details about drop path from + https://arxiv.org/pdf/1605.07648v4.pdf. + """ return VisionTransformer(224, 14, 1280, 24, 16, drop_path_rate) diff --git a/modelscope/models/multi_modal/ofa_for_all_tasks.py b/modelscope/models/multi_modal/ofa_for_all_tasks.py index 3a35be58..3135b2b2 100644 --- a/modelscope/models/multi_modal/ofa_for_all_tasks.py +++ b/modelscope/models/multi_modal/ofa_for_all_tasks.py @@ -38,16 +38,65 @@ __all__ = ['OfaForAllTasks'] @MODELS.register_module(Tasks.text_summarization, module_name=Models.ofa) @MODELS.register_module(Tasks.text_classification, module_name=Models.ofa) @MODELS.register_module(Tasks.auto_speech_recognition, module_name=Models.ofa) +@MODELS.register_module(Tasks.sudoku, module_name=Models.ofa) +@MODELS.register_module(Tasks.text2sql, module_name=Models.ofa) class OfaForAllTasks(TorchModel): + r""" + All ofa tasks using uniform ofa model structure. So far, we support three types of tasks: + 1. text generation tasks: ocr_recognition, image_captioning and text_summarization + 2. visual grounding tasks: visual grounding + 3. classification tasks: text classification and image classification. + + Attributes: + cfg: Task configs exclude model configs, such as generator's config. + model: OFA uniform model using in this task. + language: The language using in the model. So far, we support three types of language, `en` for English, + `zh` and `cn` for Chinese, default to `en`. + tokenizer: OFA tokenizer for tokenizing the input for OFA model. + batch_size: Batch size. + patch_image_size: The image size of input image, default to 480. + val_batch_size: The validation batch size. + transtab: A translation table of punctuation. + gen_type: Generation type, so far, we support two types of gen_type, `generation` for generation tasks, + `traverse` for classification tasks, default to `generation`. + bos_item: The id of beginning of a sequence. + pad_item: The id of padding of a sequence. + eos_item: The id of ending of a sequence. + index2ans: A mapping from index to label using in classification tasks. + ans2label_dict: A mapping from label to index using in classification tasks. + constraint_trie: A trie tree building from label using in classification tasks. + val_ans_l: A validation set of label using in classification tasks. + val_masks_l: A validation set of mask using in classification tasks. + generator: A sequence generator with OFA model to generate image code. + task_inference_mapping: A mapping from task name to execution function in task inference. + pattern: Regex pattern which find the blanks after/before the words except ` a-zA-Z0-9.,:!?` + """ def __init__(self, model_dir, *args, **kwargs): + r""" + Args: + model_dir (`str` or `os.PathLike`) + Can be either: + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co + or modelscope.cn. 
Valid model ids can be located at the root-level, like `bert-base-uncased`, + or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g, + `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to + `True`. + """ if os.path.exists(model_dir): model_dir = os.path.abspath(model_dir) super().__init__(model_dir=model_dir, *args, **kwargs) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) multimodal_type = self.cfg.model.get('multimodal_type', 'default') - if multimodal_type == 'default': + if multimodal_type in ['default', 'text2sql']: model = OFAModel.from_pretrained(model_dir) elif multimodal_type == 'mmspeech': model = MMSpeechModel.from_pretrained(model_dir) @@ -76,6 +125,13 @@ class OfaForAllTasks(TorchModel): self.tokenizer.add_tokens( [''.format(i) for i in range(30000)]) self.cfg.update({'num_bins': 0, 'num_codes': 30000}) + elif multimodal_type == 'text2sql': + self.tokenizer.add_tokens( + [''.format(i) for i in range(8192)]) + self.tokenizer.add_tokens( + [''.format(i) for i in range(1000)]) + self.cfg.update({'num_bins': 1000, 'num_codes': 8192}) + self.tokenizer.add_tokens(['>=', '<=']) self.batch_size = self.cfg.model.get('batch_size', 1) self.patch_image_size = self.cfg.model.get('patch_image_size', 480) @@ -105,6 +161,8 @@ class OfaForAllTasks(TorchModel): } if hasattr(self.cfg.model, 'beam_search'): sg_args.update(self.cfg.model.beam_search) + self.num_return_sequences = self.cfg.model.get('num_return_sequences', + 1) if len(self.ans2label_dict) > 0: self.constraint_trie = Trie(self.tokenizer.eos_token_id) self.val_ans_l = [] @@ -128,11 +186,25 @@ class OfaForAllTasks(TorchModel): Tasks.text_classification: inference_d[self.gen_type], Tasks.image_classification: inference_d[self.gen_type], Tasks.auto_speech_recognition: self._text_gen_inference, + Tasks.sudoku: self._text_gen_inference, + Tasks.text2sql: self._text_gen_inference, } pattern_str = '((?<=[^ a-zA-Z0-9.,:!?]) +| +(?=[^ a-zA-Z0-9.,:!?]))' self.pattern = re.compile(pattern_str) def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + r""" + The entry function of task execution. So far, we support two types of execution pipeline: + 1. training, return the model's forward results. + 2. inference, return the result of `self.inference(input)` + + Args: + input (`Dict[Str, Any]`): + The input of the tasks, the actual value depending on the specific tasks. 
+ Returns: + `Dict[Str, Any]` + + """ input = move_to_device(input, self.model.device) if self.model.training: return self.model(**input['net_input']) @@ -140,24 +212,51 @@ class OfaForAllTasks(TorchModel): return self.inference(input) def inference(self, input: Dict[str, Any]) -> Dict[str, Any]: + r""" + Task inference function. + + Args: + input (`Dict[Str, Any]`): + The input of the tasks, the actual value depending on the specific tasks. + Returns: + `Dict[Str, Any]` + + """ + assert self.generator.beam_size >= self.num_return_sequences, \ + 'beam search can only return beam size sentences' + if self.ans2label_dict and self.gen_type == 'generation': + assert self.generator.beam_size <= len(self.ans2label_dict), \ + 'beam search will not work properly.' ret = self.task_inference_mapping[self.cfg.task](input) if 'samples' in input: ret['samples'] = input['samples'] - for key in [ - OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, - OutputKeys.LABELS, OutputKeys.SCORES - ]: - if key not in ret: - ret[key] = None return ret def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + r""" + Do post processing after the task's forward function is executed. So far, we have three post processing + strategies. + 1. If the task is image captioning and the language is English, punctuation characters such as + `!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~` will be removed. + 2. If the task is a generation task other than visual grounding and uses the Chinese language, we will remove + the blanks after/before characters outside ` a-zA-Z0-9.,:!?` + 3. Other cases will return the input as the result. + + Args: + input (`Dict[Str, Any]`): + The result of the task's forward function. The key is one of the keys of OFA_TASK_KEY_MAPPING for + distinguishing different OFA tasks, while the value is the corresponding task result. + + Returns: + `Dict[Str, Any]` + """ if not self.model.training and self.cfg.task == Tasks.image_captioning: caption = input[OutputKeys.CAPTION] result_l = list() for cap in caption: if self.language == 'en': - result_l.append(cap.translate(self.transtab).strip()) + result_l.append( + [c.translate(self.transtab).strip() for c in cap]) else: result_l.append(cap) input[OutputKeys.CAPTION] = result_l @@ -166,35 +265,71 @@ class OfaForAllTasks(TorchModel): ] and self.cfg.task != Tasks.visual_grounding: ret_l = list() for text in input[OFA_TASK_KEY_MAPPING[self.cfg.task]]: - ret_l.append(self.detokenizer(text)) + ret_l.append([self.detokenizer(t) for t in text]) input[OFA_TASK_KEY_MAPPING[self.cfg.task]] = ret_l + for key in [ + OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES, + OutputKeys.LABELS, OutputKeys.SCORES + ]: + if key not in input: + input[key] = None + else: + if (isinstance(input[key], list) and len(input[key]) == 1) \ + and self.cfg.task != Tasks.visual_grounding: + input[key] = input[key][0] return input def _text_gen_inference(self, input): + r""" + The inference function for text generation tasks. + 1. Use the OFA sequence generator, which matches the API of other fairseq generators, to generate the token indices. + 2. Decode the token indices into actual language tokens, skipping the special tokens. + 3. For the classification scenario, add a default score of 1.0 for each result. + + Args: + input (`Dict[Str, Any]`): + The input of the tasks, the actual value depending on the specific tasks.
+ Returns: + `Dict[Str, Any]` + """ gen_outputs = self.generator.generate([self.model], input, prefix_tokens=input.get( 'prefix_tokens', None)) - gen_l = list() + results = list() for idx, gen_out in enumerate(gen_outputs): - if len(gen_out) > 0: - decode_tokens = gen_out[0]['tokens'] + gen_token_l = [] + for beam_gen_out in gen_out[:self.num_return_sequences]: + decode_tokens = beam_gen_out['tokens'] if 'prefix_tokens' in input: prefix_len = input['prefix_tokens'][idx].ne( self.pad_item.to(self.model.device)).sum() decode_tokens = decode_tokens[prefix_len:] - gen_l.append(decode_tokens) - else: - gen_l.append('') - result = self.tokenizer.batch_decode(gen_l, skip_special_tokens=True) - result = [item.strip() for item in result] + gen_token_l.append(decode_tokens) + result = self.tokenizer.batch_decode( + gen_token_l, skip_special_tokens=True) + result = [item.strip() for item in result] + result.extend([''] * (self.num_return_sequences - len(result))) + results.append(result) # text generation tasks have no score - ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: result} - if self.cfg.task.endswith('classification'): - ret[OutputKeys.SCORES] = [1.0] * len(result) + ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: results} + if self.ans2label_dict: + ret[OutputKeys.SCORES] = [[1.0]] * len(results) return ret def _visual_grounding_inference(self, input): + r""" + The inference function for visual grounding tasks. + 1. Using OFA sequence generator which match the api of other fairseq generators to generate the token indices. + 2. Decode the token indices into region boxes. + 3. Add default score with `batch_size` + + Args: + input (`Dict[Str, Any]`): + The input of the tasks, the actual value depending on the specific tasks. + Returns: + `Dict[Str, Any]` + """ gen_output = self.generator.generate([self.model], input) tokens = [gen_output[i][0]['tokens'] for i in range(len(gen_output))] region_coord_l = list() @@ -214,6 +349,15 @@ class OfaForAllTasks(TorchModel): } def _traverse_inference(self, input): + r""" + The inference function fo classification tasks. + + Args: + input (`Dict[Str, Any]`): + The input of the tasks, the actual value depending on the specific tasks. + Returns: + `Dict[Str, Any]` + """ encoder_input = dict() for key in input['net_input'].keys(): encoder_input[key] = input['net_input'][key] @@ -295,6 +439,9 @@ class OfaForAllTasks(TorchModel): return {OutputKeys.LABELS: hyps, OutputKeys.SCORES: scores} def build_trie(self): + r""" + Building a trie tree for classification label and mask. + """ answer_item_list = [] for i, answer in enumerate(self.ans2label_dict.keys()): @@ -327,6 +474,9 @@ class OfaForAllTasks(TorchModel): ] def load_ans2label(self): + r""" + Load answer to label dict from file, using in building trie function. + """ if self.cfg.model.get('answer2label', None): ans2label_file = osp.join(self.model_dir, self.cfg.model.answer2label) @@ -339,6 +489,23 @@ class OfaForAllTasks(TorchModel): save_function: Callable = None, config: Optional[dict] = None, **kwargs): + r""" + Save the task model, its configuration and other related files to a directory, so that it can be re-loaded + + Args: + target_folder (Union[str, os.PathLike]): + Directory to which to save. Will be created if it doesn't exist. + + save_checkpoint_names (Union[str, List[str]]): + The checkpoint names to be saved in the target_folder + + save_function (Callable, optional): + The function to use to save the state dictionary. 
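As a rough illustration of the constraint-trie idea behind `build_trie` above, the following is a simplified stand-in (not the repository's `Trie` class) that stores tokenized labels and reports which next tokens keep a decoded prefix inside the label set:

```python
# A simplified stand-in for the constraint trie used in classification tasks:
# it stores the token-id sequences of all candidate labels and, given a decoded
# prefix, tells which next tokens keep the hypothesis inside the label set.
class ConstraintTrie:

    def __init__(self, eos_id: int):
        self.root = {}
        self.eos_id = eos_id

    def insert(self, token_ids):
        node = self.root
        for tid in list(token_ids) + [self.eos_id]:
            node = node.setdefault(tid, {})

    def allowed_next(self, prefix):
        node = self.root
        for tid in prefix:
            if tid not in node:
                return set()
            node = node[tid]
        return set(node.keys())


trie = ConstraintTrie(eos_id=2)
trie.insert([15, 7])      # e.g. the token ids of one label
trie.insert([15, 9, 4])   # e.g. the token ids of another label
print(sorted(trie.allowed_next([15])))  # [7, 9]
```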
+ + config (Optional[dict], optional): + The config for the configuration.json, might not be identical with model.config + + """ super(OfaForAllTasks, self). \ save_pretrained(target_folder=target_folder, save_checkpoint_names=save_checkpoint_names, @@ -347,4 +514,7 @@ class OfaForAllTasks(TorchModel): **kwargs) def detokenizer(self, text): + r""" + Remove the blank after/before the words except ` a-zA-Z0-9.,:!?` + """ return self.pattern.sub('', text) diff --git a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py index 655d36d2..76ab1170 100644 --- a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py +++ b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py @@ -38,6 +38,15 @@ __all__ = ['OfaForTextToImageSynthesis'] def custom_to_pil(x): + r""" + Change the custom array to PIL image. + + Args: + x (`object`) + Object with array interface. + Returns: + A pillow image object. + """ x = x.detach().cpu() x = torch.clamp(x, -1., 1.) x = (x + 1.) / 2. @@ -50,6 +59,20 @@ def custom_to_pil(x): def load_vqgan(config, ckpt_path=None, is_gumbel=False): + r""" + Load checkpoint for vqgan model. + + Args: + config (`Dict[Str, Any]`): + Model configs for vgqan model initialization. + ckpt_path (`str` or `os.PathLike`, **optional**, default to `None`): + Checkpoint path. IF not None, it will load model parameters from the checkpoint path. + is_gumbel (`bool`, **optional**, default to `False`): + Whether or not to use gumbel vqgan. + + Returns: + A vqgan model with evaluation state. + """ if is_gumbel: model = GumbelVQ(**config['model']['params']) else: @@ -61,6 +84,16 @@ def load_vqgan(config, ckpt_path=None, is_gumbel=False): def build_clip_model(model_path): + r""" + Build clip model, the model structure can be found in `modelscope.models.multi_modal.mmr.models.module_clip.CLIP` + + Args: + model_path (`str` or `os.PathLike`): + Model path in which store the clip model's parameters. + + Returns: + A clip model with evaluation state. + """ state_dict = torch.load(model_path, map_location='cpu').state_dict() vit = 'visual.proj' in state_dict if vit: @@ -114,10 +147,22 @@ def build_clip_model(model_path): def _convert_image_to_rgb(image): + r""" + Convert the mode of the image to `RGB`. + """ return image.convert('RGB') def build_clip_transform(n_px): + r""" + Build image transformation. All images sent to clip model will be transformed in this transformation. + + Args: + n_px(`int` or `sequence`): + Desired output size of resize and crop transformation. + Returns: + A compose of transformations. + """ return Compose([ Resize(n_px, interpolation=BICUBIC), CenterCrop(n_px), @@ -130,8 +175,37 @@ def build_clip_transform(n_px): @MODELS.register_module(Tasks.text_to_image_synthesis, module_name=Models.ofa) class OfaForTextToImageSynthesis(Model): + r""" + OFA task for text to image synthesis. + + Attributes: + model: OFA uniform model using in this task. + cfg: Task configs exclude model configs, such as generator's config. + tokenizer: OFA tokenizer for tokenizing the input for OFA model. + vqgan_model: A vqgan model for image decoding. + clip_tokenizer: CLIP tokenizer for tokenizing the input for CLIP model. + clip_model: A CLIP model for ranking the generating image with original input text to select the best one. + generator: A sequence generator with OFA model to generate image code. 
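For reference, a standalone sketch of the tensor-to-PIL conversion that `custom_to_pil` above is documented to perform; the permute and uint8 scaling steps are assumed from the usual VQGAN post-processing and may differ in detail from the actual function body:

```python
import numpy as np
import torch
from PIL import Image


def to_pil(x: torch.Tensor) -> Image.Image:
    # CHW tensor in [-1, 1] -> PIL image (assumed post-processing steps).
    x = x.detach().cpu()
    x = torch.clamp(x, -1., 1.)
    x = (x + 1.) / 2.                   # [-1, 1] -> [0, 1]
    x = x.permute(1, 2, 0).numpy()      # CHW -> HWC
    x = (255 * x).astype(np.uint8)
    return Image.fromarray(x)


img = to_pil(torch.rand(3, 256, 256) * 2 - 1)
print(img.size)  # (256, 256)
```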
+ """ def __init__(self, model_dir, *args, **kwargs): + r""" + Args: + model_dir (`str` or `os.PathLike`) + Can be either: + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co + or modelscope.cn. Valid model ids can be located at the root-level, like `bert-base-uncased`, + or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g, + `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to + `True`. + """ super().__init__(model_dir=model_dir, *args, **kwargs) # Initialize ofa model = OFAModel.from_pretrained(model_dir) @@ -210,6 +284,18 @@ class OfaForTextToImageSynthesis(Model): return result def forward(self, input: Dict[str, Any]): + r""" + The entry function of text to image synthesis task. + 1. Using OFA model to generate an image code candidate set. + 2. Using vqgan model to decode the generated images to a pillow image set. + 3. Using CLIP model to rank the candidate set, choosing the best generated image. + + Args: + input (`Dict[Str, Any]`): + The input of the task + Returns: + A generated pillow image. + """ text = input['samples'][0]['text'] input = move_to_device(input, self._device) diff --git a/modelscope/models/multi_modal/vldoc/__init__.py b/modelscope/models/multi_modal/vldoc/__init__.py new file mode 100644 index 00000000..8a231402 --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .model import VLDocForDocVLEmbedding diff --git a/modelscope/models/multi_modal/vldoc/conv_fpn_trans.py b/modelscope/models/multi_modal/vldoc/conv_fpn_trans.py new file mode 100644 index 00000000..65e27ab5 --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/conv_fpn_trans.py @@ -0,0 +1,293 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import random +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath, trunc_normal_ + +from modelscope.models.multi_modal.vldoc.convnext import convnext_tiny +from modelscope.utils.logger import get_logger + +try: + import apex + import apex.normalization + LN = apex.normalization.FusedLayerNorm +except ImportError: + LN = torch.nn.LayerNorm + +logging = get_logger() + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None, + expand_ratio=4.0, + init_values: float = None): + """ + The implementation of the transformer block refers to: + https://github.com/openai/CLIP/blob/b46f5ac7587d2e1862f8b7b1573179d80dcdd620/clip/model.py + """ + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LN(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * expand_ratio)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * expand_ratio, + d_model))])) + self.ln_2 = LN(d_model) + self.attn_mask = attn_mask + if init_values is not None: + self.gamma_1 = nn.Parameter( + init_values * torch.ones((d_model)), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones((d_model)), requires_grad=True) + else: + self.gamma_1, self.gamma_2 = 1.0, 1.0 + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.gamma_1 * self.attention(self.ln_1(x)) + x = x + self.gamma_2 * self.mlp(self.ln_2(x)) + return x + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +def drop_grid(grid_map, drop_range=(0.3, 0.8), training=False): + """ + only drop in the training phase. + grid_map: [N, D, T1, ...] 
+ """ + if training: + drop_ratio = random.random() * (drop_range[1] + - drop_range[0]) + drop_range[0] + # [N, T1, ...], True will be dropped + mask = (torch.rand_like(grid_map[:, 0]) < drop_ratio).bool() + grid_map = grid_map.masked_fill(mask.unsqueeze(1), 0.0) + return grid_map + + +class GumbelSample(nn.Module): + + def __init__(self, in_dim, num_keep): + super(GumbelSample, self).__init__() + self.keep_layer = nn.Sequential( + nn.Conv2d( + in_dim, 256, 3, stride=1, padding=3, dilation=3, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + nn.Conv2d( + 256, 256, 3, stride=1, padding=2, dilation=2, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + nn.Conv2d(256, 1, 3, stride=1, padding=1, dilation=1), + nn.Sigmoid(), + ) + self.num_keep = num_keep + self.diffusion = nn.Conv2d(in_dim, in_dim, 3, padding=1) + self.dropout = nn.Dropout(0.1) + + def forward(self, x, tau=1): + """ + x: [N, C, H, W] + """ + N = x.size(0) + keep_score = self.keep_layer(x) + keep_score = torch.clamp(keep_score, min=0.0, max=1.0) + keep_score = torch.cat([keep_score, 1 - keep_score], + dim=1) + 1e-5 # [N, 2, H, W] + gumbel_score = F.gumbel_softmax( + keep_score.log(), tau=tau, hard=False, dim=1) + # differentiable hard mode + index = gumbel_score.max(dim=1, keepdim=True)[1] + gumbel_hard = torch.zeros_like( + gumbel_score, + memory_format=torch.legacy_contiguous_format).scatter_( + 1, index, 1.0) + gumbel_hard = gumbel_hard - gumbel_score.detach() + gumbel_score + # + gumbel_score = gumbel_score[:, 0].contiguous().view(N, + -1) # [N, H x W] + gumbel_hard = gumbel_hard[:, 0].contiguous().view(N, -1) + # sort by score + idx_true = torch.topk( + gumbel_score, self.num_keep, dim=1)[1] # [N, num_keep] + topk_mask = torch.zeros_like(gumbel_score).bool().fill_( + False).scatter_(1, idx_true, True) # [N, H x W] + return topk_mask, gumbel_hard, keep_score[:, 0] + + def sample(self, x, topk_mask, gumbel_hard): + N, D, H, W = x.size() + x = x.contiguous().view(N, D, -1) # [N, D, HxW] + x = x * gumbel_hard.unsqueeze(1) + x = x.transpose(1, 2) # [N, HxW, D] + x = x[topk_mask].contiguous().view(N, -1, D) # [N, num_keep, D] + x = drop_grid( + x.transpose(1, 2), drop_range=(0.0, 0.2), + training=self.training).transpose(1, 2) + return x + + def random_sample(self, x): + N, D, H, W = x.size() + x = x.contiguous().view(N, D, -1) # [N, D, HxW] + x = x.transpose(1, 2) # [N, HxW, D] + # generate random mask + idx_true = torch.topk( + torch.rand_like(x[:, :, 0]), self.num_keep, + dim=1)[1] # [N, num_keep] + topk_mask = torch.zeros_like(x[:, :, 0]).bool().fill_(False).scatter_( + 1, idx_true, True) # [N, H x W] + # apply the mask + x = x[topk_mask].contiguous().view(N, -1, D) # [N, num_keep, D] + x = drop_grid( + x.transpose(1, 2), drop_range=(0.0, 0.2), + training=self.training).transpose(1, 2) + return x, topk_mask + + def restore(self, x, topk_mask, src): + """ + x: [N, D, H, W] + topk_mask: [N, HxW] + src: [N, num_keep, D] + """ + N, D, H, W = x.size() + x = drop_grid(x, drop_range=(0.2, 0.8), training=self.training) + x = x.contiguous().view(N, D, -1).transpose(1, 2) # [N, HxW, D] + x = x.masked_scatter(topk_mask.unsqueeze(-1), src) + x = x.transpose(1, 2).contiguous().view(N, D, H, W) + x = self.dropout(self.diffusion(x)) + return x + + +class FPNTrans(nn.Module): + + def __init__(self, + trans_layers=2, + inner_channels=256, + img_size=(896, 896), + inner_vit=False, + out_sampling=False): + super(FPNTrans, self).__init__() + self.cnn = convnext_tiny(pretrained=True, in_22k=True) + self.dims = 
self.cnn.dims + self.img_size = img_size + # FPN in DB + self.up5 = nn.Upsample(scale_factor=2, mode='nearest') + self.up4 = nn.Upsample(scale_factor=2, mode='nearest') + self.up3 = nn.Upsample(scale_factor=2, mode='nearest') + + self.in5 = nn.Conv2d(self.dims[-1], inner_channels, 1, bias=False) + self.in4 = nn.Conv2d(self.dims[-2], inner_channels, 1, bias=False) + self.in3 = nn.Conv2d(self.dims[-3], inner_channels, 1, bias=False) + self.in2 = nn.Conv2d(self.dims[-4], inner_channels, 1, bias=False) + + self.out5 = nn.Sequential( + nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=False), + nn.Upsample(scale_factor=8, mode='nearest')) + self.out4 = nn.Sequential( + nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=False), + nn.Upsample(scale_factor=4, mode='nearest')) + self.out3 = nn.Sequential( + nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=False), + nn.Upsample(scale_factor=2, mode='nearest')) + self.out2 = nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=False) + self.inner_vit = inner_vit + if inner_vit: + # mini vit + self.num_keep1 = (self.img_size[0] // 64)**2 + self.gumble_sample1 = GumbelSample( + inner_channels, num_keep=self.num_keep1) + self.pos_emb1 = nn.Parameter( + torch.randn(inner_channels, self.img_size[0] // 32, + self.img_size[1] // 32)) + trunc_normal_(self.pos_emb1, std=.02) + self.mini_vit = nn.Sequential(*[ + ResidualAttentionBlock( + inner_channels, 4, expand_ratio=2, init_values=0.1) + for _ in range(trans_layers) + ]) + self.dropout_pos = nn.Dropout(0.1) + if out_sampling: + # sample for co-attention + self.num_keep2 = (self.img_size[0] // 64)**2 + self.gumble_sample2 = GumbelSample( + inner_channels, num_keep=self.num_keep2) + self.pos_emb2 = nn.Parameter( + torch.randn(inner_channels, self.img_size[0] // 4, + self.img_size[1] // 4)) + trunc_normal_(self.pos_emb2, std=.02) + self.out_sampling = out_sampling + + self.drop_path = DropPath(0.1) + + def forward(self, x): + ms_features = self.cnn(x) + c2, c3, c4, c5 = ms_features + in5 = self.in5(c5) + in4 = self.in4(c4) + in3 = self.in3(c3) + in2 = self.in2(c2) + N, D5, H5, W5 = in5.size() + if self.inner_vit: + # random sample + keep_score = None + in5_pos = self.dropout_pos(in5 + self.pos_emb1.unsqueeze(0)) + in5_pos_in, topk_mask = self.gumble_sample1.random_sample(in5_pos) + in5_pos_in = in5_pos_in.transpose(0, 1) # [num_keep1, N, D5] + in5_pos_out = self.mini_vit(in5_pos_in).permute( + 1, 2, 0) # [N, D5, num_keep1] + in5 = self.gumble_sample1.restore(in5, topk_mask, + in5_pos_out.transpose(1, 2)) + else: + keep_score = None + # FPN for fused multi-scale visual feature + out4 = self.up5(in5) + self.drop_path(in4) + out3 = self.up4(out4) + self.drop_path(in3) + out2 = self.up3(out3) + self.drop_path(in2) + p5 = self.out5(in5) + p4 = self.out4(out4) + p3 = self.out3(out3) + p2 = self.out2(out2) + feat_ms = torch.cat((p5, p4, p3, p2), 1) + ret_dict = dict( + feat_ms=feat_ms, + keep_score=keep_score, + ) + if self.out_sampling: + # gumbel sampling + topk_mask2, gumbel_hard2, keep_score2 = self.gumble_sample2( + feat_ms) + feat_ms_pos = self.dropout_pos(feat_ms + + self.pos_emb2.unsqueeze(0)) + feat_ms_pos_sampled = self.gumble_sample2.sample( + feat_ms_pos, topk_mask2, + gumbel_hard2).transpose(0, 1) # [num_keep2, N, inner_c] + ret_dict_sup = dict( + feat_ms_pos_sampled=feat_ms_pos_sampled, + sampler=self.gumble_sample2, + keep_score2=keep_score2, + ) + ret_dict.update(ret_dict_sup) + return ret_dict diff --git 
a/modelscope/models/multi_modal/vldoc/convnext.py b/modelscope/models/multi_modal/vldoc/convnext.py new file mode 100644 index 00000000..3cf7cd63 --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/convnext.py @@ -0,0 +1,168 @@ +# The implementation is borrowed and partly modified from ConvNext, +# made publicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt. + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath, trunc_normal_ +from timm.models.registry import register_model + + +class Block(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d( + dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, + 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter( + layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
+ """ + + def __init__( + self, + in_chans=3, + depths=[3, 3, 9, 3], + dims=[96, 192, 384, 768], + drop_path_rate=0., + layer_scale_init_value=1e-6, + ): + super().__init__() + + self.downsample_layers = nn.ModuleList( + ) # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm(dims[0], eps=1e-6, data_format='channels_first')) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format='channels_first'), + nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList( + ) # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + cur = 0 + for i in range(4): + stage = nn.Sequential(*[ + Block( + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + # self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer + self.dims = dims + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + xs = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + xs.append(x) + # x = x.permute(0, 2, 3, 1) # [N, H, W, C] + # x = self.norm(x) + # x = x.permute(0, 3, 1, 2) # [N, C, H, W] + return tuple(xs) + + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + + def __init__(self, + normalized_shape, + eps=1e-6, + data_format='channels_last'): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ['channels_last', 'channels_first']: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == 'channels_last': + return F.layer_norm(x, self.normalized_shape, self.weight, + self.bias, self.eps) + elif self.data_format == 'channels_first': + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +@register_model +def convnext_tiny(pretrained=False, in_22k=False, **kwargs): + model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs) + return model diff --git a/modelscope/models/multi_modal/vldoc/model.py b/modelscope/models/multi_modal/vldoc/model.py new file mode 100644 index 00000000..5b21bf10 --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/model.py @@ -0,0 +1,433 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import copy +import logging +import math +import os +import re +import sys + +import json +import torch +import torch.distributed as dist +import torch.nn as nn +from torchvision.ops import roi_align + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.multi_modal.vldoc.conv_fpn_trans import FPNTrans +from modelscope.models.multi_modal.vldoc.modeling_layout_roberta import ( + LayoutRobertaModel, LayoutRobertaPreTrainedModel) +from modelscope.models.multi_modal.vldoc.transformer_local import ( + TransformerDecoder, TransformerDecoderLayer) +from modelscope.utils.constant import ModeKeys, ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['VLDocForDocVLEmbedding'] + + +class GeoVLDocModelOutputs(object): + + def __init__( + self, + text_features, + text_mm_features, + block_vis_features, + block_vis_mm_features, + image_mm_features, + ): + # [batch size, sequence length, hidden size] + self.text_features = text_features + # [batch size, sequence length, hidden size] + self.text_mm_features = text_mm_features + # [batch size, block num, hidden size] + self.block_vis_features = block_vis_features + # [batch size, block num, hidden size] + self.block_vis_mm_features = block_vis_mm_features + # [batch size, hidden size] + self.image_mm_features = image_mm_features + + +class GeoVLDocModel(LayoutRobertaPreTrainedModel): + + def __init__(self, config, hard_negtive_sampling=False): + super().__init__(config) + self.config = config + self.hard_negtive_sampling = hard_negtive_sampling + + if getattr(self.config, 'architectures', None): + if self.config.architectures[0] == 'LayoutRobertaModel': + self.text_encoder = LayoutRobertaModel(config) + else: + self.text_encoder = LayoutRobertaModel(config) + else: + self.text_encoder = LayoutRobertaModel(config) + self.visual_encoder = FPNTrans( + img_size=self.config.image_size, inner_vit=False) + self.pool = nn.AdaptiveAvgPool2d([1, 1]) + self.vis_linear = nn.Linear(256, self.config.hidden_size) + + cross_modal_text_layer = TransformerDecoderLayer( + self.config.hidden_size, + self.config.num_attention_heads, + self.config.intermediate_size, + self_attn=True) + self.cross_modal_text = TransformerDecoder(cross_modal_text_layer, 1) + + cross_modal_visual_layer = TransformerDecoderLayer( + self.config.hidden_size, + self.config.num_attention_heads, + self.config.intermediate_size, + self_attn=True) + self.cross_modal_visual = TransformerDecoder(cross_modal_visual_layer, + 1) + + self.init_weights() + + def from_pretrained(self, ckpt_path: str): + state_dict = torch.load(ckpt_path, map_location='cpu') + state_dict_new = {} + for k, v in state_dict.items(): + k = k.replace('geo_vl_doc_model.', '') + state_dict_new[k] = v + self.load_state_dict(state_dict_new) + + def forward(self, + input_ids=None, + image=None, + bbox=None, + bbox_4p_normalized=None, + attention_mask=None, + first_token_idxes=None, + first_token_idxes_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + + batch_size, seq_len = input_ids.shape + + return_dict = ( + return_dict + if return_dict is not None else self.config.use_return_dict) + + kwargs['line_bbox'] = bbox + # ################ get text 
representation ################ + if self.config.architectures[0] == 'LayoutRobertaModel': + outputs = self.text_encoder( + input_ids, + bbox=bbox_4p_normalized, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs) + else: + outputs = self.text_encoder( + input_ids, + bbox=bbox_4p_normalized, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs) + + # sequence_output: [batch_size, seq_len, hidden_size] + # pooled_output: [batch_size, hidden_size] + sequence_output, pooled_output = outputs[:2] + + # ################ get visual representation ################ + _, num_first = first_token_idxes.shape + B_batch_dim = torch.arange( + 0, batch_size, + device=input_ids.device).reshape(batch_size, + 1).expand(batch_size, num_first) + + feature_bbox = bbox[B_batch_dim, first_token_idxes] + _, block_num, _ = feature_bbox.shape + + visual_out = self.visual_encoder(image) + batch_idxs = torch.arange( + 0, batch_size, device=sequence_output.device).reshape( + batch_size, 1).expand(batch_size, block_num).unsqueeze(-1) + + # [batch_size*block_num, 5] + batch_idx_with_bbox = torch.cat( + (batch_idxs, feature_bbox), + 2).reshape(batch_size * block_num, + 5).to(dtype=visual_out['feat_ms'].dtype) + + if visual_out['feat_ms'].dtype == torch.float16: + # [batch_size*block_num, 256, 1, 1] + blk_vis_features = roi_align( + visual_out['feat_ms'].to(torch.float32), + batch_idx_with_bbox.to(torch.float32), + 1, + spatial_scale=visual_out['feat_ms'].size(-1) / 1000.0) + blk_vis_features = blk_vis_features.to( + dtype=visual_out['feat_ms'].dtype) + else: + blk_vis_features = roi_align( + visual_out['feat_ms'], + batch_idx_with_bbox.to(torch.float32), + 1, + spatial_scale=visual_out['feat_ms'].size(-1) / 1000.0) + + # [batch_size*block_num, 256] + blk_vis_features = blk_vis_features.squeeze(2).squeeze(2).reshape( + batch_size, block_num, 256) + + # visual block features: + # blk_vis_features: [batch_size, block_num, hidden_size] + blk_vis_features = self.vis_linear(blk_vis_features) + blk_vis_features = blk_vis_features * first_token_idxes_mask.unsqueeze( + 2) + # [batch_size, 256] + full_img_features = self.pool( + visual_out['feat_ms']).squeeze(2).squeeze(2) + # [batch_size, hidden_size] + full_img_features = self.vis_linear(full_img_features).unsqueeze(1) + + # ################ multi-modal fusion ################ + + # cross attention inputs + vis_inps = torch.cat((full_img_features, blk_vis_features), 1) + + glb_feat_attn = torch.ones((batch_size, 1)).to(input_ids.device) + + vis_mask = torch.cat((glb_feat_attn, first_token_idxes_mask), 1) + + # When we use transformer in torch.nn, the input size is + # [seq_len, batch_size, hidden_size] + # In attention_mask, 1 denotes masked + new_attention_mask = (1 - attention_mask) > 0 + new_vis_mask = (1 - vis_mask) > 0 + + text_mm_feat = self.cross_modal_text( + tgt=sequence_output.transpose(0, 1), + memory=vis_inps.transpose(0, 1), + tgt_key_padding_mask=new_attention_mask, + memory_key_padding_mask=new_vis_mask) + + vis_mm_feat = self.cross_modal_visual( + tgt=vis_inps.transpose(0, 1), + memory=sequence_output.transpose(0, 1), + 
tgt_key_padding_mask=new_vis_mask, + memory_key_padding_mask=new_attention_mask, + ) + + # [batch_size, seq_len, hidden_size] + text_mm_feat = text_mm_feat.transpose(0, 1) + # [batch_size, 1+block_num, hidden_size] + vis_mm_feat = vis_mm_feat.transpose(0, 1) + + # image_mm_features = vis_mm_feat[:, 0, :] + block_vis_mm_features = vis_mm_feat[:, 1:] + + return GeoVLDocModelOutputs( + text_features=sequence_output, + text_mm_features=text_mm_feat, + block_vis_features=blk_vis_features, + block_vis_mm_features=block_vis_mm_features, + image_mm_features=vis_mm_feat, + ) + + +@MODELS.register_module(Tasks.document_vl_embedding, module_name=Models.vldoc) +class VLDocForDocVLEmbedding(TorchModel): + """ + Generate multi-modal document embeddings in segment-level and token-level. + + Args: + model_dir: + the path in model hub, e.g., 'damo/multi-modal_convnext-roberta-base_vldoc-embedding' + """ + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir=model_dir, *args, **kwargs) + + # Initialize the model. + from modelscope.models.multi_modal.vldoc.modeling_layout_roberta import LayoutRobertaConfig + model_cfg_path = os.path.join(model_dir, 'config.json') + logger.info('Loading config file from {}'.format(model_cfg_path)) + assert os.path.exists(model_cfg_path) + self.config = LayoutRobertaConfig.from_json_file(model_cfg_path) + self.doc_model = GeoVLDocModel(self.config) + + # restore the pretrained weight + model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + assert os.path.exists(model_path) + self.doc_model.from_pretrained(model_path) + logger.info('Loading model from {}'.format(model_path)) + + # Initialize the tokenizer. + from modelscope.models.multi_modal.vldoc.tokenization import VLDocXLMTokenizer + tokenizer_path = os.path.join(model_dir, ModelFile.TOKENIZER_FOLDER) + self.tokenizer = VLDocXLMTokenizer.from_pretrained(tokenizer_path) + + # place the model + self.device = 'cuda:{}'.format(int(os.environ.get( + 'LOCAL_RANK', 0))) if torch.cuda.is_available() else 'cpu' + if torch.cuda.is_available(): + self.doc_model.to(self.device) + logger.info('Use GPU {} for finetuning & inference'.format( + int(os.environ.get('LOCAL_RANK', 0)))) + else: + self.doc_model.float() + logger.info('Use CPU for finetuning & inference') + + def forward(self, + input_ids=None, + image=None, + bbox=None, + bbox_4p_normalized=None, + attention_mask=None, + first_token_idxes=None, + first_token_idxes_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + """ + Args: + - input_ids: :math:`(B, T, E)`, the input tokens, where B is the batch size, + T is the max token size, E is the embedding dimension. + - image: :math:`(B, C, H, W)`, normalized images. + - bbox: :math:`(B, T, 4)`, segment boxes denoted by top-left and bottom-right + vertexes whose values are normalized to [0, 1000). + - bbox_4p_normalized: :math:`(B, T, 8)`, word boxes denoted by 4 vertexes, whose + values are normalized to [0, 1). + - attention_mask: :math:`(B, T)`, mask for input tokens, where 0 means masked. + - first_token_idxes: :math:`(B, S)`, indexes of the corresponding first tokens + of all segments, where S is the max segment size. + - first_token_idxes_mask: :math:`(B, S)`, mask for segments, where 0 means masked. 
+ Optional: + - line_rank_id: :math:`(B, T)`, orders of segments. + - line_rank_inner_id: :math:`(B, T)`, BIE-like tags. + + To be more specific, please refer to the class `TextLayoutSerializer` in + `modelscope/models/multi_modal/vldoc/processing.py`. + """ + + vldoc_outputs = self.doc_model( + input_ids=input_ids, + image=image, + bbox=bbox, + bbox_4p_normalized=bbox_4p_normalized, + attention_mask=attention_mask, + first_token_idxes=first_token_idxes, + first_token_idxes_mask=first_token_idxes_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs) + + return dict( + img_embedding=vldoc_outputs.image_mm_features, + text_embedding=vldoc_outputs.text_mm_features, + ) + + +def init_pretrained_weight( + model, + pretrained_model_path, + state_dict=None, + cache_dir=None, + init_backbone='roberta', +): + if state_dict is None: + state_dict = torch.load(pretrained_model_path, map_location='cpu') + + old_keys = [] + new_keys = [] + state_dict_keys = list(state_dict.keys()) + + if init_backbone == 'roberta': + for i in range(len(state_dict_keys)): + key = state_dict_keys[i] + new_key = None + + if key.startswith('roberta.'): + new_key = key.replace('roberta.', + 'geo_vl_doc_model.text_encoder.') + key = copy.deepcopy(new_key) + + if new_key: + old_keys.append(state_dict_keys[i]) + new_keys.append(new_key) + + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, True, + missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + start_prefix = '' + if not hasattr(model, 'geo_vl_doc_model') and any( + s.startswith('geo_vl_doc_model.') for s in state_dict.keys()): + start_prefix = 'geo_vl_doc_model.' + load(model, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info( + 'Weights of {} not initialized from pretrained model: {}'.format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info('Weights from pretrained model not used in {}: {}'.format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError( + 'Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, '\n\t'.join(error_msgs))) + + return model diff --git a/modelscope/models/multi_modal/vldoc/modeling_layout_roberta.py b/modelscope/models/multi_modal/vldoc/modeling_layout_roberta.py new file mode 100644 index 00000000..c47294e3 --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/modeling_layout_roberta.py @@ -0,0 +1,1140 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright 2021-2022 The Alibaba DAMO Duguang Team Authors. All rights reserved. 
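A shape-only sketch of the inputs documented for `VLDocForDocVLEmbedding.forward` above; the batch size, token length, segment count, vocabulary size and image resolution are illustrative values, and the commented-out call assumes a locally available copy of the model id quoted in the class docstring:

```python
import torch

B, T, S = 2, 512, 64  # illustrative batch size, max token length, max segment count
dummy_inputs = dict(
    input_ids=torch.randint(0, 1000, (B, T)),               # token ids (placeholder vocab size)
    image=torch.randn(B, 3, 896, 896),                       # normalized images (size set by config)
    bbox=torch.randint(0, 1000, (B, T, 4)),                  # segment boxes in [0, 1000)
    bbox_4p_normalized=torch.rand(B, T, 8),                  # word boxes, 4 vertexes in [0, 1)
    attention_mask=torch.ones(B, T, dtype=torch.long),       # 0 means masked
    first_token_idxes=torch.zeros(B, S, dtype=torch.long),   # first-token index per segment
    first_token_idxes_mask=torch.ones(B, S),                 # 0 means masked segment
)
# model = VLDocForDocVLEmbedding(model_dir)
# # model_dir: e.g. a local snapshot of 'damo/multi-modal_convnext-roberta-base_vldoc-embedding'
# out = model(**dummy_inputs)  # -> {'img_embedding': ..., 'text_embedding': ...}
```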
+ +import math +import os + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from transformers.activations import ACT2FN, gelu +from transformers.configuration_utils import PretrainedConfig +from transformers.file_utils import (add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, MaskedLMOutput, + MultipleChoiceModelOutput, QuestionAnsweringModelOutput, + SequenceClassifierOutput, TokenClassifierOutput) +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) +from transformers.utils import logging + +logger = logging.get_logger() + + +class LayoutRobertaConfig(PretrainedConfig): + model_type = 'layoutroberta' + + def __init__(self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + bbox_scale=100.0, + pe_type='crel', + position_embedding_type='absolute', + use_cache=True, + classifier_dropout=None, + **kwargs): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + attention_probs_dropout_prob=attention_probs_dropout_prob, + max_position_embeddings=max_position_embeddings, + type_vocab_size=type_vocab_size, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + self.bbox_scale = bbox_scale + self.pe_type = pe_type + + +class PositionalEmbedding1D(nn.Module): + # Reference: + # https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py#L15 + + def __init__(self, demb): + super(PositionalEmbedding1D, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000**(torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + seq_size = pos_seq.size() + + if len(seq_size) == 2: + b1, b2 = seq_size + sinusoid_inp = pos_seq.view(b1, b2, 1) * self.inv_freq.view( + 1, 1, self.demb // 2) + elif len(seq_size) == 3: + b1, b2, b3 = seq_size + sinusoid_inp = pos_seq.view(b1, b2, b3, 1) * self.inv_freq.view( + 1, 1, 1, self.demb // 2) + else: + raise ValueError(f'Invalid seq_size={len(seq_size)}') + + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + return pos_emb + + +class PositionalEmbedding2D(nn.Module): + + def __init__(self, demb, dim_bbox=8): + super(PositionalEmbedding2D, self).__init__() + + self.demb = demb + self.dim_bbox = dim_bbox + + self.x_pos_emb = PositionalEmbedding1D(demb // dim_bbox) + self.y_pos_emb = PositionalEmbedding1D(demb // dim_bbox) + + inv_freq = 1 / (10000**(torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, bbox): + # bbox: [seq_length, batch_size, dim_bbox] + stack = [] + for i in range(self.dim_bbox): 
+ if i % 2 == 0: + stack.append(self.x_pos_emb(bbox[..., i])) + else: + stack.append(self.y_pos_emb(bbox[..., i])) + bbox_pos_emb = torch.cat(stack, dim=-1) + return bbox_pos_emb + + +class LayoutRobertaEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # layout-related embeddings + self.line_rank_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size) + + self.line_rank_inner_embeddings = nn.Embedding(4, config.hidden_size) + + self.x_position_embeddings = nn.Embedding( + config.max_2d_position_embeddings, config.coordinate_size) + self.y_position_embeddings = nn.Embedding( + config.max_2d_position_embeddings, config.coordinate_size) + self.h_position_embeddings = nn.Embedding( + config.max_2d_position_embeddings, config.shape_size) + self.w_position_embeddings = nn.Embedding( + config.max_2d_position_embeddings, config.shape_size) + + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + + if version.parse(torch.__version__) > version.parse('1.6.0'): + self.register_buffer( + 'token_type_ids', + torch.zeros(self.position_ids.size(), dtype=torch.long), + persistent=False, + ) + + if config.pe_type == 'pdpdq_ws': + dim_bbox_sinusoid_emb = config.hidden_size + dim_bbox_projection = config.hidden_size + elif config.pe_type == 'crel': + dim_bbox_sinusoid_emb = config.hidden_size // 4 + dim_bbox_projection = config.hidden_size // config.num_attention_heads + else: + raise ValueError(f'Unknown config.pe_type={config.pe_type}') + + self.bbox_sinusoid_emb = PositionalEmbedding2D( + dim_bbox_sinusoid_emb, dim_bbox=8) + self.bbox_projection = nn.Linear( + dim_bbox_sinusoid_emb, dim_bbox_projection, bias=False) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, + config.hidden_size, + padding_idx=self.padding_idx) + + def _cal_spatial_position_embeddings(self, bbox): + try: + left_position_embeddings = self.x_position_embeddings(bbox[:, :, + 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, + 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, + 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, + 3]) + except IndexError as e: + raise IndexError( + 'The :obj:`bbox`coordinate values should be within 0-1000 range.' 
+ ) from e + + h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] + - bbox[:, :, 1]) + w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] + - bbox[:, :, 0]) + + spatial_position_embeddings = torch.cat( + [ + left_position_embeddings, + upper_position_embeddings, + right_position_embeddings, + lower_position_embeddings, + h_position_embeddings, + w_position_embeddings, + ], + dim=-1, + ) + return spatial_position_embeddings + + def forward(self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + **kwargs): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids( + input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds( + inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor + # where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when + # tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, 'token_type_ids'): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, + dtype=torch.long, + device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + if 'line_bbox' in kwargs: + embeddings += self._cal_spatial_position_embeddings( + kwargs['line_bbox']) + + if 'line_rank_id' in kwargs: + embeddings += self.line_rank_embeddings(kwargs['line_rank_id']) + + if 'line_rank_inner_id' in kwargs: + embeddings += self.line_rank_inner_embeddings( + kwargs['line_rank_inner_id']) + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def calc_bbox_pos_emb(self, bbox, pe_type): + # bbox_t: [seq_length, batch_size, dim_bbox] + bbox_t = bbox.transpose(0, 1) + + if pe_type == 'pdpdq_ws': + bbox_pos = bbox_t + elif pe_type == 'crel': + # bbox_pos: [seq_length, seq_length, batch_size, dim_bbox] + bbox_pos = bbox_t[None, :, :, :] - bbox_t[:, None, :, :] + else: + raise ValueError(f'Unknown pe_type={pe_type}') + + bbox_pos_emb = self.bbox_sinusoid_emb(bbox_pos) + bbox_pos_emb = self.bbox_projection(bbox_pos_emb) + + return bbox_pos_emb + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. + We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, + sequence_length + self.padding_idx + 1, + dtype=torch.long, + device=inputs_embeds.device) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta +class LayoutRobertaSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + self.pe_type = config.pe_type + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + bbox_pos_emb=None, + bbox_pos_mask=None, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
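The `crel` branch of `LayoutRobertaSelfAttention` below adds a pairwise bbox bias to the attention scores; the following shape sketch traces how that bias is built and consumed, with an ordinary `nn.Linear` standing in for `PositionalEmbedding2D` plus `bbox_projection` and with illustrative sizes:

```python
import torch

B, T, n_head, d_head, dim_bbox = 2, 16, 12, 64, 8

bbox = torch.rand(B, T, dim_bbox)                    # normalized box coordinates
bbox_t = bbox.transpose(0, 1)                        # [T, B, dim_bbox]
rel = bbox_t[None, :, :, :] - bbox_t[:, None, :, :]  # pairwise differences, [T, T, B, dim_bbox]

# Stand-in for the sinusoidal 2D embedding + projection: map each relative box to d_head dims.
proj = torch.nn.Linear(dim_bbox, d_head, bias=False)
bbox_pos_emb = proj(rel)                             # [T, T, B, d_head]
bbox_pos_emb = bbox_pos_emb.permute(2, 0, 1, 3)      # [B, T, T, d_head]

query = torch.rand(B, n_head, T, d_head)
bbox_pos_scores = torch.einsum('bnid,bijd->bnij', query, bbox_pos_emb)
print(bbox_pos_scores.shape)                         # torch.Size([2, 12, 16, 16])
```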
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if (self.position_embedding_type == 'relative_key' + or self.position_embedding_type == 'relative_key_query'): + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = ( + attention_scores + relative_position_scores_query + + relative_position_scores_key) + + # bbox positional encoding + batch_size, n_head, seq_length, d_head = query_layer.shape + if self.pe_type == 'pdpdq_ws': + head_q_pos = self.query(bbox_pos_emb) + head_k_pos = self.key(bbox_pos_emb) + head_q_pos = head_q_pos.view(seq_length, batch_size, n_head, + d_head) + head_k_pos = head_k_pos.view(seq_length, batch_size, n_head, + d_head) + head_q_pos = head_q_pos.permute([1, 2, 0, 3]) + head_k_pos = head_k_pos.permute([1, 2, 0, 3]) + + bbox_pos_scores_1 = torch.einsum( + 'bnid,bnjd->bnij', + (torch.mul(query_layer, head_q_pos), 
head_k_pos)) + bbox_pos_scores_2 = torch.einsum('bnid,bnjd->bnij', + (head_q_pos, head_k_pos)) + bbox_pos_scores = bbox_pos_scores_1 + bbox_pos_scores_2 + elif self.pe_type == 'crel': + bbox_pos_emb = bbox_pos_emb.view(seq_length, seq_length, + batch_size, d_head) + bbox_pos_emb = bbox_pos_emb.permute([2, 0, 1, 3]) + bbox_pos_scores = torch.einsum('bnid,bijd->bnij', + (query_layer, bbox_pos_emb)) + else: + raise ValueError(f'Unknown self.pe_type={self.pe_type}') + + if bbox_pos_mask is not None: + # bbox_pos_mask is [batch_size, seq_length] + bbox_pos_mask = 1 - bbox_pos_mask + # [batch_size, 1, seq_length] + M1 = bbox_pos_mask.unsqueeze(1) + # [batch_size, seq_length, 1] + MT = M1.permute(0, 2, 1) + # [batch_size, seq_length, seq_length] + bbox_pos_mask_final = torch.matmul( + MT.to(bbox_pos_scores.dtype), M1.to(bbox_pos_scores.dtype)) + else: + bbox_pos_mask_final = None + + if bbox_pos_mask_final is not None: + bbox_pos_scores = torch.mul(bbox_pos_scores, + bbox_pos_mask_final.unsqueeze(1)) + + # [batch_size, d_head, seq_length, seq_length] + attention_scores = attention_scores + bbox_pos_scores + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class LayoutRobertaSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +class LayoutRobertaAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = LayoutRobertaSelfAttention( + config, position_embedding_type=position_embedding_type) + self.output = LayoutRobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + 
self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + bbox_pos_emb=None, + bbox_pos_mask=None, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + bbox_pos_emb=bbox_pos_emb, + bbox_pos_mask=bbox_pos_mask, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class LayoutRobertaIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class LayoutRobertaOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Roberta +class LayoutRobertaLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = LayoutRobertaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = LayoutRobertaAttention( + config, position_embedding_type='absolute') + self.intermediate = LayoutRobertaIntermediate(config) + self.output = LayoutRobertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + bbox_pos_emb=None, + bbox_pos_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + 
output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + bbox_pos_emb=bbox_pos_emb, + bbox_pos_mask=bbox_pos_mask, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise ValueError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers' + ' by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Roberta +class LayoutRobertaEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ + LayoutRobertaLayer(config) for _ in range(config.num_hidden_layers) + ]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + bbox_pos_emb=None, + bbox_pos_mask=None, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' 
+ ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + bbox_pos_emb, + bbox_pos_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + bbox_pos_emb, + bbox_pos_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class LayoutRobertaPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class LayoutRobertaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = LayoutRobertaConfig + base_model_prefix = 'layoutroberta' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LayoutRobertaEncoder): + module.gradient_checkpointing = value + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! 
+ self._keys_to_ignore_on_save = [ + k for k in self._keys_to_ignore_on_save + if k not in del_keys_to_ignore + ] + self._keys_to_ignore_on_load_missing = [ + k for k in self._keys_to_ignore_on_load_missing + if k not in del_keys_to_ignore + ] + + +class LayoutRobertaModel(LayoutRobertaPreTrainedModel): + """ + + BROS + Roberta + + """ + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = LayoutRobertaEmbeddings(config) + self.encoder = LayoutRobertaEncoder(config) + + self.pooler = LayoutRobertaPooler( + config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + # Copied from transformers.models.bert.modeling_bert.BertModel.forward + def forward(self, + input_ids=None, + bbox=None, + bbox_mask=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple + having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
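+ bbox (`torch.Tensor`):
+ Token-level bounding boxes. They are scaled by `config.bbox_scale` and converted into the layout
+ positional encoding via `embeddings.calc_bbox_pos_emb` (controlled by `config.pe_type`) before being
+ added to the self-attention scores.
+ bbox_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask restricting which token pairs receive the layout positional scores; it is forwarded to the
+ attention layers as `bbox_pos_mask`.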
+ """ + output_attentions = ( + output_attentions if output_attentions is not None else + self.config.output_attentions) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, 'token_type_ids'): + buffered_token_type_ids = self.embeddings.token_type_ids[:, : + seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand( + batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + **kwargs) + + scaled_bbox = bbox * self.config.bbox_scale + bbox_pos_emb = self.embeddings.calc_bbox_pos_emb( + scaled_bbox, self.config.pe_type) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + bbox_pos_emb=bbox_pos_emb, + bbox_pos_mask=bbox_mask, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +def create_position_ids_from_input_ids(input_ids, + padding_idx, + past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + + past_key_values_length) * mask + return incremental_indices.long() + padding_idx diff --git a/modelscope/models/multi_modal/vldoc/processing.py b/modelscope/models/multi_modal/vldoc/processing.py new file mode 100644 index 00000000..afef8bdb --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/processing.py @@ -0,0 +1,538 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" +Processor class for GeoLayoutLM. 
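+
+It bundles an `ImageProcessor` (image loading and normalization), a `TextLayoutSerializer`
+(turning OCR results or pre-tokenized inputs into padded tensor sequences) and a `Processor`
+that combines both into a model-ready batch.
+
+A minimal usage sketch of the OCR-info path (illustrative only; it assumes an XLM-RoBERTa
+style tokenizer and a local image file `page.png`):
+
+```
+from transformers import XLMRobertaTokenizer
+
+tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
+img_processor = ImageProcessor(do_resize=True, apply_ocr=False)
+processor = Processor(
+    max_seq_length=512,
+    max_block_num=256,
+    img_processor=img_processor,
+    tokenizer=tokenizer)
+ocr_infos = [[{
+    'text': 'Hello world',
+    'box': [10, 10, 120, 40],
+    'words': [{'text': 'Hello', 'box': [10, 10, 60, 40]},
+              {'text': 'world', 'box': [65, 10, 120, 40]}]
+}]]
+batch = processor('page.png', ocr_infos=ocr_infos)
+# batch['image']: [1, 3, 768, 768]; the other entries are the padded token/bbox tensors.
+```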
+""" + +from collections import defaultdict +from typing import Dict, Iterable, List, Union + +import cv2 +import numpy as np +import PIL +import torch +from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from torchvision import transforms + +from modelscope.preprocessors.image import LoadImage + + +def custom_tokenize(tokenizer, text): + toks = tokenizer.tokenize('pad ' + text)[1:] + toks2 = toks[1:] if len(toks) > 0 and toks[0] == '▁' else toks + return toks2 + + +class ImageProcessor(object): + r""" + Construct a GeoLayoutLM image processor + Args: + do_preprocess (`bool`): whether to do preprocess to unify the image format, + resize and convert to tensor. + do_rescale: only works when we disable do_preprocess. + """ + + def __init__(self, + do_preprocess: bool = True, + do_resize: bool = False, + image_size: Dict[str, int] = None, + do_rescale: bool = False, + rescale_factor: float = 1. / 255, + do_normalize: bool = True, + image_mean: Union[float, Iterable[float]] = None, + image_std: Union[float, Iterable[float]] = None, + apply_ocr: bool = True, + **kwargs) -> None: + self.do_preprocess = do_preprocess + self.do_resize = do_resize + self.size = image_size if image_size is not None else { + 'height': 768, + 'width': 768 + } + self.do_rescale = do_rescale and (not do_preprocess) + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + image_mean = IMAGENET_DEFAULT_MEAN if image_mean is None else image_mean + image_std = IMAGENET_DEFAULT_STD if image_std is None else image_std + self.image_mean = (image_mean, image_mean, image_mean) if isinstance( + image_mean, float) else image_mean + self.image_std = (image_std, image_std, image_std) if isinstance( + image_std, float) else image_std + self.apply_ocr = apply_ocr + self.kwargs = kwargs + + self.totensor = transforms.ToTensor() + + def preprocess(self, image: Union[np.ndarray, PIL.Image.Image]): + """ unify the image format, resize and convert to tensor. + """ + image = LoadImage.convert_to_ndarray(image)[:, :, ::-1] + size_raw = image.shape[:2] + if self.do_resize: + image = cv2.resize(image, + (self.size['width'], self.size['height'])) + # convert to pytorch tensor + image_pt = self.totensor(image) + return image_pt, size_raw + + def __call__(self, images: Union[list, np.ndarray, PIL.Image.Image, str]): + """ + Args: + images: list of np.ndarrays, PIL images or image tensors. + """ + if not isinstance(images, list): + images = [images] + sizes_raw = [] + if self.do_preprocess: + for i in range(len(images)): + images[i], size_raw = self.preprocess(images[i]) + sizes_raw.append(size_raw) + images_pt = torch.stack(images, dim=0) # [b, c, h, w] + if self.do_rescale: + images_pt = images_pt * self.rescale_factor + if self.do_normalize: + mu = torch.tensor(self.image_mean).view(1, 3, 1, 1) + std = torch.tensor(self.image_std).view(1, 3, 1, 1) + images_pt = (images_pt - mu) / (std + 1e-8) + + # TODO: apply OCR + ocr_infos = None + if self.apply_ocr: + raise NotImplementedError('OCR service is not available yet!') + if len(sizes_raw) == 0: + sizes_raw = None + data = { + 'images': images_pt, + 'ocr_infos': ocr_infos, + 'sizes_raw': sizes_raw + } + return data + + +class OCRUtils(object): + + def __init__(self): + self.version = 'v0' + + def __call__(self, ocr_infos): + """ + sort boxes, filtering or other preprocesses + should return sorted ocr_infos + """ + raise NotImplementedError + + +def bound_box(box, height, width): + # box: [x_tl, y_tl, x_br, y_br] or ... 
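+ # Clamp every coordinate into the image: even indices are x values (clipped to [0, width]),
+ # odd indices are y values (clipped to [0, height]),
+ # e.g. bound_box([-3, 5, 900, 700], height=768, width=768) -> [0, 5, 768, 700].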
+ assert len(box) == 4 or len(box) == 8 + for i in range(len(box)): + if i & 1: + box[i] = max(0, min(box[i], height)) + else: + box[i] = max(0, min(box[i], width)) + return box + + +def bbox2pto4p(box2p): + box4p = [ + box2p[0], box2p[1], box2p[2], box2p[1], box2p[2], box2p[3], box2p[0], + box2p[3] + ] + return box4p + + +def bbox4pto2p(box4p): + box2p = [ + min(box4p[0], box4p[2], box4p[4], box4p[6]), + min(box4p[1], box4p[3], box4p[5], box4p[7]), + max(box4p[0], box4p[2], box4p[4], box4p[6]), + max(box4p[1], box4p[3], box4p[5], box4p[7]), + ] + return box2p + + +def stack_tensor_dict(tensor_dicts: List[Dict[str, torch.Tensor]]): + one_dict = defaultdict(list) + for td in tensor_dicts: + for k, v in td.items(): + one_dict[k].append(v) + res_dict = {} + for k, v in one_dict.items(): + res_dict[k] = torch.stack(v, dim=0) + return res_dict + + +class TextLayoutSerializer(object): + + def __init__(self, + max_seq_length: int, + max_block_num: int, + tokenizer, + width=768, + height=768, + use_roberta_tokenizer: bool = True, + ocr_utils: OCRUtils = None): + self.version = 'v0' + + self.max_seq_length = max_seq_length + self.max_block_num = max_block_num + self.tokenizer = tokenizer + self.width = width + self.height = height + self.use_roberta_tokenizer = use_roberta_tokenizer + self.ocr_utils = ocr_utils + + self.pad_token_id = tokenizer.pad_token_id + self.cls_token_id = tokenizer.bos_token_id + self.sep_token_id = tokenizer.eos_token_id + self.unk_token_id = tokenizer.unk_token_id + self.cls_bbs_word = [0.0] * 8 + self.cls_bbs_line = [0] * 4 + + def label2seq(self, ocr_info: list, label_info: list): + raise NotImplementedError + + def serialize_single( + self, + ocr_info: list = None, + input_ids: list = None, + bbox_line: List[List] = None, + bbox_word: List[List] = None, + width: int = 768, + height: int = 768, + ): + r""" + Either ocr_info or (input_ids, bbox_line, bbox_word) + should be provided. + If (input_ids, bbox_line, bbox_word) is provided, + convinient plug into the serialization (customization) + is offered. The tokens must be organised by blocks and words. + Else, ocr_info must be provided, to be parsed + to sequences directly (the simplest way). + Args: + ocr_info: [ + {"text": "xx", "box": [a,b,c,d], + "words": [{"text": "x", "box": [e,f,g,h]}, ...]}, + ... + ] + bbox_line: the coordinate value should match the original image + (i.e., not be normalized). 
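+ Returns:
+ A dict of padded tensors: `input_ids`, `attention_mask`, `line_rank_id`, `line_rank_inner_id`
+ and `word_rank_id` of length `max_seq_length`; `first_token_idxes` and `first_token_idxes_mask`
+ of length `max_block_num`; `bbox_4p_normalized` (`[max_seq_length, 8]`, word boxes normalized
+ to [0, 1]); `bbox` (`[max_seq_length, 4]`, line boxes scaled to the [0, 1000) range).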
+ """ + if input_ids is not None: + assert len(input_ids) == len(bbox_line) + assert len(input_ids) == len(bbox_word) + input_ids, bbs_word, bbs_line, first_token_idxes, \ + line_rank_ids, line_rank_inner_ids, word_rank_ids = \ + self.halfseq2seq(input_ids, bbox_line, bbox_word, width, height) + else: + assert ocr_info is not None + input_ids, bbs_word, bbs_line, first_token_idxes, \ + line_rank_ids, line_rank_inner_ids, word_rank_ids = \ + self.ocr_info2seq(ocr_info, width, height) + + token_seq = {} + token_seq['input_ids'] = torch.ones( + self.max_seq_length, dtype=torch.int64) * self.pad_token_id + token_seq['attention_mask'] = torch.zeros( + self.max_seq_length, dtype=torch.int64) + token_seq['first_token_idxes'] = torch.zeros( + self.max_block_num, dtype=torch.int64) + token_seq['first_token_idxes_mask'] = torch.zeros( + self.max_block_num, dtype=torch.int64) + token_seq['bbox_4p_normalized'] = torch.zeros( + self.max_seq_length, 8, dtype=torch.float32) + token_seq['bbox'] = torch.zeros( + self.max_seq_length, 4, dtype=torch.float32) + token_seq['line_rank_id'] = torch.zeros( + self.max_seq_length, dtype=torch.int64) # start from 1 + token_seq['line_rank_inner_id'] = torch.ones( + self.max_seq_length, dtype=torch.int64) # 1 2 2 3 + token_seq['word_rank_id'] = torch.zeros( + self.max_seq_length, dtype=torch.int64) # start from 1 + + # expand using cls and sep tokens + sep_bbs_word = [width, height] * 4 + sep_bbs_line = [width, height] * 2 + input_ids = [self.cls_token_id] + input_ids + [self.sep_token_id] + bbs_line = [self.cls_bbs_line] + bbs_line + [sep_bbs_line] + bbs_word = [self.cls_bbs_word] + bbs_word + [sep_bbs_word] + + # assign + len_tokens = len(input_ids) + len_lines = len(first_token_idxes) + token_seq['input_ids'][:len_tokens] = torch.tensor(input_ids) + token_seq['attention_mask'][:len_tokens] = 1 + token_seq['first_token_idxes'][:len_lines] = torch.tensor( + first_token_idxes) + token_seq['first_token_idxes_mask'][:len_lines] = 1 + token_seq['line_rank_id'][1:len_tokens + - 1] = torch.tensor(line_rank_ids) + token_seq['line_rank_inner_id'][1:len_tokens - 1] = torch.tensor( + line_rank_inner_ids) + token_seq['line_rank_inner_id'] = token_seq[ + 'line_rank_inner_id'] * token_seq['attention_mask'] + token_seq['word_rank_id'][1:len_tokens + - 1] = torch.tensor(word_rank_ids) + + token_seq['bbox_4p_normalized'][:len_tokens, :] = torch.tensor( + bbs_word) + # word bbox normalization -> [0, 1] + token_seq['bbox_4p_normalized'][:, [0, 2, 4, 6]] = \ + token_seq['bbox_4p_normalized'][:, [0, 2, 4, 6]] / width + token_seq['bbox_4p_normalized'][:, [1, 3, 5, 7]] = \ + token_seq['bbox_4p_normalized'][:, [1, 3, 5, 7]] / height + + token_seq['bbox'][:len_tokens, :] = torch.tensor(bbs_line) + # line bbox -> [0, 1000) + token_seq['bbox'][:, + [0, 2]] = token_seq['bbox'][:, [0, 2]] / width * 1000 + token_seq['bbox'][:, + [1, 3]] = token_seq['bbox'][:, + [1, 3]] / height * 1000 + token_seq['bbox'] = token_seq['bbox'].long() + + return token_seq + + def ocr_info2seq(self, ocr_info: list, width: int, height: int): + input_ids = [] + bbs_word = [] + bbs_line = [] + first_token_idxes = [] + line_rank_ids = [] + line_rank_inner_ids = [] + word_rank_ids = [] + + early_stop = False + for line_idx, line in enumerate(ocr_info): + if line_idx == self.max_block_num: + early_stop = True + if early_stop: + break + lbox = line['box'] + lbox = bound_box(lbox, height, width) + is_first_word = True + for word_id, word_info in enumerate(line['words']): + wtext = word_info['text'] + wbox = word_info['box'] + 
wbox = bound_box(wbox, height, width) + wbox4p = bbox2pto4p(wbox) + if self.use_roberta_tokenizer: + wtokens = custom_tokenize(self.tokenizer, wtext) + else: + wtokens = self.tokenizer.tokenize(wtext) + wtoken_ids = self.tokenizer.convert_tokens_to_ids(wtokens) + if len(wtoken_ids) == 0: + wtoken_ids.append(self.unk_token_id) + n_tokens = len(wtoken_ids) + # reserve for cls and sep + if len(input_ids) + n_tokens > self.max_seq_length - 2: + early_stop = True + break # chunking early for long documents + if is_first_word: + first_token_idxes.append(len(input_ids) + 1) + input_ids.extend(wtoken_ids) + bbs_word.extend([wbox4p] * n_tokens) + bbs_line.extend([lbox] * n_tokens) + word_rank_ids.extend([word_id + 1] * n_tokens) + line_rank_ids.extend([line_idx + 1] * n_tokens) + if is_first_word: + if len(line_rank_inner_ids + ) > 0 and line_rank_inner_ids[-1] == 2: + line_rank_inner_ids[-1] = 3 + line_rank_inner_ids.extend([1] + (n_tokens - 1) * [2]) + is_first_word = False + else: + line_rank_inner_ids.extend(n_tokens * [2]) + if len(line_rank_inner_ids) > 0 and line_rank_inner_ids[-1] == 2: + line_rank_inner_ids[-1] = 3 + + return input_ids, bbs_word, bbs_line, first_token_idxes, line_rank_ids, \ + line_rank_inner_ids, word_rank_ids + + def halfseq2seq(self, input_ids: list, bbox_line: List[List], + bbox_word: List[List], width: int, height: int): + """ + for convinient plug into the serialization, given the 3 customized sequences. + They should not contain special tokens like [CLS] or [SEP]. + """ + bbs_word = [] + bbs_line = [] + first_token_idxes = [] + line_rank_ids = [] + line_rank_inner_ids = [] + word_rank_ids = [] + + n_real_tokens = len(input_ids) + lb_prev, wb_prev = None, None + line_id = 0 + word_id = 1 + for i in range(n_real_tokens): + lb_now = bbox_line[i] + wb_now = bbox_word[i] + line_start = lb_prev is None or lb_now != lb_prev + word_start = wb_prev is None or wb_now != wb_prev + lb_prev, wb_prev = lb_now, wb_now + + if len(lb_now) == 8: + lb_now = bbox4pto2p(lb_now) + assert len(lb_now) == 4 + lb_now = bound_box(lb_now, height, width) + if len(wb_now) == 4: + wb_now = bbox2pto4p(wb_now) + assert len(wb_now) == 8 + wb_now = bound_box(wb_now, height, width) + + bbs_word.append(wb_now) + bbs_line.append(lb_now) + + if word_start: + word_id += 1 + if line_start: + line_id += 1 + first_token_idxes.append(i + 1) + if len(line_rank_inner_ids + ) > 0 and line_rank_inner_ids[-1] == 2: + line_rank_inner_ids[-1] = 3 + line_rank_inner_ids.append(1) + word_id = 1 + else: + line_rank_inner_ids.append(2) + line_rank_ids.append(line_id) + word_rank_ids.append(word_id) + + if len(line_rank_inner_ids) > 0 and line_rank_inner_ids[-1] == 2: + line_rank_inner_ids[-1] = 3 + + return input_ids, bbs_word, bbs_line, first_token_idxes, \ + line_rank_ids, line_rank_inner_ids, word_rank_ids + + def __call__( + self, + ocr_infos: List[List] = None, + input_ids: list = None, + bboxes_line: List[List] = None, + bboxes_word: List[List] = None, + sizes_raw: list = None, + **kwargs, + ): + n_samples = len(ocr_infos) if ocr_infos is not None else len(input_ids) + if sizes_raw is None: + sizes_raw = [(self.height, self.width)] * n_samples + seqs = [] + if input_ids is not None: + assert len(input_ids) == len(bboxes_line) + assert len(input_ids) == len(bboxes_word) + for input_id, bbox_line, bbox_word, size_raw in zip( + input_ids, bboxes_line, bboxes_word, sizes_raw): + height, width = size_raw + token_seq = self.serialize_single(None, input_id, bbox_line, + bbox_word, width, height) + seqs.append(token_seq) + 
else: + assert ocr_infos is not None, 'For serialization, ocr_infos must not be NoneType!' + if self.ocr_utils is not None: + ocr_infos = self.ocr_utils(ocr_infos) + for ocr_info, size_raw in zip(ocr_infos, sizes_raw): + height, width = size_raw + token_seq = self.serialize_single( + ocr_info, width=width, height=height) + seqs.append(token_seq) + pt_seqs = stack_tensor_dict(seqs) + return pt_seqs + + +class Processor(object): + r"""Construct a GeoLayoutLM processor. + + Args: + max_seq_length: max length for token + max_block_num: max number of text lines (blocks or segments) + img_processor: type of ImageProcessor. + tokenizer: to tokenize strings. + use_roberta_tokenizer: Whether the tokenizer is originated from RoBerta tokenizer + (True by default). + ocr_utils: a tool to preprocess ocr_infos. + width: default width. It can be used only when all the images are of the same shape. + height: default height. It can be used only when all the images are of the same shape. + + In `serialize_from_tokens`, the 3 sequences (i.e., `input_ids`, `bboxes_line`, `bboxes_word`) + must not contain special tokens like [CLS] or [SEP]. + The boxes in `bboxes_line` and `bboxes_word` can be presented by either 2 points or 4 points. + The value in boxes should keep original. + Here is an example of the 3 arguments: + ``` + input_ids -> + [[6, 2391, 6, 31833, 6, 10132, 6, 2283, 6, 17730, 6, 2698, 152]] + bboxes_line -> + [[[230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38], + [230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38], [230, 1, 353, 38], + [257, 155, 338, 191], [257, 155, 338, 191], [257, 155, 338, 191], [257, 155, 338, 191], + [257, 155, 338, 191]]] + bboxes_word -> + [[[231, 2, 267, 2, 267, 38, 231, 38], [231, 2, 267, 2, 267, 38, 231, 38], + [264, 7, 298, 7, 298, 36, 264, 36], [264, 7, 298, 7, 298, 36, 264, 36], + [293, 3, 329, 3, 329, 41, 293, 41], [293, 3, 329, 3, 329, 41, 293, 41], + [330, 4, 354, 4, 354, 39, 330, 39], [330, 4, 354, 4, 354, 39, 330, 39], + [258, 156, 289, 156, 289, 193, 258, 193], [258, 156, 289, 156, 289, 193, 258, 193], + [288, 158, 321, 158, 321, 192, 288, 192], [288, 158, 321, 158, 321, 192, 288, 192], + [321, 156, 336, 156, 336, 190, 321, 190]]] + ``` + + """ + + def __init__(self, + max_seq_length, + max_block_num, + img_processor: ImageProcessor, + tokenizer=None, + use_roberta_tokenizer: bool = True, + ocr_utils: OCRUtils = None, + width=768, + height=768, + **kwargs): + self.img_processor = img_processor + self.tokenizer = tokenizer + self.kwargs = kwargs + + self.serializer = TextLayoutSerializer( + max_seq_length, + max_block_num, + tokenizer, + width, + height, + use_roberta_tokenizer=use_roberta_tokenizer, + ocr_utils=ocr_utils) + + def __call__( + self, + images: Union[list, np.ndarray, PIL.Image.Image, str], + ocr_infos: List[List] = None, + token_seqs: dict = None, + sizes_raw: list = None, + ): + img_data = self.img_processor(images) + images = img_data['images'] + ocr_infos = img_data['ocr_infos'] if ocr_infos is None else ocr_infos + sizes_raw = img_data['sizes_raw'] if sizes_raw is None else sizes_raw + if token_seqs is None: + token_seqs = self.serializer(ocr_infos, sizes_raw=sizes_raw) + else: + token_seqs = self.serializer( + None, sizes_raw=sizes_raw, **token_seqs) + assert token_seqs is not None, 'token_seqs must not be NoneType!' 
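+ # Merge the image tensor(s) and the serialized token tensors into a single model-ready batch.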
+ batch = {} + batch['image'] = images + for k, v in token_seqs.items(): + batch[k] = token_seqs[k] + return batch + + def serialize_from_tokens(self, + images, + input_ids, + bboxes_line, + bboxes_word, + sizes_raw=None): + half_batch = {} + half_batch['input_ids'] = input_ids + half_batch['bboxes_line'] = bboxes_line + half_batch['bboxes_word'] = bboxes_word + return self(images, None, half_batch, sizes_raw) diff --git a/modelscope/models/multi_modal/vldoc/tokenization.py b/modelscope/models/multi_modal/vldoc/tokenization.py new file mode 100644 index 00000000..a16c5849 --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/tokenization.py @@ -0,0 +1,89 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os + +from transformers import XLMRobertaTokenizer + +SPIECE_UNDERLINE = '▁' + + +class VLDocXLMTokenizer(XLMRobertaTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`): + The bounding box to use for the special [CLS] token. + sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`): + The bounding box to use for the special [SEP] token. + pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`): + The bounding box to use for the special [PAD] token. + pad_token_label (`int`, *optional*, defaults to -100): + The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's + CrossEntropyLoss. 
+ only_label_first_subword (`bool`, *optional*, defaults to `True`): + Whether or not to only label the first subword, in case word labels are provided. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + model_input_names = ['input_ids', 'attention_mask'] diff --git a/modelscope/models/multi_modal/vldoc/transformer_local.py b/modelscope/models/multi_modal/vldoc/transformer_local.py new file mode 100644 index 00000000..4c0dd55d --- /dev/null +++ b/modelscope/models/multi_modal/vldoc/transformer_local.py @@ -0,0 +1,204 @@ +# The implementation is borrowed and modified from the official PyTorch website and ABINet: +# https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html +# https://github.com/FangShancheng/ABINet/blob/main/modules/transformer.py + +import copy + +import torch.nn as nn +from torch import Tensor +from torch.nn import Dropout, LayerNorm, Linear, Module, ModuleList +from torch.nn import functional as F + + +class TransformerDecoder(Module): + r"""TransformerDecoder is a stack of N decoder layers + + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). + + Examples:: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = transformer_decoder(tgt, memory) + """ + __constants__ = ['norm'] + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + tgt, + memory, + memory2=None, + tgt_mask=None, + memory_mask=None, + memory_mask2=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None, + memory_key_padding_mask2=None): + r"""Pass the inputs (and mask) through the decoder layer in turn. + + Args: + tgt: the sequence to the decoder (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + + Shape: + see the docs in Transformer class. 
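+ Note: `memory2`, `memory_mask2` and `memory_key_padding_mask2` feed the second cross-attention
+ branch and are only used when the decoder layers were constructed with `siamese=True`.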
+ """ + output = tgt + + for mod in self.layers: + output = mod( + output, + memory, + memory2=memory2, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + memory_mask2=memory_mask2, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + memory_key_padding_mask2=memory_key_padding_mask2) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoderLayer(Module): + r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. + + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of intermediate layer, relu or gelu (default=relu). + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + self_attn=True, + siamese=False, + debug=False): + super(TransformerDecoderLayer, self).__init__() + self.has_self_attn, self.siamese = self_attn, siamese + self.debug = debug + if self.has_self_attn: + self.self_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout) + self.norm1 = LayerNorm(d_model) + self.dropout1 = Dropout(dropout) + self.multihead_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model) + + self.norm2 = LayerNorm(d_model) + self.norm3 = LayerNorm(d_model) + self.dropout2 = Dropout(dropout) + self.dropout3 = Dropout(dropout) + if self.siamese: + self.multihead_attn2 = nn.MultiheadAttention( + d_model, nhead, dropout=dropout) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerDecoderLayer, self).__setstate__(state) + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None, + memory2=None, + memory_mask2=None, + memory_key_padding_mask2=None): + r"""Pass the inputs (and mask) through the decoder layer. + + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + + Shape: + see the docs in Transformer class. 
+ """ + if self.has_self_attn: + tgt2, attn = self.self_attn( + tgt, + tgt, + tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask, + ) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + if self.debug: + self.attn = attn + tgt2, attn2 = self.multihead_attn( + tgt, + memory, + memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask) + if self.debug: + self.attn2 = attn2 + + if self.siamese: + tgt3, attn3 = self.multihead_attn2( + tgt, + memory2, + memory2, + attn_mask=memory_mask2, + key_padding_mask=memory_key_padding_mask2) + tgt = tgt + self.dropout2(tgt3) + if self.debug: + self.attn3 = attn3 + + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + + return tgt + + +def _get_clones(module, N): + return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + if activation == 'relu': + return F.relu + elif activation == 'gelu': + return F.gelu + + raise RuntimeError( + 'activation should be relu/gelu, not {}'.format(activation)) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 3a32e44f..0c72a4a0 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -18,9 +18,15 @@ if TYPE_CHECKING: from .csanmt import CsanmtForTranslation from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model from .gpt_neo import GPTNeoModel + from .gpt2 import GPT2Model from .gpt3 import GPT3ForTextGeneration, DistributedGPT3 from .gpt_moe import GPTMoEForTextGeneration, DistributedGPTMoE from .heads import SequenceClassificationHead + from .megatron_bert import ( + MegatronBertConfig, + MegatronBertForMaskedLM, + MegatronBertModel, + ) from .palm_v2 import PalmForTextGeneration from .ponet import PoNetForMaskedLM, PoNetModel, PoNetConfig from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST @@ -54,12 +60,14 @@ if TYPE_CHECKING: VecoForTokenClassification, VecoModel) from .bloom import BloomModel from .unite import UniTEModel + from .use import UserSatisfactionEstimation else: _import_structure = { 'backbones': ['SbertModel'], 'bart': ['BartForTextErrorCorrection'], 'csanmt': ['CsanmtForTranslation'], 'heads': ['SequenceClassificationHead'], + 'gpt2': ['GPT2Model'], 'gpt3': ['GPT3ForTextGeneration', 'DistributedGPT3'], 'gpt_moe': ['GPTMoEForTextGeneration', 'DistributedGPTMoE'], 'structbert': [ @@ -86,6 +94,11 @@ else: 'BertModel', 'BertConfig', ], + 'megatron_bert': [ + 'MegatronBertConfig', + 'MegatronBertForMaskedLM', + 'MegatronBertModel', + ], 'ponet': ['PoNetForMaskedLM', 'PoNetModel', 'PoNetConfig'], 'palm_v2': ['PalmForTextGeneration'], 'deberta_v2': ['DebertaV2ForMaskedLM', 'DebertaV2Model'], @@ -113,7 +126,8 @@ else: ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], - 'unite': ['UniTEModel'] + 'unite': ['UniTEModel'], + 'use': ['UserSatisfactionEstimation'] } import sys diff --git a/modelscope/models/nlp/bart/text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py index ab765190..9ff619f1 100644 --- a/modelscope/models/nlp/bart/text_error_correction.py +++ b/modelscope/models/nlp/bart/text_error_correction.py @@ -78,18 +78,16 @@ class BartForTextErrorCorrection(TorchModel): """ import fairseq.utils - if len(input['net_input']['src_tokens'].size()) == 1: - input['net_input']['src_tokens'] = 
input['net_input'][ - 'src_tokens'].view(1, -1) + batch_size = input['src_tokens'].size(0) + input = {'net_input': input} if torch.cuda.is_available(): input = fairseq.utils.move_to_cuda(input, device=self._device) - sample = input - translations = self.task.inference_step(self.generator, self.models, - sample) - - # get 1-best List[Tensor] - preds = translations[0][0]['tokens'] - return TextErrorCorrectionOutput(predictions=preds) + translations = self.task.inference_step(self.generator, self.models, + input) + batch_preds = [] + for i in range(batch_size): + # get 1-best List[Tensor] + batch_preds.append(translations[i][0]['tokens']) + return TextErrorCorrectionOutput(predictions=batch_preds) diff --git a/modelscope/models/nlp/bert/token_classification.py b/modelscope/models/nlp/bert/token_classification.py index b1b26a37..5fb92302 100644 --- a/modelscope/models/nlp/bert/token_classification.py +++ b/modelscope/models/nlp/bert/token_classification.py @@ -212,16 +212,6 @@ class BertForTokenClassification(BertPreTrainedModel): loss = loss_fct( logits.view(-1, self.num_labels), labels.view(-1)) - if label_mask is not None: - mask = label_mask - masked_lengths = mask.sum(-1).long() - masked_logits = torch.zeros_like(logits) - for i in range(len(mask)): - masked_logits[ - i, :masked_lengths[i], :] = logits[i].masked_select( - mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) - logits = masked_logits - if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output diff --git a/modelscope/models/nlp/gpt2/__init__.py b/modelscope/models/nlp/gpt2/__init__.py new file mode 100644 index 00000000..5c344d43 --- /dev/null +++ b/modelscope/models/nlp/gpt2/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .backbone import GPT2Model +else: + _import_structure = { + 'backbone': ['GPT2Model'], + } + import sys + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/gpt2/backbone.py b/modelscope/models/nlp/gpt2/backbone.py new file mode 100644 index 00000000..8f8f88f6 --- /dev/null +++ b/modelscope/models/nlp/gpt2/backbone.py @@ -0,0 +1,15 @@ +# Copyright (c) Alibaba, Inc. and its affiliates.
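+# A thin wrapper that registers Hugging Face's GPT2Model as a ModelScope backbone; the GPT2Config
+# is built from the keyword arguments passed to the constructor.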
+from transformers import GPT2Config +from transformers import GPT2Model as GPT2ModelTransform + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Tasks + + +@BACKBONES.register_module(group_key=Tasks.backbone, module_name=Models.gpt2) +class GPT2Model(GPT2ModelTransform): + + def __init__(self, **kwargs): + config = GPT2Config(**kwargs) + super().__init__(config) diff --git a/modelscope/models/nlp/gpt3/backbone.py b/modelscope/models/nlp/gpt3/backbone.py index 4647428e..a86f01e4 100644 --- a/modelscope/models/nlp/gpt3/backbone.py +++ b/modelscope/models/nlp/gpt3/backbone.py @@ -23,8 +23,10 @@ from torch import nn from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel +from modelscope.outputs import TokenGeneratorOutput from modelscope.utils.constant import ModelFile from .configuration import GPT3Config +from .distributed_gpt3 import sample class GPT3SelfAttention(nn.Module): @@ -351,5 +353,63 @@ class GPT3Model(PreTrainedModel): model.load_state_dict(state_dict) return model - def prepare_inputs_for_generation(self, input_ids, *args, **kwargs): - return {'input_ids': input_ids} + def generate(self, tokens, temperature=1.0, **kwargs): + + batch_size = tokens.size(0) + lengths = kwargs.pop( + 'prompt_length', + torch.tensor([tokens.size(1)], device=tokens.device)) + + min_prompt_length = lengths.min().item() + max_sequence_length = tokens.size(1) + max_sequence_length = min(max_sequence_length, + self.config.max_position_embeddings) + + # If the context is too big, this happens + if min_prompt_length >= max_sequence_length: + raise ValueError('context length + tokens_to_generate too large') + + # Added termination_id to support the case that we want to terminate the + # generation once that id is generated. + termination_id = self.config.eod_id + + # Whether we have reached a termination id. + is_generation_done = torch.zeros( + batch_size, dtype=torch.uint8, device=tokens.device) + + with torch.no_grad(): + for context_length in range(min_prompt_length, + max_sequence_length): + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, :context_length] + + # logits will be meanigful only in the last pipeline stage. + logits = self(tokens2use).logits + + # Sample. + last_token_logits = logits[:, -1, :] + new_sample = sample( + last_token_logits, + top_k=self.config.top_k, + top_p=self.config.top_p, + temperature=temperature, + vocab_size=self.config.vocab_size) + + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. 
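+ # Only rows whose prompt has already been fully consumed (started == True) receive the
+ # newly sampled token at the current position.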
+ tokens[started, context_length] = new_sample[started] + + done_token = (new_sample == termination_id).byte() & \ + started.byte() + + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) + + if done: + break + + tokens = tokens[:, :(context_length + 1)] + return TokenGeneratorOutput(sequences=tokens) diff --git a/modelscope/models/nlp/gpt3/distributed_gpt3.py b/modelscope/models/nlp/gpt3/distributed_gpt3.py index 424e43b4..e469f866 100644 --- a/modelscope/models/nlp/gpt3/distributed_gpt3.py +++ b/modelscope/models/nlp/gpt3/distributed_gpt3.py @@ -20,26 +20,22 @@ from os import path as osp from typing import Callable, Dict, List, Optional, Union import torch -from megatron import mpu -from megatron.global_vars import get_global_memory_buffer, set_global_variables -from megatron.model import (AttnMaskType, Float16Module, LayerNorm, - bias_gelu_impl) -from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from megatron_util import mpu +from megatron_util.global_vars import get_global_memory_buffer +from megatron_util.model import (AttnMaskType, Float16Module, LayerNorm, + bias_gelu_impl) +from megatron_util.model.fused_softmax import FusedScaleMaskSoftmax from torch import nn from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel from modelscope.fileio import File -from modelscope.metainfo import Models from modelscope.models import TorchModel -from modelscope.models.builder import MODELS from modelscope.models.nlp.gpt3 import GPT3Config from modelscope.outputs import TextGenerationModelOutput, TokenGeneratorOutput from modelscope.utils.checkpoint import weights_to_cpu -from modelscope.utils.constant import Tasks -from modelscope.utils.nlp.distributed import initialize_distributed +from modelscope.utils.megatron_utils import init_megatron_util from modelscope.utils.nlp.load_checkpoint import pre_load -from modelscope.utils.torch_utils import set_random_seed_mpu class GPT3ParallelMLP(nn.Module): @@ -54,8 +50,7 @@ class GPT3ParallelMLP(nn.Module): super().__init__() # Project to 4h. - self.dense_h_to_4h = mpu.ColumnParallelLinearV3( - config, + self.dense_h_to_4h = mpu.ColumnParallelLinear( config.hidden_size, config.ffn_hidden_size, gather_output=False, @@ -66,8 +61,7 @@ class GPT3ParallelMLP(nn.Module): self.activation_func = F.gelu # Project back to h. - self.dense_4h_to_h = mpu.RowParallelLinearV3( - config, + self.dense_4h_to_h = mpu.RowParallelLinear( config.ffn_hidden_size, config.hidden_size, input_is_parallel=True, @@ -198,7 +192,7 @@ class GPT3CoreAttention(nn.Module): projection_size = config.kv_channels * config.num_attention_heads # Per attention head and per partition values. - world_size = mpu.get_model_parallel_world_size() + world_size = mpu.get_tensor_model_parallel_world_size() self.hidden_size_per_partition = mpu.divide(projection_size, world_size) self.hidden_size_per_attention_head = mpu.divide( @@ -324,15 +318,14 @@ class GPT3ParallelAttention(nn.Module): projection_size = config.kv_channels * config.num_attention_heads # Per attention head and per partition values. - world_size = mpu.get_model_parallel_world_size() + world_size = mpu.get_tensor_model_parallel_world_size() self.hidden_size_per_attention_head = mpu.divide( projection_size, config.num_attention_heads) self.num_attention_heads_per_partition = mpu.divide( config.num_attention_heads, world_size) # Strided linear layer. 
- self.query_key_value = mpu.ColumnParallelLinearV3( - config, + self.query_key_value = mpu.ColumnParallelLinear( config.hidden_size, 3 * projection_size, gather_output=False, @@ -341,8 +334,7 @@ class GPT3ParallelAttention(nn.Module): self.core_attention = GPT3CoreAttention(config, self.layer_number) # Output. - self.dense = mpu.RowParallelLinearV3( - config, + self.dense = mpu.RowParallelLinear( projection_size, config.hidden_size, input_is_parallel=True, @@ -801,21 +793,22 @@ class GPT3Model(PreTrainedModel): logits_parallel = mpu.LinearWithGradAccumulationAndAsyncCommunication.apply( lm_output, self.word_embeddings_weight(), None, False, True, self.config.sequence_parallel) - # Gather if needed. - output = logits_parallel - - if labels is None: - output = mpu.gather_from_model_parallel_region(logits_parallel) - # [s b h] => [b s h] - return output.transpose(0, 1).contiguous() - else: + losses = None + if labels is not None: # [b s] => [s b] labels = labels.transpose(0, 1).contiguous() - loss = mpu.vocab_parallel_cross_entropy(output.float(), labels) + losses = mpu.vocab_parallel_cross_entropy( + logits_parallel.clone().float(), labels) # [s b] => [b s] - loss = loss.transpose(0, 1).contiguous() - return loss + losses = losses.transpose(0, 1).contiguous() + + # Gather if needed. + logits = mpu.gather_from_tensor_model_parallel_region(logits_parallel) + # [s b h] => [b s h] + logits = logits.transpose(0, 1).contiguous() + + return logits, losses def modify_logits_for_top_k_filtering(logits, top_k): @@ -858,8 +851,6 @@ def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None): # Check logits for consistency. assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.' - assert logits.type() == 'torch.cuda.FloatTensor', \ - 'input logits should be floats.' # Greedy is just simple argmax. if top_k == 1: @@ -946,7 +937,7 @@ def split_state_dict(state_dict: Dict[str, torch.Tensor], model: GPT3Model, partitions: int) -> Dict[str, torch.Tensor]: if partitions == 1: return state_dict - rank: int = mpu.get_model_parallel_rank() + rank: int = mpu.get_tensor_model_parallel_rank() for name, parameters in model.named_parameters(): if parameters.shape == state_dict[name].shape: continue @@ -957,12 +948,12 @@ def split_state_dict(state_dict: Dict[str, torch.Tensor], model: GPT3Model, return state_dict -def save_checkpoint(model: torch.nn.Module, filename: str) -> None: +def save_checkpoint(model: torch.nn.Module, filename: str, **kwargs) -> None: if isinstance(model, torch.nn.parallel.DistributedDataParallel): model = model.module checkpoint = {'module': weights_to_cpu(model.state_dict())} - mp_rank = mpu.get_model_parallel_rank() + mp_rank = mpu.get_tensor_model_parallel_rank() filename = osp.join( osp.dirname(filename), 'model', 'mp_rank_{:02d}'.format(mp_rank) + '_model_states.pt') @@ -981,12 +972,8 @@ class DistributedGPT3(TorchModel): *args, **kwargs): super().__init__(model_dir, *args, **kwargs) - initialize_distributed(rank, mpu, kwargs['world_size'], - kwargs['model_parallel_size'], - kwargs['master_ip'], kwargs['master_port']) - seed = 0 if 'seed' not in kwargs else kwargs['seed'] - set_random_seed_mpu(seed) - set_global_variables() + + init_megatron_util(model_dir=model_dir, rank=rank) self.config = GPT3Config.from_pretrained(model_dir) # Build model. 
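Both the new GPT3Model.generate above and the DistributedGPT3 decoding below rely on the module-level sample() helper, which applies temperature plus top-k/top-p (nucleus) filtering before drawing a token. A rough, self-contained restatement of that step (an illustration of the general technique, not the exact helper in this diff; the function name filter_and_sample is made up here):

import torch
import torch.nn.functional as F


def filter_and_sample(logits, top_k=0, top_p=0.0, temperature=1.0):
    """Draw one token id per row from [batch, vocab] logits."""
    # top_k == 1 is plain greedy decoding.
    if top_k == 1:
        return torch.argmax(logits, dim=-1)

    logits = logits / temperature  # the division already yields a fresh tensor

    if top_k > 0:
        # Keep only the k largest logits in every row.
        kth_best = torch.topk(logits, top_k, dim=-1).values[..., -1, None]
        logits[logits < kth_best] = float('-inf')

    if top_p > 0.0:
        # Nucleus filtering: cut the tail once cumulative probability exceeds top_p.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_remove = cum_probs > top_p
        # Shift right so the most probable token always survives.
        sorted_remove[..., 1:] = sorted_remove[..., :-1].clone()
        sorted_remove[..., 0] = False
        remove = sorted_remove.scatter(1, sorted_idx, sorted_remove)
        logits[remove] = float('-inf')

    return torch.multinomial(F.softmax(logits, dim=-1), num_samples=1).squeeze(-1)


# e.g. next_ids = filter_and_sample(last_token_logits, top_p=0.9, temperature=0.9)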
@@ -1004,9 +991,9 @@ class DistributedGPT3(TorchModel): self.dist_model = model - tensor_ws = mpu.get_model_parallel_world_size() + tensor_ws = mpu.get_tensor_model_parallel_world_size() ckpt_ws = kwargs.pop('checkpoint_model_parallel_size', tensor_ws) - ckpt_rank = mpu.get_model_parallel_rank() * ckpt_ws // tensor_ws + ckpt_rank = mpu.get_tensor_model_parallel_rank() * ckpt_ws // tensor_ws load_model = pre_load(ckpt_rank, model_dir, tag=path_load_tag) load_model = split_state_dict(load_model, model, tensor_ws // ckpt_ws) @@ -1024,33 +1011,39 @@ class DistributedGPT3(TorchModel): attention_mask=None, position_ids=None, labels=None, - prompt_length=None): - outputs = self.dist_model( + prompt_length=None, + is_pair=(False, )): + + logits, losses = self.dist_model( tokens, attention_mask, position_ids, inference_params=self.inference_params, labels=labels) + + loss = None if labels is None: self.inference_params.sequence_len_offset += tokens.size(1) - return TextGenerationModelOutput(logits=outputs) else: loss_mask = torch.ones( tokens.size(), dtype=torch.float, device=tokens.device) + if is_pair[0]: + for i, length in enumerate(prompt_length): + loss_mask[i, :length] = 0 - losses = outputs.float() + losses = losses.float() loss_mask = loss_mask.view(-1).float() loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - return TextGenerationModelOutput(loss=loss) + return TextGenerationModelOutput(logits=logits, loss=loss) - def generate(self, - tokens, - temperature=1.0, - use_eod_token_for_early_termination=True, - stop_on_double_eol=False, - stop_on_eol=False, - **kwargs): + def sample(self, + tokens, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False, + **kwargs): batch_size = tokens.size(0) lengths = kwargs.pop( 'prompt_length', @@ -1085,76 +1078,253 @@ class DistributedGPT3(TorchModel): # Run infernece # ============= - with torch.no_grad(): - attention_mask, position_ids = \ - GPT3Model.build_attention_mask_and_position_ids(tokens) - prev_context_length = 0 - for context_length in range(min_prompt_length, - max_sequence_length): + attention_mask, position_ids = \ + GPT3Model.build_attention_mask_and_position_ids(tokens) + prev_context_length = 0 + for context_length in range(min_prompt_length, max_sequence_length): - # Pick the slice that we need to pass through the network. - tokens2use = tokens[:, prev_context_length:context_length] - positions2use = position_ids[:, prev_context_length: - context_length] - attention_mask2use = attention_mask[ - ..., prev_context_length:context_length, :context_length] + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:context_length] + positions2use = position_ids[:, prev_context_length:context_length] + attention_mask2use = attention_mask[ + ..., prev_context_length:context_length, :context_length] - # logits will be meanigful only in the last pipeline stage. - logits = self(tokens2use, attention_mask2use, - positions2use).logits + # logits will be meanigful only in the last pipeline stage. + logits = self(tokens2use, attention_mask2use, positions2use).logits - # Sample. - last_token_logits = logits[:, -1, :] - new_sample = sample( - last_token_logits, - top_k=self.config.top_k, - top_p=self.config.top_p, - temperature=temperature, - vocab_size=self.config.vocab_size) + # Sample. 
+ last_token_logits = logits[:, -1, :] + new_sample = sample( + last_token_logits, + top_k=self.config.top_k, + top_p=self.config.top_p, + temperature=temperature, + vocab_size=self.config.vocab_size) - # If a prompt length is smaller or equal th current context - # length, it means we have started generating tokens - started = lengths <= context_length - # Update the tokens. - tokens[started, context_length] = new_sample[started] + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. + tokens[started, context_length] = new_sample[started] - # Update the context length for the next token generation. - prev_context_length = context_length + # Update the context length for the next token generation. + prev_context_length = context_length - # instead tokenization should be in the inference loop so stop sequences can be used - if stop_on_double_eol: - hit_double_eol = (new_sample - == 628).byte() & started.byte() - hit_two_eols = (new_sample == 198).byte() & ( - tokens[:, context_length - 1] - == 198).byte() & started.byte() - done_token = hit_double_eol | hit_two_eols - elif stop_on_eol: - hit_double_eol = (new_sample - == 628).byte() & started.byte() - hit_eol = (new_sample == 198).byte() & started.byte() - done_token = hit_double_eol | hit_eol - else: - done_token = (new_sample == termination_id).byte() & \ - started.byte() + # instead tokenization should be in the inference loop so stop sequences can be used + if stop_on_double_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_two_eols = (new_sample == 198).byte() & ( + tokens[:, + context_length - 1] == 198).byte() & started.byte() + done_token = hit_double_eol | hit_two_eols + elif stop_on_eol: + hit_double_eol = (new_sample == 628).byte() & started.byte() + hit_eol = (new_sample == 198).byte() & started.byte() + done_token = hit_double_eol | hit_eol + else: + done_token = (new_sample == termination_id).byte() & \ + started.byte() - is_generation_done = is_generation_done | done_token - done = torch.all(is_generation_done) + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) - if use_eod_token_for_early_termination and done: - break + if use_eod_token_for_early_termination and done: + break tokens = tokens[:, :(context_length + 1)] return TokenGeneratorOutput(sequences=tokens) - def state_dict(self): - return self.dist_model.state_dict() + def beam_search(self, tokens, beam_size=5, num_return_gen=1, **kwargs): + batch_size = tokens.size(0) + assert (batch_size == 1) + prompt_length = kwargs.pop( + 'prompt_length', + torch.tensor([tokens.size(1)], device=tokens.device)).item() + stop_token = self.config.eod_id + pads = torch.ones( + 1, self.config.tokens_to_generate, + device=tokens.device).long() * stop_token + tokens = torch.cat((tokens, pads), dim=-1) + final_sequence_length = tokens.size(1) + final_sequence_length = min(final_sequence_length, + self.config.max_position_embeddings) + + # If the context is too big, this happens + if prompt_length >= final_sequence_length: + raise ValueError('context length + tokens_to_generate too large') + + # Initialize inference parameters. 
+ self.inference_params = InferenceParams(beam_size, + final_sequence_length) + + beam_hyp = BeamHypotheses(beam_size) + done = False + scores = torch.zeros( + beam_size, dtype=torch.float32, + device=torch.cuda.current_device()).unsqueeze(1) + + # ============= + # Run infernece + # ============= + tokens = tokens.repeat(beam_size, 1) + attention_mask, position_ids = \ + GPT3Model.build_attention_mask_and_position_ids(tokens) + prev_context_length = 0 + for context_length in range(prompt_length, final_sequence_length): + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:context_length] + positions2use = position_ids[:, prev_context_length:context_length] + attention_mask2use = attention_mask[ + ..., prev_context_length:context_length, :context_length] + + # logits will be meanigful only in the last pipeline stage. + logits = self(tokens2use, attention_mask2use, positions2use).logits + + vocab_size = logits.size(2) + log_probs = F.log_softmax(logits, dim=2) + new_scores = log_probs[:, -1, :] + scores + + if context_length == prompt_length: # if this is the first one + sorted_scores, indices = torch.sort( + new_scores[0, :], descending=True) + else: + sorted_scores, indices = torch.sort( + new_scores.view(-1), descending=True) + + best_beam_ids = torch.div(indices[:2 * beam_size], + vocab_size).trunc().long() + best_words = indices[:2 * beam_size] % vocab_size + best_scores = sorted_scores[:2 * beam_size] + + next_beams = [] + for beam_token_rank, (token_id, beam_score, beam_id) in enumerate( + zip(best_words, best_scores, best_beam_ids)): + if token_id.item() == stop_token: + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= beam_size + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add(tokens[beam_id].clone(), beam_score, + context_length + 1 - prompt_length) + else: + # add next predicted token since it is not eos_token + next_beams.append((token_id, beam_score, beam_id)) + + if len(next_beams) == beam_size: + break + + if beam_hyp.is_done(best_scores.max().item(), + context_length + 1 - prompt_length): + done = True + break + + best_batches = tokens.new([item[2] for item in next_beams]) + tokens = tokens[best_batches, :] + tokens[:, context_length] = tokens.new( + [item[0] for item in next_beams]) + scores = scores.new([item[1] for item in next_beams]).unsqueeze(1) + + # set inference key values to make it consistent with best beam index + self.inference_params.swap_key_value_dict(best_batches) + + # Update the context length for the next token generation. 
+ prev_context_length = context_length + + # if cannot find stop token, add open beams to hyps + if not done: + for beam_id in range(beam_size): + beam_hyp.add(tokens[beam_id].clone(), scores[beam_id], + context_length + 1 - prompt_length) + + # rank based on scores + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True) + num_return_gen = min(num_return_gen, len(sorted_hyps)) + scores = [sorted_hyps[i][0] for i in range(num_return_gen)] + tokens = [sorted_hyps[i][1] for i in range(num_return_gen)] + scores = torch.stack(scores, dim=0) + tokens = torch.stack(tokens, dim=0) + + return TokenGeneratorOutput(sequences=tokens, scores=scores) + + @torch.no_grad() + def generate(self, tokens, do_sample=True, *args, **kwargs): + if do_sample: + return self.sample(tokens, *args, **kwargs) + else: + return self.beam_search(tokens, *args, **kwargs) + + def state_dict(self, destination=None, prefix='', keep_vars=False): + return self.dist_model.state_dict(destination, prefix, keep_vars) def save_pretrained(self, target_folder: Union[str, os.PathLike], save_checkpoint_names: Union[str, List[str]] = None, - save_function: Callable = save_checkpoint, + save_function: Callable = None, config: Optional[dict] = None, **kwargs): + # DistributedPipeline type is different from task name + config['pipeline']['type'] = 'gpt3-generation' + # a temp fix for master_ip, master_port and rank + # can be removed after refactoring megatron_util + for unused_key in ('master_ip', 'master_port', 'rank'): + config['model'].pop(unused_key, None) + return super().save_pretrained(target_folder, save_checkpoint_names, - save_function, config, **kwargs) + save_checkpoint, config, **kwargs) + + +class BeamHypotheses: + + def __init__(self, + num_beams: int, + length_penalty: float = 1.0, + early_stopping: bool = False): + """ + Initialize n-best list of hypotheses. + """ + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, + hyp: torch.LongTensor, + sum_logprobs: float, + beam_indices: Optional[torch.LongTensor] = None): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / (hyp.shape[-1]**self.length_penalty) + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp, beam_indices)) + if len(self) > self.num_beams: + sorted_next_scores = sorted([ + (s, idx) for idx, (s, _, _) in enumerate(self.beams) + ]) + del self.beams[sorted_next_scores[0][1]] + self.worst_score = sorted_next_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: + """ + If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst + one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len**self.length_penalty + ret = self.worst_score >= cur_score + return ret diff --git a/modelscope/models/nlp/gpt3/text_generation.py b/modelscope/models/nlp/gpt3/text_generation.py index 74335de6..27ce09d6 100644 --- a/modelscope/models/nlp/gpt3/text_generation.py +++ b/modelscope/models/nlp/gpt3/text_generation.py @@ -1,7 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
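The BeamHypotheses container added above keeps the n-best finished beams under a length-penalised score and lets beam_search stop early through is_done(). The toy snippet below only restates that scoring rule with made-up numbers (the token ids and log-prob sums are purely illustrative):

import torch

length_penalty = 1.0
# (token ids, summed log-probs) of two hypothetical finished beams.
finished = [
    (torch.tensor([7592, 2088, 102]), -3.0),
    (torch.tensor([7592, 1010, 2088, 102]), -3.6),
]
# Same rule as BeamHypotheses.add(): score = sum_logprobs / len(hyp) ** length_penalty.
scored = [(logprobs / hyp.shape[-1]**length_penalty, hyp)
          for hyp, logprobs in finished]
worst_kept = min(score for score, _ in scored)

# Same test as is_done(): stop once even the best open beam, rescored at the
# current length, can no longer beat the worst score already kept.
best_open_sum_logprobs, cur_len = -2.5, 4
done = worst_kept >= best_open_sum_logprobs / cur_len**length_penalty
print(sorted(scored, key=lambda item: item[0], reverse=True)[0][0], done)

Because the summed log-probs are negative, raising length_penalty above 1.0 makes longer beams score relatively better, which is the usual way to discourage premature termination.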
-import os from typing import Dict +import torch from transformers import BertTokenizer from modelscope.metainfo import Models @@ -27,7 +27,7 @@ class GPT3ForTextGeneration(TorchModel): # Temporarily compatible with DistributedGPT3 and GPT3Model, # the base/large model based on GPT3Model will be replaced in the future, # and GPT3Model will be deprecated - if 'model_parallel_size' in kwargs: + if 'world_size' in kwargs: from modelscope.models.nlp import DistributedGPT3 self.model = DistributedGPT3(model_dir, **kwargs) else: @@ -49,24 +49,19 @@ class GPT3ForTextGeneration(TorchModel): """ return self.model(**input) - def generate(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + def generate(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: if not isinstance(self.model, GPT3Model): - return self.model.generate(**input) + return self.model.generate(**inputs) - assert 'input_ids' in input, "generate function must accept 'input_ids' key" - input_ids = input['input_ids'] - if 'attention_mask' in input: - attention_mask = input['attention_mask'] - input_ids = input_ids[0][attention_mask[0].nonzero()] \ - .squeeze().unsqueeze(0) - # remove sep token at the end of tokenizer output - input_ids = input_ids[:, :-1] + tokens = inputs['input_ids'] + lengths = self._get_length(inputs['attention_mask']) + return self.model.generate(tokens, prompt_length=lengths) - gen_params = dict() - gen_params['inputs'] = input_ids - gen_params['do_sample'] = input.pop('do_sample', True) - gen_params['max_length'] = input.pop('max_length', 128) - gen_params['top_k'] = input.pop('top_k', 10) - gen_params['top_p'] = input.pop('top_p', None) - sample_output = self.model.generate(**gen_params) - return {'sequences': sample_output[0]} + @staticmethod + def _get_length(attention_mask: torch.Tensor) -> Tensor: + return attention_mask.sum(-1) - 1 + + def save_pretrained(self, *args, **kwargs): + if not isinstance(self.model, GPT3Model): + return self.model.save_pretrained(*args, **kwargs) + return super().save_pretrained(*args, **kwargs) diff --git a/modelscope/models/nlp/gpt_moe/checkpointing.py b/modelscope/models/nlp/gpt_moe/checkpointing.py index 68b66e97..d5980e8a 100644 --- a/modelscope/models/nlp/gpt_moe/checkpointing.py +++ b/modelscope/models/nlp/gpt_moe/checkpointing.py @@ -16,29 +16,15 @@ import os import torch -from megatron import mpu -from megatron.model import Float16Module +from megatron_util import mpu +from megatron_util.model import Float16Module +from megatron_util.utils import unwrap_model from torch.nn.parallel import DistributedDataParallel as torchDDP from .configuration import logger from .moe.layer import MoE -def unwrap_model(model, module_instances=(torchDDP)): - return_list = True - if not isinstance(model, list): - model = [model] - return_list = False - unwrapped_model = [] - for model_module in model: - while isinstance(model_module, module_instances): - model_module = model_module.module - unwrapped_model.append(model_module) - if not return_list: - return unwrapped_model[0] - return unwrapped_model - - def get_checkpoint_names(checkpoints_path, path_load_tag, num_experts, @@ -46,7 +32,7 @@ def get_checkpoint_names(checkpoints_path, expp_rank=None): """Determine the directory name for this rank's checkpoint.""" if tensor_rank is None: - tensor_rank = mpu.get_model_parallel_rank() + tensor_rank = mpu.get_tensor_model_parallel_rank() common_path = os.path.join(checkpoints_path, path_load_tag, f'mp_rank_{tensor_rank:02d}') @@ -64,7 +50,7 @@ def get_checkpoint_names(checkpoints_path, def 
_get_expert_ckpt_name(checkpoints_path, layer_id, expert_id): - mp_rank = mpu.get_model_parallel_rank() + mp_rank = mpu.get_tensor_model_parallel_rank() ckpt_name = os.path.join( os.path.join(checkpoints_path, 'model'), f'layer_{layer_id}_expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt' diff --git a/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py b/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py index 31ca48f9..e8b36aca 100644 --- a/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py +++ b/modelscope/models/nlp/gpt_moe/distributed_gpt_moe.py @@ -16,11 +16,11 @@ import math import torch -from megatron import mpu -from megatron.global_vars import get_global_memory_buffer, set_global_variables -from megatron.model import (AttnMaskType, Float16Module, LayerNorm, - bias_gelu_impl) -from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from megatron_util import mpu +from megatron_util.global_vars import get_global_memory_buffer +from megatron_util.model import (AttnMaskType, Float16Module, LayerNorm, + bias_gelu_impl) +from megatron_util.model.fused_softmax import FusedScaleMaskSoftmax from torch import nn from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel @@ -28,8 +28,7 @@ from transformers.modeling_utils import PreTrainedModel from modelscope.models import TorchModel from modelscope.models.nlp.gpt_moe import GPTMoEConfig from modelscope.outputs import TextGenerationModelOutput, TokenGeneratorOutput -from modelscope.utils.nlp.distributed import initialize_distributed -from modelscope.utils.torch_utils import set_random_seed_mpu +from modelscope.utils.megatron_utils import init_megatron_util from .checkpointing import load_checkpoint from .moe.layer import MoE @@ -44,8 +43,7 @@ class GPTMoEParallelMLP(nn.Module): enable_expert_tensor_parallelism=False): super().__init__() # Project to 4h. - self.dense_h_to_4h = mpu.ColumnParallelLinearV3( - config, + self.dense_h_to_4h = mpu.ColumnParallelLinear( config.hidden_size, config.ffn_hidden_size, gather_output=False, @@ -57,8 +55,7 @@ class GPTMoEParallelMLP(nn.Module): self.bias_gelu_fusion = config.bias_gelu_fusion self.activation_func = F.gelu # Project back to h. - self.dense_4h_to_h = mpu.RowParallelLinearV3( - config, + self.dense_4h_to_h = mpu.RowParallelLinear( config.ffn_hidden_size, config.hidden_size, input_is_parallel=True, @@ -219,7 +216,7 @@ class GPTMoECoreAttention(nn.Module): projection_size = config.kv_channels * config.num_attention_heads # Per attention head and per partition values. - world_size = mpu.get_model_parallel_world_size() + world_size = mpu.get_tensor_model_parallel_world_size() self.hidden_size_per_partition = mpu.divide(projection_size, world_size) self.hidden_size_per_attention_head = mpu.divide( @@ -345,15 +342,14 @@ class GPTMoEParallelAttention(nn.Module): projection_size = config.kv_channels * config.num_attention_heads # Per attention head and per partition values. - world_size = mpu.get_model_parallel_world_size() + world_size = mpu.get_tensor_model_parallel_world_size() self.hidden_size_per_attention_head = mpu.divide( projection_size, config.num_attention_heads) self.num_attention_heads_per_partition = mpu.divide( config.num_attention_heads, world_size) # Strided linear layer. 
- self.query_key_value = mpu.ColumnParallelLinearV3( - config, + self.query_key_value = mpu.ColumnParallelLinear( config.hidden_size, 3 * projection_size, gather_output=False, @@ -362,8 +358,7 @@ class GPTMoEParallelAttention(nn.Module): self.core_attention = GPTMoECoreAttention(config, self.layer_number) # Output. - self.dense = mpu.RowParallelLinearV3( - config, + self.dense = mpu.RowParallelLinear( projection_size, config.hidden_size, input_is_parallel=True, @@ -1114,19 +1109,14 @@ class DistributedGPTMoE(TorchModel): *args, **kwargs): super().__init__(model_dir, *args, **kwargs) - initialize_distributed(rank, mpu, kwargs['world_size'], - kwargs['model_parallel_size'], - kwargs['master_ip'], kwargs['master_port']) + + init_megatron_util(model_dir=model_dir, rank=rank) self.config = GPTMoEConfig.from_pretrained(model_dir) if self.config.num_experts[0] > 0: mpu.create_expert_and_data_parallel( self.config.moe_expert_parallel_size) - seed = 0 if 'seed' not in kwargs else kwargs['seed'] - set_random_seed_mpu(seed) - set_global_variables() - # Build model. model = GPTMoEModel(self.config) diff --git a/modelscope/models/nlp/gpt_moe/moe/layer.py b/modelscope/models/nlp/gpt_moe/moe/layer.py index 99767bb6..10650f00 100644 --- a/modelscope/models/nlp/gpt_moe/moe/layer.py +++ b/modelscope/models/nlp/gpt_moe/moe/layer.py @@ -5,7 +5,7 @@ Copyright 2020 The Microsoft DeepSpeed Team import typing import torch -from megatron import mpu +from megatron_util import mpu from .experts import Experts from .sharded_moe import MOELayer, TopKGate diff --git a/modelscope/models/nlp/gpt_moe/moe/mappings.py b/modelscope/models/nlp/gpt_moe/moe/mappings.py index a3fb85f7..011c4c2a 100644 --- a/modelscope/models/nlp/gpt_moe/moe/mappings.py +++ b/modelscope/models/nlp/gpt_moe/moe/mappings.py @@ -3,7 +3,7 @@ Copyright 2020 The Microsoft DeepSpeed Team ''' import torch -from megatron import mpu +from megatron_util import mpu def _gather_tokens(input_, dim=0): @@ -15,7 +15,7 @@ def _gather_tokens(input_, dim=0): tensor_list = [ torch.empty_like(input_) - for _ in range(mpu.get_model_parallel_world_size()) + for _ in range(mpu.get_tensor_model_parallel_world_size()) ] tensor_list[rank] = input_ torch.distributed.all_gather( @@ -29,8 +29,8 @@ def _gather_tokens(input_, dim=0): def _drop_tokens(input_, dim=0): """Divide a tensor among the tensor parallel ranks""" - total_chunks = mpu.get_model_parallel_world_size() - this_chunk = mpu.get_model_parallel_rank() + total_chunks = mpu.get_tensor_model_parallel_world_size() + this_chunk = mpu.get_tensor_model_parallel_rank() assert input_.shape[ dim] % total_chunks == 0, f'input dimension {dim} ({input_.shape[dim]}) ' \ f'is not divisible by tensor parallel world size ({total_chunks})' @@ -74,14 +74,14 @@ class _DropTokens(torch.autograd.Function): def gather_tokens(input_, dim=0): - if mpu is None or mpu.get_model_parallel_world_size() == 1: + if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: # no tensor parallelism for non-experts return input_ return _GatherTokens.apply(input_, dim) def drop_tokens(input_, dim=0): - if mpu is None or mpu.get_model_parallel_world_size() == 1: + if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: # no tensor parallelism for non-experts return input_ return _DropTokens.apply(input_, dim) diff --git a/modelscope/models/nlp/gpt_moe/moe/sharded_moe.py b/modelscope/models/nlp/gpt_moe/moe/sharded_moe.py index 86c591c9..f8eed00f 100644 --- a/modelscope/models/nlp/gpt_moe/moe/sharded_moe.py +++ 
b/modelscope/models/nlp/gpt_moe/moe/sharded_moe.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple import torch import torch.distributed as dist import torch.nn.functional as F -from megatron import mpu +from megatron_util import mpu from scipy.special import binom from torch import Tensor, nn from torch.nn import Module diff --git a/modelscope/models/nlp/megatron_bert/__init__.py b/modelscope/models/nlp/megatron_bert/__init__.py new file mode 100644 index 00000000..c39609e7 --- /dev/null +++ b/modelscope/models/nlp/megatron_bert/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .configuration import MegatronBertConfig + from .backbone import MegatronBertModel + from .fill_mask import MegatronBertForMaskedLM +else: + _import_structure = { + 'configuration': ['MegatronBertConfig'], + 'backbone': ['MegatronBertModel'], + 'distributed_plug': ['MegatronBertForMaskedLM'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/megatron_bert/backbone.py b/modelscope/models/nlp/megatron_bert/backbone.py new file mode 100644 index 00000000..56bea4ae --- /dev/null +++ b/modelscope/models/nlp/megatron_bert/backbone.py @@ -0,0 +1,897 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
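For the drop_tokens/gather_tokens pair in moe/mappings.py above, the single-process sketch below imitates what the tensor-parallel split and all_gather achieve; world_size, rank and dim are stand-ins for the values normally obtained from mpu.get_tensor_model_parallel_world_size() and mpu.get_tensor_model_parallel_rank():

import torch

world_size, rank, dim = 4, 1, 0
tokens = torch.arange(8 * 3).reshape(8, 3)  # [num_tokens, hidden]
# Same divisibility check as _drop_tokens.
assert tokens.shape[dim] % world_size == 0
# "Drop": each rank keeps only its own slice along `dim`.
local_chunk = torch.chunk(tokens, world_size, dim=dim)[rank]
# "Gather": concatenating every rank's slice (what all_gather rebuilds) restores the input.
restored = torch.cat(
    [torch.chunk(tokens, world_size, dim=dim)[r] for r in range(world_size)],
    dim=dim)
assert torch.equal(restored, tokens)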
+""" PyTorch MegatronBERT model.""" + +import math + +import torch +import torch.utils.checkpoint +from torch import nn +from transformers.activations import ACT2FN +from transformers.modeling_utils import (PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer) + +from modelscope.metainfo import Models +from modelscope.models import Model, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionBackboneModelOutput +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.nlp.utils import parse_labels_in_order +from .configuration import MegatronBertConfig + +logger = get_logger() + +_CONFIG_FOR_DOC = 'MegatronBertConfig' + + +class MegatronBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + + # In Megatron, layer-norm is applied after the 1st dropout. + # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + 'position_ids', + torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, + past_key_values_length:seq_length + + past_key_values_length] + + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == 'absolute': + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + # Megatron BERT moves that layer norm after the drop-out (and to each layer). 
+ # embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->MegatronBert +class MegatronBertSelfAttention(nn.Module): + + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr( + config, 'embedding_size'): + raise ValueError( + f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size + / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, 'position_embedding_type', 'absolute') + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +# Based transformers.models.bert.modeling_bert.BertSelfOutput. Moved LayerNorm to MegatronBertAttention below. +class MegatronBertSelfOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, residual): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return residual + hidden_states + + +# Based transformers.models.bert.modeling_bert.BertAttention. Added LayerNorm. 
+class MegatronBertAttention(nn.Module): + + def __init__(self, config): + super().__init__() + self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.self = MegatronBertSelfAttention(config) + self.output = MegatronBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, + self.self.attention_head_size, self.pruned_heads) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len( + heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + ln_outputs = self.ln(hidden_states) + self_outputs = self.self( + ln_outputs, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output, + ) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->MegatronBert +class MegatronBertIntermediate(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertOutput. Moved LayerNorm to MegatronBertLayer below. +class MegatronBertOutput(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, + input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return input_tensor + hidden_states + + +# Based on transformers.models.bert.modeling_bert.BertLayer. Added LayerNorm. 
+class MegatronBertLayer(nn.Module): + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = MegatronBertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise TypeError( + f'{self} should be used as a decoder model if cross attention is added' + ) + self.crossattention = MegatronBertAttention(config) + self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.intermediate = MegatronBertIntermediate(config) + self.output = MegatronBertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[: + 2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[ + 1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, 'crossattention'): + raise AttributeError( + f'If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers' + ' by setting `config.add_cross_attention=True`') + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[ + -2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[ + 1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value, ) + + return outputs + + def feed_forward_chunk(self, attention_output): + ln_output = self.ln(attention_output) + intermediate_output = self.intermediate(ln_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class MegatronBertEncoder(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([ + MegatronBertLayer(config) for _ in range(config.num_hidden_layers) + ]) + + # The final layer norm. 
We removed the 1st LN, moved LN to each hidden layer and this one + # is simply the final LN (Transformer's BERT has it attached to each hidden layer). + self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = ( + ) if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[ + i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + def create_custom_forward(module): + + def custom_forward(*inputs): + return module(*inputs, past_key_value, + output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + # Because we moved the layer-norm at the end of the hidden layer, we have non-normali- + # zed data here. If that's really needed, we must apply LN to match Transformer's BERT. + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1], ) + if output_attentions: + all_self_attentions = all_self_attentions + ( + layer_outputs[1], ) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + ( + layer_outputs[2], ) + + # Finalize the hidden states. + hidden_states = self.ln(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states, ) + + if not return_dict: + return tuple(v for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] if v is not None) + return AttentionBackboneModelOutput( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->MegatronBert +class MegatronBertPooler(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class MegatronBertPreTrainedModel(TorchModel, PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MegatronBertConfig + base_model_prefix = 'bert' + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r'position_ids'] + + def __init__(self, config, **kwargs): + super().__init__(config.name_or_path, **kwargs) + super(Model, self).__init__(config) + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_( + mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MegatronBertEncoder): + module.gradient_checkpointing = value + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + Args: + kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. + num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + + Returns: + The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + + model_dir = kwargs.pop('model_dir', None) + cfg = kwargs.pop('cfg', None) + model_args = parse_labels_in_order(model_dir, cfg, **kwargs) + if model_dir is None: + config = MegatronBertConfig(**model_args) + model = cls(config) + else: + model = super(Model, cls).from_pretrained( + pretrained_model_name_or_path=model_dir, **model_args) + model.model_dir = model_dir + return model + + +@MODELS.register_module( + group_key=Tasks.backbone, module_name=Models.megatron_bert) +class MegatronBertModel(MegatronBertPreTrainedModel): + """The bare MegatronBert Model transformer outputting raw hidden-states without any specific head on top.", + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MegatronBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = MegatronBertEmbeddings(config) + self.encoder = MegatronBertEncoder(config) + + self.pooler = MegatronBertPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs) -> AttentionBackboneModelOutput: + r""" + Args: + input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] + for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `((batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask + values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the + inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position + embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, + num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask + values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, + *optional*): + Optionally, instead of passing `input_ids` you can choose to + directly pass an embedded representation. This is useful if you want + more control over how to convert `input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention + layers. See `attentions` under returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See + `hidden_states` under returned tensors for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a + plain tuple. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, + sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the + encoder. Used in the cross-attention if the model is configured as a + decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, + sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of + the encoder input. This mask is used in the cross-attention if the + model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length + `config.n_layers` with each tuple having 4 tensors of shape + `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention + blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only + the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead + of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned + and can be used to speed up decoding (see `past_key_values`). + Others (**kwargs): + Additional parameters that might be passed in from an upstream pipeline; + they do not influence the results.
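As a concrete illustration of the arguments documented above, a minimal sketch of an encoder-only forward pass with a deliberately tiny, randomly initialized configuration (no tokenizer; the only point is the shape of the inputs and outputs). The import paths are assumed from this package's layout.

```python
# Minimal encoder-only forward pass; configuration values are illustrative.
import torch

from modelscope.models.nlp.megatron_bert.backbone import MegatronBertModel
from modelscope.models.nlp.megatron_bert.configuration import MegatronBertConfig

config = MegatronBertConfig(vocab_size=1000, hidden_size=64, num_hidden_layers=2,
                            num_attention_heads=4, intermediate_size=128)
model = MegatronBertModel(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (2, 16))  # (batch_size, seq_len)
attention_mask = torch.ones(2, 16, dtype=torch.long)      # 1 = attend, 0 = padding

with torch.no_grad():
    out = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)

print(out.last_hidden_state.shape)  # torch.Size([2, 16, 64])
print(out.pooler_output.shape)      # torch.Size([2, 64])
```

In a real pipeline the `input_ids` and `attention_mask` would come from a tokenizer or ModelScope preprocessor rather than random integers.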
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else + self.config.output_hidden_states) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + 'You cannot specify both input_ids and inputs_embeds at the same time' + ) + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + 'You have to specify either input_ids or inputs_embeds') + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[ + 2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), + device=device) + if token_type_ids is None: + token_type_ids = torch.zeros( + input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size( + ) + encoder_hidden_shape = (encoder_batch_size, + encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones( + encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask( + encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, + self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler( + sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return AttentionBackboneModelOutput( + 
last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) diff --git a/modelscope/models/nlp/megatron_bert/configuration.py b/modelscope/models/nlp/megatron_bert/configuration.py new file mode 100644 index 00000000..951fd7d1 --- /dev/null +++ b/modelscope/models/nlp/megatron_bert/configuration.py @@ -0,0 +1,126 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MEGATRON_BERT model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig + +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class MegatronBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MegatronBertModel`]. It is used to instantiate a + MEGATRON_BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT + [nvidia/megatron-bert-uncased-345m](https://huggingface.co/nvidia/megatron-bert-uncased-345m) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 29056): + Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`MegatronBertModel`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. 
+ max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`MegatronBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + + Examples: + + ```python + >>> from transformers import MegatronBertConfig, MegatronBertModel + + >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration + >>> configuration = MegatronBertConfig() + + >>> # Initializing a model (with random weights) from the bert-base-uncased style configuration + >>> model = MegatronBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = 'megatron-bert' + + def __init__(self, + vocab_size=29056, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type='absolute', + use_cache=True, + **kwargs): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache diff --git a/modelscope/models/nlp/megatron_bert/fill_mask.py b/modelscope/models/nlp/megatron_bert/fill_mask.py new file mode 100644 index 00000000..2aa51d3d --- /dev/null +++ b/modelscope/models/nlp/megatron_bert/fill_mask.py @@ -0,0 +1,285 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn import CrossEntropyLoss +from transformers.activations import ACT2FN + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.outputs import AttentionFillMaskModelOutput +from modelscope.utils import logger as logging +from modelscope.utils.constant import Tasks +from .backbone import MegatronBertModel, MegatronBertPreTrainedModel +from .configuration import MegatronBertConfig + +logger = logging.get_logger() + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->MegatronBert +class MegatronBertPredictionHeadTransform(nn.Module): + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->MegatronBert +class MegatronBertLMPredictionHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.transform = MegatronBertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear( + config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->MegatronBert +class MegatronBertOnlyMLMHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = MegatronBertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->MegatronBert +class MegatronBertOnlyNSPHead(nn.Module): + + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->MegatronBert +class MegatronBertPreTrainingHeads(nn.Module): + + def __init__(self, config): + super().__init__() + self.predictions = MegatronBertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@MODELS.register_module(Tasks.fill_mask, module_name=Models.megatron_bert) +class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r'pooler', r'seq_relationship'] + _keys_to_ignore_on_load_missing = [ + r'position_ids', r'predictions.decoder.bias' + ] + + def __init__(self, config: MegatronBertConfig, **kwargs): + super().__init__(config) + + if config.is_decoder: + logger.warning( + 'If you want to use `MegatronBertForMaskedLM` make sure `config.is_decoder=False` for ' + 'bi-directional self-attention.') + + self.bert = MegatronBertModel(config, add_pooling_layer=False) + self.cls = MegatronBertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~modelscope.models.nlp.structbert.SbertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? 
<../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, + *optional*): + Labels for computing the masked language modeling loss. 
Indices + should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` + docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., + config.vocab_size]` + + Returns: + Returns `modelscope.outputs.AttentionFillMaskModelOutput` + + Examples: + >>> from modelscope.models import Model + >>> from modelscope.preprocessors import Preprocessor + >>> model = Model.from_pretrained('damo/nlp_megatronbert_backbone_base_std') + >>> preprocessor = Preprocessor.from_pretrained('damo/nlp_megatronbert_backbone_base_std') + >>> print(model(**preprocessor(('This is a test', 'This is also a test')))) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1)) + + if not return_dict: + output = (prediction_scores, ) + outputs[2:] + return ((masked_lm_loss, ) + + output) if masked_lm_loss is not None else output + + return AttentionFillMaskModelOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + input_ids=input_ids, + ) + + def prepare_inputs_for_generation(self, + input_ids, + attention_mask=None, + **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError('The PAD token should be defined for generation') + attention_mask_new_zeros = attention_mask.new_zeros( + (attention_mask.shape[0], 1)) + attention_mask = torch.cat([attention_mask, attention_mask_new_zeros], + dim=-1) + dummy_token = torch.full((effective_batch_size, 1), + self.config.pad_token_id, + dtype=torch.long, + device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {'input_ids': input_ids, 'attention_mask': attention_mask} diff --git a/modelscope/models/nlp/mglm/blocklm_utils.py b/modelscope/models/nlp/mglm/blocklm_utils.py index 9af83f67..b05cd2c2 100644 --- a/modelscope/models/nlp/mglm/blocklm_utils.py +++ b/modelscope/models/nlp/mglm/blocklm_utils.py @@ -7,11 +7,9 @@ import random import numpy as np import torch import torch.utils.data +from megatron_util import mpu, print_rank_0 from scipy.stats import poisson -from . import mpu -from .utils import print_rank_0 - def rindex(lst, val, start=None): if start is None: diff --git a/modelscope/models/nlp/mglm/configure_data.py b/modelscope/models/nlp/mglm/configure_data.py index 6921de08..2761fec1 100644 --- a/modelscope/models/nlp/mglm/configure_data.py +++ b/modelscope/models/nlp/mglm/configure_data.py @@ -22,11 +22,11 @@ from itertools import accumulate import numpy as np import torch import torch.utils.data +from megatron_util import mpu, print_rank_0 -from . import data_utils, mpu +from . 
import data_utils from .blocklm_utils import ConstructBlockStrategy from .data_utils.tokenization import make_tokenizer -from .utils import print_rank_0 class MultiTaskDataset(torch.utils.data.Dataset): diff --git a/modelscope/models/nlp/mglm/fp16/__init__.py b/modelscope/models/nlp/mglm/fp16/__init__.py deleted file mode 100644 index 90d20bcf..00000000 --- a/modelscope/models/nlp/mglm/fp16/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .fp16 import * # noqa -from .fp16util import (BN_convert_float, FP16Model, clip_grad_norm, - convert_module, convert_network, - master_params_to_model_params, - model_grads_to_master_grads, network_to_half, - prep_param_lists, to_python_float, tofp16) -from .loss_scaler import * # noqa diff --git a/modelscope/models/nlp/mglm/fp16/fp16.py b/modelscope/models/nlp/mglm/fp16/fp16.py deleted file mode 100755 index 10fbd804..00000000 --- a/modelscope/models/nlp/mglm/fp16/fp16.py +++ /dev/null @@ -1,660 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Stable version of apex FP16 Optimizer""" -import torch -from torch import nn -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from torch.autograd import Variable -from torch.nn.parameter import Parameter - -from .fp16util import (clip_grad_norm, master_params_to_model_params, - model_grads_to_master_grads) -from .loss_scaler import DynamicLossScaler, LossScaler - -FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) -HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) - - -def conversion_helper(val, conversion): - """Apply conversion to val. 
Recursively apply conversion if `val` is a nested tuple/list structure.""" - if not isinstance(val, (tuple, list)): - return conversion(val) - rtn = [conversion_helper(v, conversion) for v in val] - if isinstance(val, tuple): - rtn = tuple(rtn) - return rtn - - -def fp32_to_fp16(val): - """Convert fp32 `val` to fp16""" - - def half_conversion(val): - val_typecheck = val - if isinstance(val_typecheck, (Parameter, Variable)): - val_typecheck = val.data - if isinstance(val_typecheck, FLOAT_TYPES): - val = val.half() - return val - - return conversion_helper(val, half_conversion) - - -def fp16_to_fp32(val): - """Convert fp16 `val` to fp32""" - - def float_conversion(val): - val_typecheck = val - if isinstance(val_typecheck, (Parameter, Variable)): - val_typecheck = val.data - if isinstance(val_typecheck, HALF_TYPES): - val = val.float() - return val - - return conversion_helper(val, float_conversion) - - -class FP16_Module(nn.Module): - - def __init__(self, module): - super(FP16_Module, self).__init__() - self.add_module('module', module.half()) - - def forward(self, *inputs, **kwargs): - return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) - - def named_parameters(self, prefix: str = '', recurse: bool = True): - return self.module.named_parameters(prefix=prefix, recurse=recurse) - - def state_dict(self, destination=None, prefix='', keep_vars=False): - return self.module.state_dict(destination, prefix, keep_vars) - - def load_state_dict(self, state_dict, strict=True): - return self.module.load_state_dict(state_dict, strict=strict) - - -# TODO: Update overflow check + downscale to use Carl's fused kernel. -class FP16_Optimizer(object): - """ - :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, - and manage static or dynamic loss scaling and master weights in a manner transparent to the user. - For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, - and changing the call to ``backward``. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - # Name the FP16_Optimizer instance to replace the existing optimizer - # (recommended but not required): - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - # loss.backward() becomes: - optimizer.backward(loss) - ... - - Example with dynamic loss scaling:: - - ... - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - # optional arg to control dynamic loss scaling behavior - # dynamic_loss_args={'scale_window' : 500}) - # Usually, dynamic_loss_args is not necessary. - - Args: - init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. - static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. - dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. 
- dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. - verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. - - ``init_optimizer`` is expected to have been constructed in the ordinary way. - It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be - named to replace ``init_optimizer``, for two reasons: - First, it means that references to the same name - later in the file will not have to change. - Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to - modify ``init_optimizer``. If you do choose a unique name for the new - :class:`FP16_Optimizer` instance, you should only work with this new instance, - because the preexisting optimizer might no longer behave as expected. - - ``init_optimizer`` may be any Pytorch optimizer. - It may contain a mixture of fp16 and fp32 parameters organized into any number of - ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will - ingest these ``param_groups`` and remember them. - - Calls to :: - - loss.backward() - - must be replaced with :: - - optimizer.backward(loss) - - because :class:`FP16_Optimizer` requires ownership of the backward pass to implement - loss scaling and copies to master gradients. - - .. note:: - Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients - are downscaled before being applied. This means that adjusting the loss scale, or using - dynamic loss scaling, should not require retuning the learning rate or any other - hyperparameters. - - - **Advanced options** - - **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. - See docstring for :attr:`step`. - - **Gradient clipping**: Use :attr:`clip_master_grads`. - - **Multiple losses**: If your model accumulates gradients from multiple losses, - this can be made more efficient by supplying ``update_master_grads=False`` - to :attr:`backward`. See docstring for :attr:`backward`. - - **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: - - print(optimizer.loss_scale) - optimizer.loss_scale = new_loss_scale - - For static loss scaling, manually adjusting the loss scale over time is a reasonable - thing to do. During later epochs, gradients may become smaller, and a - higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss - scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting - the loss scale is not recommended. - - **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in - Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` - should still work as intended. 
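Pulling this docstring's pieces together, a hedged sketch of the intended training step with dynamic loss scaling and master-gradient clipping. `FP16_Optimizer` is the class defined in this file, the toy model and data are placeholders, and a CUDA device is assumed.

```python
# Usage pattern described above: wrap the optimizer, replace loss.backward()
# with optimizer.backward(loss), optionally clip the fp32 master gradients.
import torch

model = torch.nn.Linear(1024, 1024).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

# Placeholder in-memory "data loader".
loader = [(torch.randn(8, 1024), torch.randn(8, 1024)) for _ in range(2)]

for inputs, targets in loader:
    optimizer.zero_grad()
    output = model(inputs.cuda().half())
    loss = torch.nn.functional.mse_loss(output.float(), targets.cuda().float())
    optimizer.backward(loss)          # replaces loss.backward(); applies the loss scale
    optimizer.clip_master_grads(1.0)  # clips fp32 master grads; returns -1 on overflow
    optimizer.step()                  # skipped internally if the fp16 grads overflowed
```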
- """ # noqa - - def __init__(self, - init_optimizer, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=False): - if not torch.cuda.is_available: - raise SystemError('Cannot use fp16 without CUDA.') - - self.verbose = verbose - - self.optimizer = init_optimizer - # init_state_dict sets up an alternative way to cast per-param state tensors. - # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. - # init_state_dict = init_optimizer.state_dict() - - self.fp16_groups = [] - self.fp32_from_fp16_groups = [] - self.fp32_from_fp32_groups = [] - for i, param_group in enumerate(self.optimizer.param_groups): - self.maybe_print( - 'FP16_Optimizer processing param group {}:'.format(i)) - fp16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_fp16_params_this_group = [] - for i, param in enumerate(param_group['params']): - if param.requires_grad: - if param.type() == 'torch.cuda.HalfTensor': - self.maybe_print( - 'FP16_Optimizer received torch.cuda.HalfTensor with {}' - .format(param.size())) - fp16_params_this_group.append(param) - master_param = param.detach().clone().float() - master_param.requires_grad = True - # Copythe model parallel flag. - master_param.model_parallel = param.model_parallel - param_group['params'][i] = master_param - fp32_from_fp16_params_this_group.append(master_param) - # Reset existing state dict key to the new master param. - # We still need to recast per-param state tensors, if any, to FP32. - if param in self.optimizer.state: - self.optimizer.state[ - master_param] = self.optimizer.state.pop(param) - elif param.type() == 'torch.cuda.FloatTensor': - self.maybe_print( - 'FP16_Optimizer received torch.cuda.FloatTensor with {}' - .format(param.size())) - fp32_params_this_group.append(param) - param_group['params'][i] = param - else: - raise TypeError( - 'Wrapped parameters must be either ' - 'torch.cuda.FloatTensor or torch.cuda.HalfTensor. ' - 'Received {}'.format(param.type())) - - self.fp16_groups.append(fp16_params_this_group) - self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) - self.fp32_from_fp32_groups.append(fp32_params_this_group) - - # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors - self.optimizer.load_state_dict(self.optimizer.state_dict()) - # alternative way to cast per-param state tensors: - # self.optimizer.load_state_dict(init_state_dict) - - if dynamic_loss_scale: - self.dynamic_loss_scale = True - if dynamic_loss_args is not None: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - else: - self.loss_scaler = DynamicLossScaler() - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(static_loss_scale) - - self.overflow = False - self.first_closure_call_this_step = True - - self.clip_grad_norm = clip_grad_norm - - def maybe_print(self, msg): - if self.verbose: - print(msg) - - def __getstate__(self): - raise RuntimeError( - 'FP16_Optimizer should be serialized using state_dict().') - - def __setstate__(self, state): - raise RuntimeError( - 'FP16_Optimizer should be deserialized using load_state_dict().') - - def zero_grad(self, set_grads_to_None=False): - """ - Zero fp32 and fp16 parameter grads. - """ - # In principle, only the .grad attributes of the model params need to be zeroed, - # because gradients are copied into the FP32 master params. 
However, we zero - # all gradients owned by the optimizer, just to be safe: - for group in self.optimizer.param_groups: - for p in group['params']: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - # Zero fp16 gradients owned by the model: - for fp16_group in self.fp16_groups: - for param in fp16_group: - if set_grads_to_None: - param.grad = None - else: - if param.grad is not None: - param.grad.detach_( - ) # as in torch.optim.optimizer.zero_grad() - param.grad.zero_() - - def _check_overflow(self): - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - for group in self.fp32_from_fp32_groups: - for param in group: - params.append(param) - self.overflow = self.loss_scaler.has_overflow(params) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - def _master_params_to_model_params(self): - for fp16_group, fp32_from_fp16_group in zip( - self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp16_group, fp32_from_fp16_group) - - def _model_params_to_master_params(self): - for fp16_group, fp32_from_fp16_group in zip( - self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp32_from_fp16_group, fp16_group) - - # To consider: Integrate distributed with this wrapper by registering a hook on each variable - # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. - def _model_grads_to_master_grads(self): - for fp16_group, fp32_from_fp16_group in zip( - self.fp16_groups, self.fp32_from_fp16_groups): - model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) - - def _downscale_master(self): - if self.loss_scale != 1.0: - for group in self.optimizer.param_groups: - for param in group['params']: - if param.grad is not None: - param.grad.data.mul_(1. / self.loss_scale) - - def clip_master_grads(self, max_norm, norm_type=2): - """ - Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. - - Args: - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the current fp32 gradients (viewed as a single vector). - - .. warning:: - Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). - """ # noqa - if not self.overflow: - fp32_params = [] - for param_group in self.optimizer.param_groups: - for param in param_group['params']: - fp32_params.append(param) - return self.clip_grad_norm(fp32_params, max_norm, norm_type) - else: - return -1 - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
- Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict[ - 'first_closure_call_this_step'] = self.first_closure_call_this_step - state_dict['optimizer_state_dict'] = self.optimizer.state_dict() - state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups - return state_dict - - def load_state_dict(self, state_dict): - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict['loss_scaler'] - self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] - self.overflow = state_dict['overflow'] - self.first_closure_call_this_step = state_dict[ - 'first_closure_call_this_step'] - self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 2. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - for current_group, saved_group in zip(self.fp32_from_fp16_groups, - state_dict['fp32_from_fp16']): - for current, saved in zip(current_group, saved_group): - current.data.copy_(saved.data) - - def step(self, closure=None): # could add clip option. - """ - If no closure is supplied, :attr:`step` should be called after - ``fp16_optimizer_obj.backward(loss)``. - :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to - :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params - originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run - another forward pass using their model. - - If a closure is supplied, :attr:`step` may be called without a prior call to - :attr:`backward(loss)`. - This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
- However, the user should take care that any ``loss.backward()`` call within the closure - has been replaced by ``fp16_optimizer_obj.backward(loss)``. - - Args: - closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. - - Example with closure:: - - # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an - # existing pytorch optimizer. - for input, target in dataset: - def closure(): - optimizer.zero_grad() - output = model(input) - loss = loss_fn(output, target) - # loss.backward() becomes: - optimizer.backward(loss) - return loss - optimizer.step(closure) - - .. warning:: - Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. - - .. _`ordinary Pytorch optimizer use`: - http://pytorch.org/docs/master/optim.html#optimizer-step-closure - """ # noqa - - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - - if self.overflow: - self.maybe_print( - 'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}' - .format(scale, self.loss_scale)) - return - - if closure is not None: - retval = self._step_with_closure(closure) - else: - retval = self.optimizer.step() - - self._master_params_to_model_params() - - return retval - - def _step_with_closure(self, closure): - - def wrapped_closure(): - # helpful for debugging - # print("Calling wrapped_closure, first_closure_call_this_step = {}" - # .format(self.first_closure_call_this_step)) - if self.first_closure_call_this_step: - # We expect that the fp16 params are initially fresh on entering self.step(), - # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() - # is called within self.optimizer.step(). - self.first_closure_call_this_step = False - else: - # If self.optimizer.step() internally calls wrapped_closure more than once, - # it may update the fp32 params after each call. However, self.optimizer - # doesn't know about the fp16 params at all. If the fp32 params get updated, - # we can't rely on self.optimizer to refresh the fp16 params. We need - # to handle that manually: - self._master_params_to_model_params() - # Our API expects the user to give us ownership of the backward() call by - # replacing all calls to loss.backward() with optimizer.backward(loss). - # This requirement holds whether or not the call to backward() is made within a closure. - # If the user is properly calling optimizer.backward(loss) within "closure," - # calling closure() here will give the fp32 master params fresh gradients - # for the optimizer to play with, so all wrapped_closure needs to do is call - # closure() and return the loss. - temp_loss = closure() - while (self.overflow): - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - self.maybe_print( - 'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, ' - 'reducing to {}'.format(scale, self.loss_scale)) - temp_loss = closure() - return temp_loss - - retval = self.optimizer.step(wrapped_closure) - - self.first_closure_call_this_step = True - - return retval - - def backward(self, loss, update_master_grads=True, retain_graph=False): - """ - :attr:`backward` performs the following conceptual steps: - - 1. fp32_loss = loss.float() (see first Note below) - 2. scaled_loss = fp32_loss*loss_scale - 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). - 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. - 5. Finally, master grads are divided by loss_scale. - - In this way, after :attr:`backward`, the master params have fresh gradients, - and :attr:`step` may be called. - - .. note:: - :attr:`backward` internally converts the loss to fp32 before applying the loss scale. - This provides some additional safety against overflow if the user has supplied an - fp16 loss value. - However, for maximum overflow safety, the user should - compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to - :attr:`backward`. - - .. warning:: - The gradients found in a model's leaves after the call to - :attr:`backward` should not be regarded as valid in general, - because it's possible - they have been scaled (and in the case of dynamic loss scaling, - the scale factor may change over time). - If the user wants to inspect gradients after a call to :attr:`backward`, - only the master gradients should be regarded as valid. These can be retrieved via - :attr:`inspect_master_grad_data()`. - - Args: - loss: The loss output by the user's model. loss may be either float or half (but see first Note above). - update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. - retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). - - Example:: - - # Ordinary operation: - optimizer.backward(loss) - - # Naive operation with multiple losses (technically valid, but less efficient): - # fp32 grads will be correct after the second call, but - # the first call incurs an unnecessary fp16->fp32 grad copy. - optimizer.backward(loss1) - optimizer.backward(loss2) - - # More efficient way to handle multiple losses: - # The fp16->fp32 grad copy is delayed until fp16 grads from all - # losses have been accumulated. - optimizer.backward(loss1, update_master_grads=False) - optimizer.backward(loss2, update_master_grads=False) - optimizer.update_master_grads() - """ # noqa - # To consider: try multiple backward passes using retain_grad=True to find - # a loss scale that works. After you find a loss scale that works, do a final dummy - # backward pass with retain_graph=False to tear down the graph. Doing this would avoid - # discarding the iteration, but probably wouldn't improve overall efficiency. - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - if update_master_grads: - self.update_master_grads() - - def update_master_grads(self): - """ - Copy the ``.grad`` attribute from stored references to fp16 parameters to - the ``.grad`` attribute of the fp32 master parameters that are directly - updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if - ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. - """ # noqa - if self.dynamic_loss_scale: - self._check_overflow() - if self.overflow: return # noqa - self._model_grads_to_master_grads() - self._downscale_master() - - def inspect_master_grad_data(self): - """ - When running with :class:`FP16_Optimizer`, - ``.grad`` attributes of a model's fp16 leaves should not be - regarded as truthful, because they might be scaled. - After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, - the fp32 master params' ``.grad`` - attributes will contain valid gradients properly divided by the loss scale. However, - because :class:`FP16_Optimizer` flattens some parameters, accessing them may be - nonintuitive. :attr:`inspect_master_grad_data` - allows those gradients to be viewed with shapes corresponding to their associated model leaves. - - Returns: - List of lists (one list for each parameter group). The list for each parameter group - is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. - """ - if self.overflow: - print( - 'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. ' - 'Gradients are currently invalid (may be inf, nan, or stale). Returning None.' - ) - return None - else: - # The optimizer owns only references to master params. - master_grads_data = [] - for param_group in self.optimizer.param_groups: - master_grads_this_group = [] - for param in param_group['params']: - if param.grad is not None: - master_grads_this_group.append(param.grad.data) - else: - master_grads_this_group.append(None) - master_grads_data.append(master_grads_this_group) - return master_grads_data - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) diff --git a/modelscope/models/nlp/mglm/fp16/fp16util.py b/modelscope/models/nlp/mglm/fp16/fp16util.py deleted file mode 100644 index 3fcd3005..00000000 --- a/modelscope/models/nlp/mglm/fp16/fp16util.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import torch.nn as nn -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from torch.autograd import Variable - -from modelscope.models.nlp.mglm import mpu - - -class tofp16(nn.Module): - """ - Utility module that implements:: - - def forward(self, input): - return input.half() - """ - - def __init__(self): - super(tofp16, self).__init__() - - def forward(self, input): - return input.half() - - -def BN_convert_float(module): - """ - Utility function for network_to_half(). - - Retained for legacy purposes. - """ - if isinstance( - module, - torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - module.float() - for child in module.children(): - BN_convert_float(child) - return module - - -def network_to_half(network): - """ - Convert model to half precision in a batchnorm-safe way. - - Retained for legacy purposes. It is recommended to use FP16Model. - """ - return nn.Sequential(tofp16(), BN_convert_float(network.half())) - - -def convert_module(module, dtype): - """ - Converts a module's immediate parameters and buffers to dtype. - """ - for param in module.parameters(recurse=False): - if param is not None: - if param.data.dtype.is_floating_point: - param.data = param.data.to(dtype=dtype) - if param._grad is not None and param._grad.data.dtype.is_floating_point: - param._grad.data = param._grad.data.to(dtype=dtype) - - for buf in module.buffers(recurse=False): - if buf is not None and buf.data.dtype.is_floating_point: - buf.data = buf.data.to(dtype=dtype) - - -def convert_network(network, dtype): - """ - Converts a network's parameters and buffers to dtype. - """ - for module in network.modules(): - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm - ) and module.affine is True: - continue - convert_module(module, dtype) - return network - - -class FP16Model(nn.Module): - """ - Convert model to half precision in a batchnorm-safe way. - """ - - def __init__(self, network): - super(FP16Model, self).__init__() - self.network = convert_network(network, dtype=torch.half) - - def forward(self, *inputs): - inputs = tuple(t.half() for t in inputs) - return self.network(*inputs) - - -def backwards_debug_hook(grad): - raise RuntimeError( - 'master_params recieved a gradient in the backward pass!') - - -def prep_param_lists(model, flat_master=False): - """ - Creates a list of FP32 master parameters for a given model, as in - `Training Neural Networks with Mixed Precision: Real Examples`_. - - Args: - model (torch.nn.Module): Existing Pytorch model - flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. - Returns: - A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. - - Example:: - - model_params, master_params = prep_param_lists(model) - - .. warning:: - Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. - - .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: - http://on-demand.gputechconf.com/gtc/2018/video/S81012/ - """ # noqa - model_params = [ - param for param in model.parameters() if param.requires_grad - ] - - if flat_master: - # Give the user some more useful error messages - try: - # flatten_dense_tensors returns a contiguous flat array. - # http://pytorch.org/docs/master/_modules/torch/_utils.html - master_params = _flatten_dense_tensors( - [param.data for param in model_params]).float() - except: # noqa - print( - 'Error in prep_param_lists: model may contain a mixture of parameters ' - 'of different types. Use flat_master=False, or use F16_Optimizer.' - ) - raise - master_params = torch.nn.Parameter(master_params) - master_params.requires_grad = True - # master_params.register_hook(backwards_debug_hook) - if master_params.grad is None: - master_params.grad = master_params.new(*master_params.size()) - return model_params, [master_params] - else: - master_params = [ - param.clone().float().detach() for param in model_params - ] - for param in master_params: - param.requires_grad = True - return model_params, master_params - - -def model_grads_to_master_grads(model_params, - master_params, - flat_master=False): - """ - Copy model gradients to master gradients. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. - """ # noqa - if flat_master: - # The flattening may incur one more deep copy than is necessary. - master_params[0].grad.data.copy_( - _flatten_dense_tensors([p.grad.data for p in model_params])) - else: - for model, master in zip(model_params, master_params): - if model.grad is not None: - if master.grad is None: - master.grad = Variable( - master.data.new(*master.data.size())) - master.grad.data.copy_(model.grad.data) - else: - master.grad = None - - -def master_params_to_model_params(model_params, - master_params, - flat_master=False): - """ - Copy master parameters to model parameters. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. - """ # noqa - if flat_master: - for model, master in zip( - model_params, - _unflatten_dense_tensors(master_params[0].data, model_params)): - model.data.copy_(master) - else: - for model, master in zip(model_params, master_params): - model.data.copy_(master.data) - - -# Backward compatibility fixes - - -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) - -clip_grad_norm = mpu.clip_grad_norm diff --git a/modelscope/models/nlp/mglm/fp16/loss_scaler.py b/modelscope/models/nlp/mglm/fp16/loss_scaler.py deleted file mode 100755 index 721571b3..00000000 --- a/modelscope/models/nlp/mglm/fp16/loss_scaler.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
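An illustrative sketch, not from this repository: how helpers like the ones deleted above (prep_param_lists, model_grads_to_master_grads, master_params_to_model_params) are conventionally combined into one mixed-precision step. Simplified non-flat versions of the helpers are restated so the snippet is self-contained; the fixed loss scale and the assumption that the optimizer was built over the fp32 master parameters are placeholders for illustration.

import torch


def prep_param_lists(model):
    # Simplified non-flat variant: fp32 master copies of the trainable fp16 params.
    model_params = [p for p in model.parameters() if p.requires_grad]
    master_params = [p.clone().float().detach() for p in model_params]
    for p in master_params:
        p.requires_grad = True
    return model_params, master_params


def model_grads_to_master_grads(model_params, master_params):
    # Copy fp16 gradients onto the fp32 master parameters.
    for model_p, master_p in zip(model_params, master_params):
        if model_p.grad is None:
            master_p.grad = None
        else:
            if master_p.grad is None:
                master_p.grad = torch.empty_like(master_p)
            master_p.grad.data.copy_(model_p.grad.data)


def master_params_to_model_params(model_params, master_params):
    # Copy the updated fp32 master weights back into the fp16 model.
    for model_p, master_p in zip(model_params, master_params):
        model_p.data.copy_(master_p.data)


def fp16_step(model, model_params, master_params, optimizer, loss,
              loss_scale=128.0):
    # One step with a static loss scale; `optimizer` is assumed to hold the
    # fp32 master parameters, not the fp16 model parameters.
    model.zero_grad()
    (loss * loss_scale).backward()                 # scaled backward in fp16
    model_grads_to_master_grads(model_params, master_params)
    for p in master_params:                        # unscale in fp32
        if p.grad is not None:
            p.grad.data.mul_(1.0 / loss_scale)
    optimizer.step()                               # update the fp32 masters
    master_params_to_model_params(model_params, master_params)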
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from modelscope.models.nlp.mglm import mpu - - -# item() is a recent addition, so this helps with backward compatibility. -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -class LossScaler: - """ - Class that manages a static loss scale. This class is intended to interact with - :class:`FP16_Optimizer`, and should not be directly manipulated by the user. - - Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to - :class:`FP16_Optimizer`'s constructor. - - Args: - scale (float, optional, default=1.0): The loss scale. - """ - - def __init__(self, scale=1): - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -class DynamicLossScaler: - """ - Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` - indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of - :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` - operates, because the default options can be changed using the - the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. - - Loss scaling is designed to combat the problem of underflowing gradients encountered at long - times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss - scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are - encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has - occurred. - :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, - and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients detected, - :class:`DynamicLossScaler` increases the loss scale once more. - In this way :class:`DynamicLossScaler` attempts to "ride the edge" of - always using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` - scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. - scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
- """ # noqa - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000, - min_scale=1, - delayed_shift=1, - consecutive_hysteresis=False): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - self.min_scale = min_scale - self.delayed_shift = delayed_shift - self.cur_hysteresis = delayed_shift - self.consecutive_hysteresis = consecutive_hysteresis - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params): - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan( - p.grad.data): - return True - - return False - - def has_overflow(self, params): - overflow = self.has_overflow_serial(params) - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce( - overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if 'value cannot be converted' not in instance.args[0]: - raise - return True - else: - if cpu_sum == float( - 'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether the gradient overflowed - def update_scale(self, overflow): - - if not hasattr(self, 'min_scale'): - self.min_scale = 1 - if not hasattr(self, 'delayed_shift'): - self.delayed_shift = 1 - if not hasattr(self, 'cur_hysteresis'): - self.cur_hysteresis = 1 - if not hasattr(self, 'consecutive_hysteresis'): - self.consecutive_hysteresis = True - if overflow: - # self.cur_scale /= self.scale_factor - if self.delayed_shift == 1 or self.cur_hysteresis == 1: - self.cur_scale = max(self.cur_scale / self.scale_factor, - self.min_scale) - else: - self.cur_hysteresis -= 1 - self.last_overflow_iter = self.cur_iter - else: - if self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - if (self.cur_iter - - self.last_overflow_iter) % self.scale_window == 0: - if not self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -""" -TO-DO separate out into an example. 
-if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. - N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. - x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. / loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - -""" diff --git a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py index 2df11d6c..079cfd46 100644 --- a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py +++ b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py @@ -8,6 +8,7 @@ from typing import Dict import numpy as np import torch import torch.nn.functional as F +from megatron_util import mpu from modelscope.metainfo import Models from modelscope.models.base import Tensor, TorchModel @@ -15,7 +16,7 @@ from modelscope.models.builder import MODELS from modelscope.outputs import OutputKeys from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks -from . import mpu +from modelscope.utils.megatron_utils import init_megatron_util from .arguments import get_args from .generation_utils import BeamSearchScorer from .train_utils import get_model @@ -62,16 +63,6 @@ def setup_model(args): return model -def set_random_seed(seed): - """Set random seed for reproducability.""" - - if seed is not None and seed > 0: - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - mpu.model_parallel_cuda_manual_seed(seed) - - def get_masks_and_position_ids(data, eod_token, reset_position_ids, @@ -142,36 +133,6 @@ def get_masks_and_position_ids(data, return attention_mask, loss_mask, position_ids -def initialize_distributed(args): - """Initialize torch.distributed.""" - - # Manually set the device ids. 
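An illustrative sketch, not from this repository: the core update rule of the DynamicLossScaler deleted above, stripped of the hysteresis options so the back-off / recovery dynamics are easy to follow. The overflow pattern in the demo is invented.

class SimpleDynamicScale:

    def __init__(self, init_scale=2.0**16, scale_factor=2.0,
                 scale_window=1000, min_scale=1.0):
        self.cur_scale = init_scale
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1

    def update_scale(self, overflow):
        if overflow:
            # The step was skipped, so back the scale off (but not below min_scale).
            self.cur_scale = max(self.cur_scale / self.scale_factor,
                                 self.min_scale)
            self.last_overflow_iter = self.cur_iter
        elif (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
            # A full window without overflow: probe a higher scale again.
            self.cur_scale *= self.scale_factor
        self.cur_iter += 1


if __name__ == '__main__':
    scaler = SimpleDynamicScale(init_scale=2.0**8, scale_window=4)
    for step, overflow in enumerate(
            [False, False, True, False, False, False, False, False]):
        scaler.update_scale(overflow)
        print(f'step {step}: overflow={overflow} scale={scaler.cur_scale}')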
- device = args.rank % torch.cuda.device_count() - if args.local_rank is not None: - device = args.local_rank - torch.cuda.set_device(device) - # Call the init process - init_method = 'tcp://' - args.master_ip = os.getenv('MASTER_ADDR', 'localhost') - args.master_port = os.getenv('MASTER_PORT', '6000') - init_method += args.master_ip + ':' + args.master_port - torch.distributed.init_process_group( - backend=args.distributed_backend, - world_size=args.world_size, - rank=args.rank, - init_method=init_method) - - # Set the model-parallel / data-parallel communicators. - mpu.initialize_model_parallel(args.model_parallel_size) - - # Optional DeepSpeed Activation Checkpointing Features - # - if hasattr( - args, 'deepspeed' - ) and args.deepspeed and args.deepspeed_activation_checkpointing: - set_deepspeed_activation_checkpointing(args) - - def get_batch(context_tokens, device, args): tokens = context_tokens tokens = tokens.view(args.batch_size, -1).contiguous() @@ -398,13 +359,12 @@ class MGLMForTextSummarization(TorchModel): # Arguments. self.args = setup_args(get_args()) self.args.load_pretrained = model_dir - # Pytorch distributed. + try: - initialize_distributed(self.args) - except (RuntimeError): - print('group process initialized twice') - # Random seeds for reproducability. - set_random_seed(self.args.seed) + init_megatron_util(model_dir=model_dir) + except AssertionError: + print('megatron initialized twice') + # setting default batch size to 1 self.args.batch_size = 1 self.args.tokenizer_path = model_dir diff --git a/modelscope/models/nlp/mglm/model/distributed.py b/modelscope/models/nlp/mglm/model/distributed.py index a3c84e9f..328721dc 100755 --- a/modelscope/models/nlp/mglm/model/distributed.py +++ b/modelscope/models/nlp/mglm/model/distributed.py @@ -14,13 +14,12 @@ import torch import torch.distributed as dist +from megatron_util import mpu from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.autograd import Variable from torch.nn.modules import Module from torch.nn.parallel.distributed import DistributedDataParallel as DDP -from modelscope.models.nlp.mglm import mpu - class PyTorchDistributedDataParallel(DDP): diff --git a/modelscope/models/nlp/mglm/model/modeling_bert.py b/modelscope/models/nlp/mglm/model/modeling_bert.py index 965f82a7..63f21224 100644 --- a/modelscope/models/nlp/mglm/model/modeling_bert.py +++ b/modelscope/models/nlp/mglm/model/modeling_bert.py @@ -26,10 +26,10 @@ import tarfile import tempfile import json -import mpu import torch import torch.nn.functional as F from data_utils.file_utils import cached_path +from megatron_util import mpu from torch import nn from torch.nn import CrossEntropyLoss diff --git a/modelscope/models/nlp/mglm/model/modeling_glm.py b/modelscope/models/nlp/mglm/model/modeling_glm.py index 80f61cef..4f21943a 100644 --- a/modelscope/models/nlp/mglm/model/modeling_glm.py +++ b/modelscope/models/nlp/mglm/model/modeling_glm.py @@ -17,10 +17,10 @@ import torch import torch.nn as nn import torch.nn.functional as F +from megatron_util import mpu, print_rank_0 -from modelscope.models.nlp.mglm import mpu from modelscope.models.nlp.mglm.model.prompt import PromptSpell -from modelscope.models.nlp.mglm.utils import print_rank_0 +from .transformer import GPT2ParallelTransformer def init_method_normal(std=0.02): @@ -78,7 +78,7 @@ class GLMModel(torch.nn.Module): vocab_size, hidden_size, init_method=init_method) # Transformer - self.transformer = mpu.GPT2ParallelTransformer( + self.transformer = GPT2ParallelTransformer( 
num_layers, hidden_size, num_attention_heads, @@ -181,11 +181,11 @@ class EncoderDecoder(torch.nn.Module): vocab_size, hidden_size, init_method=init_method) # Transformer - self.encoder = mpu.GPT2ParallelTransformer( + self.encoder = GPT2ParallelTransformer( num_layers, hidden_size, num_attention_heads, max_sequence_length, max_memory_length, embedding_dropout_prob, attention_dropout_prob, output_dropout_prob, checkpoint_activations, checkpoint_num_layers) - self.decoder = mpu.GPT2ParallelTransformer( + self.decoder = GPT2ParallelTransformer( num_layers, hidden_size, num_attention_heads, diff --git a/modelscope/models/nlp/mglm/mpu/transformer.py b/modelscope/models/nlp/mglm/model/transformer.py old mode 100755 new mode 100644 similarity index 93% rename from modelscope/models/nlp/mglm/mpu/transformer.py rename to modelscope/models/nlp/mglm/model/transformer.py index c12b2e10..da944c76 --- a/modelscope/models/nlp/mglm/mpu/transformer.py +++ b/modelscope/models/nlp/mglm/model/transformer.py @@ -19,12 +19,7 @@ import deepspeed import torch import torch.nn.init as init from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm - -from .initialize import get_model_parallel_world_size -from .layers import ColumnParallelLinear, RowParallelLinear -from .mappings import gather_from_model_parallel_region -from .random import checkpoint, get_cuda_rng_tracker -from .utils import divide, split_tensor_along_last_dim +from megatron_util import mpu class PositionalEmbedding(torch.nn.Module): @@ -63,19 +58,19 @@ class ParallelCrossAttention(torch.nn.Module): if output_layer_init_method is None: output_layer_init_method = init_method # Per attention head and per partition values. - world_size = get_model_parallel_world_size() - self.hidden_size_per_partition = divide(hidden_size, world_size) - self.hidden_size_per_attention_head = divide(hidden_size, - num_attention_heads) - self.num_attention_heads_per_partition = divide( + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_partition = mpu.divide(hidden_size, world_size) + self.hidden_size_per_attention_head = mpu.divide( + hidden_size, num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( num_attention_heads, world_size) # Strided linear layer. - self.query = ColumnParallelLinear( + self.query = mpu.ColumnParallelLinear( hidden_size, hidden_size, gather_output=False, init_method=init_method) - self.key_value = ColumnParallelLinear( + self.key_value = mpu.ColumnParallelLinear( hidden_size, 2 * hidden_size, stride=2, @@ -87,7 +82,7 @@ class ParallelCrossAttention(torch.nn.Module): self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) # Output. - self.dense = RowParallelLinear( + self.dense = mpu.RowParallelLinear( hidden_size, hidden_size, input_is_parallel=True, @@ -95,9 +90,8 @@ class ParallelCrossAttention(torch.nn.Module): self.output_dropout = torch.nn.Dropout(output_dropout_prob) if deepspeed.checkpointing.is_configured(): - global get_cuda_rng_tracker, checkpoint - get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - checkpoint = deepspeed.checkpointing.checkpoint + mpu.get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + mpu.checkpoint = deepspeed.checkpointing.checkpoint def _transpose_for_scores(self, tensor): """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with @@ -116,8 +110,8 @@ class ParallelCrossAttention(torch.nn.Module): # Attention heads. 
[b, s, hp] mixed_query_layer = self.query(hidden_states) mixed_x_layer = self.key_value(encoder_states) - (mixed_key_layer, - mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 2) + (mixed_key_layer, mixed_value_layer) = mpu.split_tensor_along_last_dim( + mixed_x_layer, 2) # Reshape and transpose [b, np, s, hn] query_layer = self._transpose_for_scores(mixed_query_layer) @@ -137,7 +131,7 @@ class ParallelCrossAttention(torch.nn.Module): attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - with get_cuda_rng_tracker().fork(): + with mpu.get_cuda_rng_tracker().fork(): attention_probs = self.attention_dropout(attention_probs) # Context layer. @@ -200,23 +194,23 @@ class ParallelSelfAttention(torch.nn.Module): if output_layer_init_method is None: output_layer_init_method = init_method # Per attention head and per partition values. - world_size = get_model_parallel_world_size() - self.hidden_size_per_partition = divide(hidden_size, world_size) - self.hidden_size_per_attention_head = divide(hidden_size, - num_attention_heads) - self.num_attention_heads_per_partition = divide( + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_partition = mpu.divide(hidden_size, world_size) + self.hidden_size_per_attention_head = mpu.divide( + hidden_size, num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( num_attention_heads, world_size) self.relative_encoding = relative_encoding self.attention_scale = attention_scale # Strided linear layer. - self.query_key_value = ColumnParallelLinear( + self.query_key_value = mpu.ColumnParallelLinear( hidden_size, 3 * hidden_size, stride=3, gather_output=False, init_method=init_method) if relative_encoding: - self.relative = ColumnParallelLinear( + self.relative = mpu.ColumnParallelLinear( hidden_size, hidden_size, gather_output=False, @@ -227,7 +221,7 @@ class ParallelSelfAttention(torch.nn.Module): self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) # Output. 
- self.dense = RowParallelLinear( + self.dense = mpu.RowParallelLinear( hidden_size, hidden_size, input_is_parallel=True, @@ -235,9 +229,8 @@ class ParallelSelfAttention(torch.nn.Module): self.output_dropout = torch.nn.Dropout(output_dropout_prob) if deepspeed.checkpointing.is_configured(): - global get_cuda_rng_tracker, checkpoint - get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - checkpoint = deepspeed.checkpointing.checkpoint + mpu.get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + mpu.checkpoint = deepspeed.checkpointing.checkpoint def _transpose_for_scores(self, tensor): """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with @@ -284,13 +277,13 @@ class ParallelSelfAttention(torch.nn.Module): if mem is None: mixed_x_layer = self.query_key_value(hidden_states) (mixed_query_layer, mixed_key_layer, - mixed_value_layer) = split_tensor_along_last_dim( + mixed_value_layer) = mpu.split_tensor_along_last_dim( mixed_x_layer, 3) else: cat = torch.cat((mem, hidden_states), 1) mixed_x_layer = self.query_key_value(cat) (mixed_query_layer, mixed_key_layer, - mixed_value_layer) = split_tensor_along_last_dim( + mixed_value_layer) = mpu.split_tensor_along_last_dim( mixed_x_layer, 3) mixed_query_layer = mixed_query_layer[:, -query_length:] @@ -342,7 +335,7 @@ class ParallelSelfAttention(torch.nn.Module): attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - with get_cuda_rng_tracker().fork(): + with mpu.get_cuda_rng_tracker().fork(): attention_probs = self.attention_dropout(attention_probs) # Context layer. @@ -403,13 +396,13 @@ class ParallelMLP(torch.nn.Module): if output_layer_init_method is None: output_layer_init_method = init_method # Project to 4h. - self.dense_h_to_4h = ColumnParallelLinear( + self.dense_h_to_4h = mpu.ColumnParallelLinear( hidden_size, 4 * hidden_size, gather_output=False, init_method=init_method) # Project back to h. - self.dense_4h_to_h = RowParallelLinear( + self.dense_4h_to_h = mpu.RowParallelLinear( 4 * hidden_size, hidden_size, input_is_parallel=True, @@ -732,10 +725,10 @@ class GPT2ParallelTransformer(torch.nn.Module): # Relative position embedding self.position_embeddings = PositionalEmbedding(hidden_size) # Per attention head and per partition values. 
- world_size = get_model_parallel_world_size() - self.hidden_size_per_attention_head = divide( + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_attention_head = mpu.divide( hidden_size, num_attention_heads) - self.num_attention_heads_per_partition = divide( + self.num_attention_heads_per_partition = mpu.divide( num_attention_heads, world_size) self.r_w_bias = torch.nn.Parameter( torch.Tensor(self.num_attention_heads_per_partition, @@ -798,9 +791,8 @@ class GPT2ParallelTransformer(torch.nn.Module): self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon) if deepspeed.checkpointing.is_configured(): - global get_cuda_rng_tracker, checkpoint - get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - checkpoint = deepspeed.checkpointing.checkpoint + mpu.get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + mpu.checkpoint = deepspeed.checkpointing.checkpoint def forward(self, hidden_states, @@ -917,7 +909,8 @@ class GPT2ParallelTransformer(torch.nn.Module): args += [position_embeddings, self.r_w_bias, self.r_r_bias] if memory_states: args += memory_states[l:l + chunk_length] - hidden_states = checkpoint(custom(l, l + chunk_length), *args) + hidden_states = mpu.checkpoint( + custom(l, l + chunk_length), *args) l += chunk_length # noqa else: for i, layer in enumerate(self.layers): @@ -1000,14 +993,14 @@ class BertParallelSelfAttention(torch.nn.Module): self.dropout_prob = dropout_prob self.output_parallel = output_parallel # Per attention head and per partition values. - world_size = get_model_parallel_world_size() - self.hidden_size_per_partition = divide(hidden_size, world_size) - self.hidden_size_per_attention_head = divide(hidden_size, - num_attention_heads) - self.num_attention_heads_per_partition = divide( + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_partition = mpu.divide(hidden_size, world_size) + self.hidden_size_per_attention_head = mpu.divide( + hidden_size, num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( num_attention_heads, world_size) # Strided linear layer. - self.query_key_value = ColumnParallelLinear( + self.query_key_value = mpu.ColumnParallelLinear( hidden_size, 3 * hidden_size, stride=3, @@ -1019,9 +1012,8 @@ class BertParallelSelfAttention(torch.nn.Module): self.dropout = torch.nn.Dropout(dropout_prob) if deepspeed.checkpointing.is_configured(): - global get_cuda_rng_tracker, checkpoint - get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker - checkpoint = deepspeed.checkpointing.checkpoint + mpu.get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker + mpu.checkpoint = deepspeed.checkpointing.checkpoint def _transpose_for_scores(self, tensor): """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with @@ -1038,7 +1030,8 @@ class BertParallelSelfAttention(torch.nn.Module): # Attention heads. [b, s, hp] mixed_x_layer = self.query_key_value(hidden_states) (mixed_query_layer, mixed_key_layer, - mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) + mixed_value_layer) = mpu.split_tensor_along_last_dim( + mixed_x_layer, 3) # Reshape and transpose [b, np, s, hn] query_layer = self._transpose_for_scores(mixed_query_layer) @@ -1057,7 +1050,7 @@ class BertParallelSelfAttention(torch.nn.Module): attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
- with get_cuda_rng_tracker().fork(): + with mpu.get_cuda_rng_tracker().fork(): attention_probs = self.dropout(attention_probs) # Context layer. @@ -1074,7 +1067,7 @@ class BertParallelSelfAttention(torch.nn.Module): if self.output_parallel: output = context_layer else: - output = gather_from_model_parallel_region(context_layer) + output = mpu.gather_from_model_parallel_region(context_layer) return output @@ -1092,7 +1085,7 @@ class BertParallelTransformerOutput(torch.nn.Module): init_method=init.xavier_normal_): super(BertParallelTransformerOutput, self).__init__() # Components. - self.dense = RowParallelLinear( + self.dense = mpu.RowParallelLinear( input_size, output_size, input_is_parallel=input_is_parallel, @@ -1167,7 +1160,7 @@ class BertParallelTransformerLayer(torch.nn.Module): input_is_parallel=True, init_method=init_method) # Intermediate. - self.intermediate = ColumnParallelLinear( + self.intermediate = mpu.ColumnParallelLinear( hidden_size, intermediate_size, gather_output=False, diff --git a/modelscope/models/nlp/mglm/mpu/__init__.py b/modelscope/models/nlp/mglm/mpu/__init__.py deleted file mode 100755 index 8cca4e2c..00000000 --- a/modelscope/models/nlp/mglm/mpu/__init__.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Model parallel utility interface.""" - -from .cross_entropy import vocab_parallel_cross_entropy -from .data import broadcast_data -from .grads import clip_grad_norm -from .initialize import (destroy_model_parallel, get_data_parallel_group, - get_data_parallel_rank, get_data_parallel_world_size, - get_model_parallel_group, get_model_parallel_rank, - get_model_parallel_src_rank, - get_model_parallel_world_size, - initialize_model_parallel, - model_parallel_is_initialized) -from .layers import (ColumnParallelLinear, ParallelEmbedding, - RowParallelLinear, VocabParallelEmbedding) -from .mappings import (copy_to_model_parallel_region, - gather_from_model_parallel_region, - reduce_from_model_parallel_region, - scatter_to_model_parallel_region) -from .random import (checkpoint, get_cuda_rng_tracker, - model_parallel_cuda_manual_seed, - partition_activations_in_checkpoint) -from .transformer import (BertParallelSelfAttention, - BertParallelTransformerLayer, - GPT2ParallelTransformer, LayerNorm) diff --git a/modelscope/models/nlp/mglm/mpu/cross_entropy.py b/modelscope/models/nlp/mglm/mpu/cross_entropy.py deleted file mode 100644 index 2ebcf7a8..00000000 --- a/modelscope/models/nlp/mglm/mpu/cross_entropy.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
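The hunks above swap the vendored mpu package for the external megatron_util one. An illustrative sketch of the resulting usage pattern, restricted to constructor signatures that appear in the diff itself (mpu.ColumnParallelLinear, mpu.RowParallelLinear); the layer sizes and activation are made up, the single-tensor return convention is assumed to match the removed local layers, and model-parallel state is assumed to be initialized beforehand (e.g. via init_megatron_util, as in MGLMForTextSummarization above).

import torch
import torch.nn.functional as F
from megatron_util import mpu


class TinyParallelMLP(torch.nn.Module):
    # Toy block wired the same way as ParallelMLP after the migration:
    # column-parallel projection up, row-parallel projection back down,
    # mirroring dense_h_to_4h / dense_4h_to_h in transformer.py above.

    def __init__(self, hidden_size=1024):
        super().__init__()
        self.dense_h_to_4h = mpu.ColumnParallelLinear(
            hidden_size, 4 * hidden_size, gather_output=False)
        self.dense_4h_to_h = mpu.RowParallelLinear(
            4 * hidden_size, hidden_size, input_is_parallel=True)

    def forward(self, hidden_states):
        # Activation chosen for illustration only.
        intermediate = F.gelu(self.dense_h_to_4h(hidden_states))
        return self.dense_4h_to_h(intermediate)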
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from .initialize import (get_model_parallel_group, get_model_parallel_rank, - get_model_parallel_world_size) -from .utils import VocabUtility - - -class _VocabParallelCrossEntropy(torch.autograd.Function): - - @staticmethod - def forward(ctx, vocab_parallel_logits, target): - - # Copy so the input remains unchanged. - logits = vocab_parallel_logits.clone() - # Maximum value along vocab dimension across all GPUs. - logits_max = torch.max(logits, dim=-1)[0] - torch.distributed.all_reduce( - logits_max, - op=torch.distributed.ReduceOp.MAX, - group=get_model_parallel_group()) - # Subtract the maximum value. - logits.sub_(logits_max.unsqueeze(dim=-1)) - # Sum of exponential of logits along vocab dimension across all GPUs. - exp_logits = logits.exp() - sum_exp_logits = exp_logits.sum(dim=-1) - torch.distributed.all_reduce( - sum_exp_logits, - op=torch.distributed.ReduceOp.SUM, - group=get_model_parallel_group()) - - # Get the partition's vocab indecies - get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size - partition_vocab_size = vocab_parallel_logits.size()[-1] - rank = get_model_parallel_rank() - world_size = get_model_parallel_world_size() - vocab_start_index, vocab_end_index = get_vocab_range( - partition_vocab_size, rank, world_size) - - # Create a mask of valid vocab ids (1 means it needs to be masked). - target_mask = (target < vocab_start_index) | ( - target >= vocab_end_index) - masked_target = target.clone() - vocab_start_index - masked_target[target_mask] = 0 - - # Get predicted-logits = logits[target]. - # For Simplicity, we convert logits to a 2-D tensor with size - # [*, partition-vocab-size] and target to a 1-D tensor of size [*]. - logits_2d = logits.view(-1, partition_vocab_size) - masked_target_1d = masked_target.view(-1) - arange_1d = torch.arange( - start=0, end=logits_2d.size()[0], device=logits_2d.device) - predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] - predicted_logits = predicted_logits_1d.view_as(target) - predicted_logits[target_mask] = 0.0 - # All reduce is needed to get the chunks from other GPUs. - torch.distributed.all_reduce( - predicted_logits, - op=torch.distributed.ReduceOp.SUM, - group=get_model_parallel_group()) - - # Loss = log(sum(exp(logits))) - predicted-logit. - loss = torch.log(sum_exp_logits) - predicted_logits - - # Store softmax, target-mask and masked-target for backward pass. - exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) - ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) - - return loss - - @staticmethod - def backward(ctx, grad_output): - - # Retreive tensors from the forward path. - softmax, target_mask, masked_target_1d = ctx.saved_tensors - - # All the inputs have softmax as thier gradient. - grad_input = softmax - # For simplicity, work with the 2D gradient. - partition_vocab_size = softmax.size()[-1] - grad_2d = grad_input.view(-1, partition_vocab_size) - - # Add the gradient from matching classes. 
- arange_1d = torch.arange( - start=0, end=grad_2d.size()[0], device=grad_2d.device) - grad_2d[arange_1d, - masked_target_1d] -= (1.0 - target_mask.view(-1).float()) - - # Finally elementwise multiplication with the output gradients. - grad_input.mul_(grad_output.unsqueeze(dim=-1)) - - return grad_input, None - - -def vocab_parallel_cross_entropy(vocab_parallel_logits, target): - """Helper function for the cross entropy.""" - return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target) diff --git a/modelscope/models/nlp/mglm/mpu/data.py b/modelscope/models/nlp/mglm/mpu/data.py deleted file mode 100644 index 6f595f0f..00000000 --- a/modelscope/models/nlp/mglm/mpu/data.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from .initialize import (get_model_parallel_group, get_model_parallel_rank, - get_model_parallel_src_rank) - -_MAX_DATA_DIM = 5 - - -def _check_data_types(keys, data, target_dtype): - """Check that all the keys have the same target data type.""" - for key in keys: - assert data[key].dtype == target_dtype, '{} has data type {} which '\ - 'is different than {}'.format(key, data[key].dtype, target_dtype) - - -def _build_key_size_numel_dictionaries(keys, data): - """Build the size on rank 0 and broadcast.""" - max_dim = _MAX_DATA_DIM - sizes = [0 for _ in range(max_dim) for _ in keys] - - # Pack the sizes on rank zero. - if get_model_parallel_rank() == 0: - offset = 0 - for key in keys: - assert data[key].dim( - ) < max_dim, 'you should increase MAX_DATA_DIM' - size = data[key].size() - for i, s in enumerate(size): - sizes[i + offset] = s - offset += max_dim - - # Move to GPU and broadcast. - sizes_cuda = torch.cuda.LongTensor(sizes) - torch.distributed.broadcast( - sizes_cuda, - get_model_parallel_src_rank(), - group=get_model_parallel_group()) - - # Move back to cpu and unpack. - sizes_cpu = sizes_cuda.cpu() - key_size = {} - key_numel = {} - total_numel = 0 - offset = 0 - for key in keys: - i = 0 - size = [] - numel = 1 - while sizes_cpu[offset + i] > 0: - this_size = sizes_cpu[offset + i] - size.append(this_size) - numel *= this_size - i += 1 - key_size[key] = size - key_numel[key] = numel - total_numel += numel - offset += max_dim - - return key_size, key_numel, total_numel - - -def broadcast_data(keys, data, datatype): - """Broadcast data from rank zero of each model parallel group to the - members of the same model parallel group. - - Arguments: - keys: list of keys in the data disctionary to be broadcasted - data: data dictionary of string keys and cpu tensor values. - datatype: torch data type of all tensors in data associated - with keys. - """ - # Build (key, size) and (key, number of elements) dictionaries along - # with the total number of elements on all ranks. - key_size, key_numel, total_numel = _build_key_size_numel_dictionaries( - keys, data) - - # Pack on rank zero. 
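In the single-partition case (model-parallel world size 1, so both all-reduces are no-ops) the vocab-parallel cross entropy deleted above reduces to the standard log-sum-exp formulation. A quick single-process sanity check of that equivalence; the shapes are made up.

import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(4, 11)                  # [tokens, vocab] on one partition
target = torch.randint(0, 11, (4,))

# Same steps as _VocabParallelCrossEntropy.forward with world_size == 1:
# subtract the max for stability, take log of the summed exponentials,
# and subtract the logit of the target class.
shifted = logits - logits.max(dim=-1, keepdim=True).values
sum_exp_logits = shifted.exp().sum(dim=-1)
predicted_logits = shifted[torch.arange(logits.size(0)), target]
loss = torch.log(sum_exp_logits) - predicted_logits

reference = F.cross_entropy(logits, target, reduction='none')
print(torch.allclose(loss, reference, atol=1e-6))   # True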
- if get_model_parallel_rank() == 0: - # Check that all keys have the same data type. - _check_data_types(keys, data, datatype) - # Flatten the data associated with the keys - flatten_data = torch.cat( - [data[key].contiguous().view(-1) for key in keys], dim=0).cuda() - else: - flatten_data = torch.empty( - total_numel, device=torch.cuda.current_device(), dtype=datatype) - - # Boradcast - torch.distributed.broadcast( - flatten_data, - get_model_parallel_src_rank(), - group=get_model_parallel_group()) - - # Unpack - output = {} - offset = 0 - for key in keys: - size = key_size[key] - numel = key_numel[key] - output[key] = flatten_data.narrow(0, offset, numel).view(size) - offset += numel - - return output diff --git a/modelscope/models/nlp/mglm/mpu/grads.py b/modelscope/models/nlp/mglm/mpu/grads.py deleted file mode 100644 index a7dc6c5c..00000000 --- a/modelscope/models/nlp/mglm/mpu/grads.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Parts of the code here are adapted from PyTorch -# repo: https://github.com/pytorch/pytorch - -import torch -from torch._six import inf - -from .initialize import get_model_parallel_group, get_model_parallel_rank - - -def clip_grad_norm(parameters, max_norm, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - max_norm = float(max_norm) - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - # Take max across all GPUs. - torch.distributed.all_reduce( - total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0 - for p in parameters: - if p.model_parallel or (get_model_parallel_rank() == 0): - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item()**norm_type - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce( - total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) - return total_norm diff --git a/modelscope/models/nlp/mglm/mpu/initialize.py b/modelscope/models/nlp/mglm/mpu/initialize.py deleted file mode 100644 index 33f8dbda..00000000 --- a/modelscope/models/nlp/mglm/mpu/initialize.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Model and data parallel groups.""" - -import torch - -from .utils import ensure_divisibility - -# Model parallel group that the current rank belongs to. -_MODEL_PARALLEL_GROUP = None -# Data parallel group that the current rank belongs to. -_DATA_PARALLEL_GROUP = None - - -def initialize_model_parallel(model_parallel_size_): - """ - Initialize model data parallel groups. - - Arguments: - model_parallel_size: number of GPUs used to parallelize model. - - Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we - use 2 GPUs to parallelize the model. The present function will - create 4 model parallel groups and 2 data parallel grous as: - 4 model parallel groups: - [g0, g1], [g2, g3], [g4, g5], [g6, g7] - 2 data parallel groups: - [g0, g2, g4, g6], [g1, g3, g5, g7] - Note that for efficiency, the caller should make sure adjacent ranks - are on the same DGX box. For example if we are using 2 DGX-1 boxes - with a total of 16 GPUs, rank 0 to 7 belong to the first box and - ranks 8 to 15 belong to the second box. - """ - if torch.distributed.get_rank() == 0: - print('> initializing model parallel with size {}'.format( - model_parallel_size_)) - # Get world size and rank. Ensure some consistencies. - assert torch.distributed.is_initialized() - world_size = torch.distributed.get_world_size() - model_parallel_size = min(model_parallel_size_, world_size) - ensure_divisibility(world_size, model_parallel_size) - rank = torch.distributed.get_rank() - - # Build the data parallel groups. - global _DATA_PARALLEL_GROUP - assert _DATA_PARALLEL_GROUP is None, \ - 'data parallel group is already initialized' - for i in range(model_parallel_size): - ranks = range(i, world_size, model_parallel_size) - group = torch.distributed.new_group(ranks) - if i == (rank % model_parallel_size): - _DATA_PARALLEL_GROUP = group - - # Build the model parallel groups. 
- global _MODEL_PARALLEL_GROUP - assert _MODEL_PARALLEL_GROUP is None, \ - 'model parallel group is already initialized' - for i in range(world_size // model_parallel_size): - ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size) - group = torch.distributed.new_group(ranks) - if i == (rank // model_parallel_size): - _MODEL_PARALLEL_GROUP = group - - -def model_parallel_is_initialized(): - """Check if model and data parallel groups are initialized.""" - if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: - return False - return True - - -def get_model_parallel_group(): - """Get the model parallel group the caller rank belongs to.""" - assert _MODEL_PARALLEL_GROUP is not None, \ - 'model parallel group is not initialized' - return _MODEL_PARALLEL_GROUP - - -def get_data_parallel_group(): - """Get the data parallel group the caller rank belongs to.""" - assert _DATA_PARALLEL_GROUP is not None, \ - 'data parallel group is not initialized' - return _DATA_PARALLEL_GROUP - - -def get_model_parallel_world_size(): - """Return world size for the model parallel group.""" - return torch.distributed.get_world_size(group=get_model_parallel_group()) - - -def get_model_parallel_rank(): - """Return my rank for the model parallel group.""" - return torch.distributed.get_rank(group=get_model_parallel_group()) - - -def get_model_parallel_src_rank(): - """Calculate the global rank corresponding to a local rank zeor - in the model parallel group.""" - global_rank = torch.distributed.get_rank() - local_world_size = get_model_parallel_world_size() - return (global_rank // local_world_size) * local_world_size - - -def get_data_parallel_world_size(): - """Return world size for the data parallel group.""" - return torch.distributed.get_world_size(group=get_data_parallel_group()) - - -def get_data_parallel_rank(): - """Return my rank for the data parallel group.""" - return torch.distributed.get_rank(group=get_data_parallel_group()) - - -def destroy_model_parallel(): - """Set the groups to none.""" - global _MODEL_PARALLEL_GROUP - _MODEL_PARALLEL_GROUP = None - global _DATA_PARALLEL_GROUP - _DATA_PARALLEL_GROUP = None diff --git a/modelscope/models/nlp/mglm/mpu/layers.py b/modelscope/models/nlp/mglm/mpu/layers.py deleted file mode 100644 index 4eb94b50..00000000 --- a/modelscope/models/nlp/mglm/mpu/layers.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
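A dry run, without touching torch.distributed, of the group layout that the deleted initialize_model_parallel() builds; it reproduces the 8-GPU / model_parallel_size=2 example from the removed docstring. Pure illustration, not repository code.

def parallel_group_layout(world_size, model_parallel_size):
    model_parallel_size = min(model_parallel_size, world_size)
    assert world_size % model_parallel_size == 0
    # Data-parallel groups: ranks that hold the same model shard.
    data_groups = [list(range(i, world_size, model_parallel_size))
                   for i in range(model_parallel_size)]
    # Model-parallel groups: blocks of adjacent ranks.
    model_groups = [list(range(i * model_parallel_size,
                               (i + 1) * model_parallel_size))
                    for i in range(world_size // model_parallel_size)]
    return model_groups, data_groups


if __name__ == '__main__':
    model_groups, data_groups = parallel_group_layout(8, 2)
    print('model parallel groups:', model_groups)
    # -> [[0, 1], [2, 3], [4, 5], [6, 7]]
    print('data parallel groups:', data_groups)
    # -> [[0, 2, 4, 6], [1, 3, 5, 7]]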
- -# Parts of the code here are adapted from PyTorch -# repo: https://github.com/pytorch/pytorch - -import math - -import torch -import torch.nn.functional as F -import torch.nn.init as init -from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm -from torch.nn.parameter import Parameter - -from .initialize import get_model_parallel_rank, get_model_parallel_world_size -from .mappings import (copy_to_model_parallel_region, - gather_from_model_parallel_region, - reduce_from_model_parallel_region, - scatter_to_model_parallel_region) -from .random import get_cuda_rng_tracker -from .utils import VocabUtility, divide, split_tensor_along_last_dim - - -def _initialize_affine_weight(weight, - output_size, - input_size, - per_partition_size, - partition_dim, - init_method, - stride=1, - return_master_weight=False): - """Initialize affine weight for model parallel. - - Build the master weight on all processes and scatter - the relevant chunk.""" - # If we only use 1 process for model parallelism, bypass scatter. - world_size = get_model_parallel_world_size() - if world_size == 1: - init_method(weight) - if return_master_weight: - return weight - return None - - # Initialize master weight - master_weight = torch.empty( - output_size, input_size, dtype=weight.dtype, requires_grad=False) - init_method(master_weight) - - # Split and copy - per_partition_per_stride_size = divide(per_partition_size, stride) - weight_list = torch.split( - master_weight, per_partition_per_stride_size, dim=partition_dim) - rank = get_model_parallel_rank() - my_weight_list = weight_list[rank::world_size] - - with torch.no_grad(): - torch.cat(my_weight_list, dim=partition_dim, out=weight) - if return_master_weight: - return master_weight - return None - - -class VocabParallelEmbedding(torch.nn.Module): - """Embedding parallelized in the vocabulary dimension. - - This is mainly adapted from torch.nn.Embedding and all the default - values are kept. - Arguments: - num_embeddings: vocabulary size. - embedding_dim: size of hidden state. - init_method: method to initialize weights. - """ - - def __init__(self, - num_embeddings, - embedding_dim, - init_method=init.xavier_normal_): - super(VocabParallelEmbedding, self).__init__() - # Keep the input dimensions. - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - # Set the detauls for compatibility. - self.padding_idx = None - self.max_norm = None - self.norm_type = 2. - self.scale_grad_by_freq = False - self.sparse = False - self._weight = None - # Divide the weight matrix along the vocaburaly dimension. - self.vocab_start_index, self.vocab_end_index = \ - VocabUtility.vocab_range_from_global_vocab_size( - self.num_embeddings, get_model_parallel_rank(), - get_model_parallel_world_size()) - self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index # noqa - - # Allocate weights. - self.weight = Parameter( - torch.Tensor(self.num_embeddings_per_partition, - self.embedding_dim)) - self.weight.model_parallel = True - # And initialize. - _initialize_affine_weight(self.weight, self.num_embeddings, - self.embedding_dim, - self.num_embeddings_per_partition, 0, - init_method) - - def forward(self, input_): - # Build the mask. - input_mask = (input_ < self.vocab_start_index) | \ - (input_ >= self.vocab_end_index) - # Mask the input. - masked_input = input_.clone() - self.vocab_start_index - masked_input[input_mask] = 0 - # Get the embeddings. 
- output_parallel = F.embedding(masked_input, self.weight, - self.padding_idx, self.max_norm, - self.norm_type, self.scale_grad_by_freq, - self.sparse) - # Mask the output embedding. - output_parallel[input_mask, :] = 0.0 - # Reduce across all the model parallel GPUs. - output = reduce_from_model_parallel_region(output_parallel) - return output - - -class ParallelEmbedding(torch.nn.Module): - """Embedding parallelized in the embedding dimension. - - This is mainly adapted from torch.nn.Embedding and all the default - values are kept. - Arguments: - num_embeddings: vocabulary size. - embedding_dim: size of hidden state. - init_method: method to initialize weights. - """ - - def __init__(self, - num_embeddings, - embedding_dim, - init_method=init.xavier_normal_, - keep_master_weight_for_test=False): - super(ParallelEmbedding, self).__init__() - # Keep the input dimensions. - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - # Set some detauls for compatibility. - self.padding_idx = None - self.max_norm = None - self.norm_type = 2. - self.scale_grad_by_freq = False - self.sparse = False - self._weight = None - # Divide the weight matrix along the embedding dimension. - world_size = get_model_parallel_world_size() - self.embedding_dim_per_partition = divide(self.embedding_dim, - world_size) - - # Allocate weights. - self.weight = Parameter( - torch.Tensor(self.num_embeddings, - self.embedding_dim_per_partition)) - self.weight.model_parallel = True - # And initialize. - _initialize_affine_weight( - self.weight, - self.num_embeddings, - self.embedding_dim, - self.embedding_dim_per_partition, - 1, - init_method, - stride=1, - return_master_weight=False) - - def forward(self, input_): - input_parallel = copy_to_model_parallel_region(input_) - output_parallel = F.embedding(input_parallel, self.weight, - self.padding_idx, self.max_norm, - self.norm_type, self.scale_grad_by_freq, - self.sparse) - output = gather_from_model_parallel_region(output_parallel) - return output - - -class ColumnParallelLinear(torch.nn.Module): - """Linear layer with column parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its second dimension as A = [A_1, ..., A_p]. - - Arguments: - input_size: first dimension of matrix A. - output_size: second dimension of matrix A. - bias: If true, add bias - gather_output: If true, call all-gether on output and make Y avaiable - to all GPUs, otherwise, every GPU will have its output - which is Y_i = XA_i - init_method: method to initialize weights. Note that bias is always set - to zero. - stride: For the strided linear layers. - keep_master_weight_for_test: This was added for testing and should be - set to False. It returns the master weights - used for initialization. - """ - - def __init__(self, - input_size, - output_size, - bias=True, - gather_output=True, - init_method=init.xavier_normal_, - stride=1, - keep_master_weight_for_test=False): - super(ColumnParallelLinear, self).__init__() - - # Keep input parameters - self.input_size = input_size - self.output_size = output_size - self.gather_output = gather_output - # Divide the weight matrix along the last dimension. - world_size = get_model_parallel_world_size() - self.output_size_per_partition = divide(output_size, world_size) - - # Parameters. - # Note: torch.nn.functional.linear performs XA^T + b and as a result - # we allocate the transpose. 
- self.weight = Parameter( - torch.Tensor(self.output_size_per_partition, self.input_size)) - self.weight.model_parallel = True - if bias: - self.bias = Parameter(torch.Tensor(self.output_size_per_partition)) - self.bias.model_parallel = True - # Always initialize bias to zero. - with torch.no_grad(): - self.bias.zero_() - else: - self.register_parameter('bias', None) - - # Initialize weight. - self.master_weight = _initialize_affine_weight( - self.weight, - self.output_size, - self.input_size, - self.output_size_per_partition, - 0, - init_method, - stride=stride, - return_master_weight=keep_master_weight_for_test) - - def forward(self, input_): - # Set up backprop all-reduce. - input_parallel = copy_to_model_parallel_region(input_) - # Matrix multiply. - output_parallel = F.linear(input_parallel, self.weight, self.bias) - if self.gather_output: - # All-gather across the partitions. - output = gather_from_model_parallel_region(output_parallel) - else: - output = output_parallel - return output - - -class RowParallelLinear(torch.nn.Module): - """Linear layer with row parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its first dimension and X along its second dimension as: - - - - | A_1 | - | . | - A = | . | X = [X_1, ..., X_p] - | . | - | A_p | - - - - Arguments: - input_size: first dimension of matrix A. - output_size: second dimension of matrix A. - bias: If true, add bias. Note that bias is not parallelized. - input_is_parallel: If true, we assume that the input is already - split across the GPUs and we do not split - again. - init_method: method to initialize weights. Note that bias is always set - to zero. - stride: For the strided linear layers. - keep_master_weight_for_test: This was added for testing and should be - set to False. It returns the master weights - used for initialization. - """ - - def __init__(self, - input_size, - output_size, - bias=True, - input_is_parallel=False, - init_method=init.xavier_normal_, - stride=1, - keep_master_weight_for_test=False): - super(RowParallelLinear, self).__init__() - - # Keep input parameters - self.input_size = input_size - self.output_size = output_size - self.input_is_parallel = input_is_parallel - # Divide the weight matrix along the last dimension. - world_size = get_model_parallel_world_size() - self.input_size_per_partition = divide(input_size, world_size) - - # Parameters. - # Note: torch.nn.functional.linear performs XA^T + b and as a result - # we allocate the transpose. - self.weight = Parameter( - torch.Tensor(self.output_size, self.input_size_per_partition)) - self.weight.model_parallel = True - if bias: - self.bias = Parameter(torch.Tensor(self.output_size)) - # Always initialize bias to zero. - with torch.no_grad(): - self.bias.zero_() - else: - self.register_parameter('bias', None) - - # Initialize weight. - self.master_weight = _initialize_affine_weight( - self.weight, - self.output_size, - self.input_size, - self.input_size_per_partition, - 1, - init_method, - stride=stride, - return_master_weight=keep_master_weight_for_test) - - def forward(self, input_): - # Set up backprop all-reduce. - if self.input_is_parallel: - input_parallel = input_ - else: - input_parallel = scatter_to_model_parallel_region(input_) - # Matrix multiply. - output_parallel = F.linear(input_parallel, self.weight) - # All-reduce across all the partitions. 
- output_ = reduce_from_model_parallel_region(output_parallel) - if self.bias is not None: - output = output_ + self.bias - else: - output = output_ - return output diff --git a/modelscope/models/nlp/mglm/mpu/mappings.py b/modelscope/models/nlp/mglm/mpu/mappings.py deleted file mode 100644 index b3056dd7..00000000 --- a/modelscope/models/nlp/mglm/mpu/mappings.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from .initialize import get_model_parallel_group -from .utils import split_tensor_along_last_dim - - -def _reduce(input_): - """All-reduce the the input tensor across model parallel group.""" - group = get_model_parallel_group() - - # Bypass the function if we are using only 1 GPU. - if torch.distributed.get_world_size(group=group) == 1: - return input_ - - # All-reduce. - torch.distributed.all_reduce(input_, group=group) - - return input_ - - -def _split(input_): - """Split the tensor along its last dimension and keep the - corresponding slice.""" - group = get_model_parallel_group() - - # Bypass the function if we are using only 1 GPU. - if torch.distributed.get_world_size(group=group) == 1: - return input_ - - # Split along last dimension. - world_size = torch.distributed.get_world_size(group=group) - input_list = split_tensor_along_last_dim(input_, world_size) - - # Note: torch.split does not create contiguous tensors by default. - rank = torch.distributed.get_rank(group=group) - output = input_list[rank].contiguous() - - return output - - -def _gather(input_): - """Gather tensors and concatinate along the last dimension.""" - group = get_model_parallel_group() - - # Bypass the function if we are using only 1 GPU. - if torch.distributed.get_world_size(group=group) == 1: - return input_ - - # Size and dimension. - last_dim = input_.dim() - 1 - rank = torch.distributed.get_rank(group=group) - world_size = torch.distributed.get_world_size(group=group) - - tensor_list = [torch.empty_like(input_) for _ in range(world_size)] - tensor_list[rank] = input_ - torch.distributed.all_gather(tensor_list, input_, group=group) - - # Note: torch.cat already creates a contiguous tensor. 
- output = torch.cat(tensor_list, dim=last_dim).contiguous() - - return output - - -class _CopyToModelParallelRegion(torch.autograd.Function): - """Pass the input to the model parallel region.""" - - @staticmethod - def forward(ctx, input_): - return input_ - - @staticmethod - def backward(ctx, grad_output): - return _reduce(grad_output) - - -class _ReduceFromModelParallelRegion(torch.autograd.Function): - """All-redcue the input from the model parallel region.""" - - @staticmethod - def forward(ctx, input_): - return _reduce(input_) - - @staticmethod - def backward(ctx, grad_output): - return grad_output - - -class _ScatterToModelParallelRegion(torch.autograd.Function): - """Split the input and keep only the corresponding chuck to the rank.""" - - @staticmethod - def forward(ctx, input_): - return _split(input_) - - @staticmethod - def backward(ctx, grad_output): - return _gather(grad_output) - - -class _GatherFromModelParallelRegion(torch.autograd.Function): - """Gather the input from model parallel region and concatinate.""" - - @staticmethod - def forward(ctx, input_): - return _gather(input_) - - @staticmethod - def backward(ctx, grad_output): - return _split(grad_output) - - -# ----------------- -# Helper functions. -# ----------------- - - -def copy_to_model_parallel_region(input_): - return _CopyToModelParallelRegion.apply(input_) - - -def reduce_from_model_parallel_region(input_): - return _ReduceFromModelParallelRegion.apply(input_) - - -def scatter_to_model_parallel_region(input_): - return _ScatterToModelParallelRegion.apply(input_) - - -def gather_from_model_parallel_region(input_): - return _GatherFromModelParallelRegion.apply(input_) diff --git a/modelscope/models/nlp/mglm/mpu/random.py b/modelscope/models/nlp/mglm/mpu/random.py deleted file mode 100755 index 2cdf236d..00000000 --- a/modelscope/models/nlp/mglm/mpu/random.py +++ /dev/null @@ -1,408 +0,0 @@ -# Modified by Samyam Rajbhandari -# Used to partition the activations stored for backward propagation -# Therefore reduces the memory consumption - -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Parts of the code here are adapted from PyTorch -# repo: https://github.com/pytorch/pytorch -import contextlib - -import torch -import torch.distributed as dist -from torch import _C -from torch.cuda import _lazy_call -from torch.cuda import device as device_ctx_manager - -from .initialize import (get_data_parallel_rank, get_model_parallel_group, - get_model_parallel_rank, - get_model_parallel_world_size) - -# from torch.utils.checkpoint import detach_variable - -PARTITION_ACTIVATIONS = False -PA_CORRECTNESS_TEST = False - - -def see_memory_usage(message, force=False): - if not force: - return - dist.barrier() - if dist.get_rank() == 0: - print(message) - print('Memory Allocated ', - torch.cuda.memory_allocated() / (1024 * 1024 * 1024), - 'GigaBytes') - print('Max Memory Allocated ', - torch.cuda.max_memory_allocated() / (1024 * 1024 * 1024), - 'GigaBytes') - print('Cache Allocated ', - torch.cuda.memory_cached() / (1024 * 1024 * 1024), 'GigaBytes') - print('Max cache Allocated ', - torch.cuda.max_memory_cached() / (1024 * 1024 * 1024), - 'GigaBytes') - print(' ') - # input("Press Any Key To Continue ..") - - -mp_rank = None # get_model_parallel_rank() -mp_size = None # get_model_parallel_world_size() -mp_group = None # get_model_parallel_group() - -# Default name for the model parallel rng tracker. -_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' -transport_stream = None -cuda_device = None - - -def detach_variable(inputs, device=None): - if isinstance(inputs, tuple): - out = [] - for inp in inputs: - if not isinstance(inp, torch.Tensor): - out.append(inp) - continue - - requires_grad = inp.requires_grad - - if device is not None: - x = inp.to(device=device) - else: - x = inp - - x = x.detach() - x.requires_grad = requires_grad - out.append(x) - return tuple(out) - else: - raise RuntimeError( - 'Only tuple of tensors is supported. Got Unsupported input type: ', - type(inputs).__name__) - - -def _set_cuda_rng_state(new_state, device=-1): - """Sets the random number generator state of the current GPU. - - Argumentss: - new_state (torch.ByteTensor): The desired state - This function is adapted from PyTorch repo (torch.cuda.set_rng_state) - with a single change: the input state is not cloned. Cloning caused - major performance issues for +4 GPU cases. - """ - if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): - # older PyTorch - def cb(): - with device_ctx_manager(device): - _C._cuda_setRNGState(new_state) - else: - # newer PyTorch - if device == -1: - device = torch.device('cuda') - elif isinstance(device, str): - device = torch.device(device) - elif isinstance(device, int): - device = torch.device('cuda', device) - - def cb(): - idx = device.index - if idx is None: - idx = torch.cuda.current_device() - default_generator = torch.cuda.default_generators[idx] - default_generator.set_state(new_state) - - _lazy_call(cb) - - -class CudaRNGStatesTracker: - """Tracker for the cuda RNG states. - - Using the `add` method, a cuda rng state is initialized based on - the input `seed` and is assigned to `name`. Later, by forking the - rng state, we can perform operations and return to our starting - cuda state. - """ - - def __init__(self): - # Map from a string name to the cuda rng state. - self.states_ = {} - # Seeds are just for book keeping and ensure no seed is set twice. - self.seeds_ = set() - - def reset(self): - """Set to the initial state (no tracker).""" - self.states_ = {} - self.seeds_ = set() - - def get_states(self): - """Get rng states. 
Copy the dictionary so we have direct - pointers to the states, not just a pointer to the dictionary.""" - states = {} - for name in self.states_: - states[name] = self.states_[name] - return states - - def set_states(self, states): - """Set the rng states. For efficiency purposes, we do not check - the size of seed for compatibility.""" - self.states_ = states - - def add(self, name, seed): - """Track the rng state.""" - # Check seed is not already used. - if seed in self.seeds_: - raise Exception('seed {} already exists'.format(seed)) - self.seeds_.add(seed) - # Check that state is not already defined. - if name in self.states_: - raise Exception('cuda rng state {} already exists'.format(name)) - # Get the current rng state. - orig_rng_state = torch.cuda.get_rng_state() - # Set the new state and store it. - torch.cuda.manual_seed(seed) - self.states_[name] = torch.cuda.get_rng_state() - # Reset rng state to what it was. - _set_cuda_rng_state(orig_rng_state) - - @contextlib.contextmanager - def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): - """Fork the cuda rng state, perform operations, and exit with - the original state.""" - # Check if we have added the state - if name not in self.states_: - raise Exception('cuda rng state {} is not added'.format(name)) - # Store current rng state. - orig_cuda_rng_state = torch.cuda.get_rng_state() - # Set rng state to the desired one - _set_cuda_rng_state(self.states_[name]) - # Do the stuff we wanted to do. - try: - yield - finally: - # Update the current rng state for later use. - self.states_[name] = torch.cuda.get_rng_state() - # And set the state to the original state we started with. - _set_cuda_rng_state(orig_cuda_rng_state) - - -# RNG tracker object. -_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() - - -def get_cuda_rng_tracker(): - """Get cuda rng tracker.""" - return _CUDA_RNG_STATE_TRACKER - - -def model_parallel_cuda_manual_seed(seed): - """Initialize model parallel cuda seed. - - This function should be called after the model parallel is - initialized. Also, no torch.cuda.manual_seed should be called - after this function. Basically, this is replacement for that - function. - Two set of RNG states are tracked: - default state: This is for data parallelism and is the same among a - set of model parallel GPUs but different across - different model paralle groups. This is used for - example for dropout in the non-model-parallel regions. - model-parallel state: This state is different among a set of model - parallel GPUs, but the same across data parallel - groups. This is used for example for dropout in - model parallel regions. - """ - # 2718 is just for fun and any POSITIVE value will work. - offset = seed + 2718 - model_parallel_seed = offset + get_model_parallel_rank() - # Data parallel gets the original sedd. - data_parallel_seed = seed - - if torch.distributed.get_rank() == 0: - print( - '> initializing model parallel cuda seeds on global rank {}, ' - 'model parallel rank {}, and data parallel rank {} with ' - 'model parallel seed: {} and data parallel seed: {}'.format( - torch.distributed.get_rank(), get_model_parallel_rank(), - get_data_parallel_rank(), model_parallel_seed, - data_parallel_seed), - flush=True) - _CUDA_RNG_STATE_TRACKER.reset() - # Set the default state. - torch.cuda.manual_seed(data_parallel_seed) - # and model parallel state. 
- _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, - model_parallel_seed) - - -def get_partition_start(item): - global mp_rank, mp_size, mp_group - partition_size = get_partition_size(item) - start = partition_size * mp_rank - return int(start) - - -def get_partition_size(item): - global mp_rank, mp_size, mp_group - size = item.numel() - partition_size = size / mp_size - return int(partition_size) - - -def get_full_inputs(tensors): - inputs = [] - for i in range(int(len(tensors) / 2) - 1): - item = tensors[2 * i] - size = tensors[2 * i + 1] - partition_size = item.numel() - tensor_size = partition_size * mp_size - flat_tensor = torch.zeros([tensor_size], - dtype=item.dtype, - device=item.device) - partitions = [] - for i in range(mp_size): - part_i = flat_tensor.narrow(0, partition_size * i, partition_size) - if i == mp_rank: - part_i.copy_(item) - partitions.append(part_i) - dist.all_gather(partitions, partitions[mp_rank], group=mp_group) - input_tensor = flat_tensor.view(list(size.numpy())) - item.data = input_tensor.data - - inputs.append(item) - inputs.append(tensors[-2]) - - return tuple(inputs) - - -class CheckpointFunction(torch.autograd.Function): - """This function is adapted from torch.utils.checkpoint with - two main changes: - 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` - 2) the states in the model parallel tracker are also properly - tracked/set/reset. - """ - - @staticmethod - def forward(ctx, run_function, *args): - ctx.run_function = run_function - global mp_rank, mp_size, mp_group - if mp_rank is None: - mp_rank = get_model_parallel_rank() - mp_size = get_model_parallel_world_size() - mp_group = get_model_parallel_group() - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS - if cuda_device is None: - if dist.get_rank() == 0: - print( - f'Partition Activations {PARTITION_ACTIVATIONS} and Correctness Check {PA_CORRECTNESS_TEST}' - ) - - cuda_device = torch.cuda.current_device() - # The transport stream is used to overlap the allgather communication for the activations - # with the computation in the backward pass - transport_stream = torch.cuda.Stream(device=cuda_device) - - if PARTITION_ACTIVATIONS: - inputs = [ - item.detach().contiguous().view(-1).narrow( - 0, get_partition_start(item), - get_partition_size(item)).clone() for item in args[:-1] - ] - inputs.append(args[-1]) - - # just in case something funky is happening such as reuse of inputs - inputs_cuda = [item.to(cuda_device) for item in args] - - # Copy the rng states. 
- ctx.fwd_cpu_rng_state = torch.get_rng_state() - ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state() - ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() - - # ctx.save_for_backward(*args) - with torch.no_grad(): - outputs = run_function(*inputs_cuda) - - del inputs_cuda - - if PARTITION_ACTIVATIONS: - new_args = [] - for arg, inp in zip(args, inputs): - size = torch.tensor(arg.size()) - arg.data = inp.data - new_args.append(arg) - new_args.append(size) - ctx.save_for_backward(*new_args) - else: - ctx.save_for_backward(*args) - - return outputs - - @staticmethod - def backward(ctx, *args): - if not torch.autograd._is_checkpoint_valid(): - raise RuntimeError('Checkpointing is not compatible with .grad(), ' - 'please use .backward() if possible') - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS - - if PARTITION_ACTIVATIONS: - with torch.cuda.stream(transport_stream): - inputs = get_full_inputs(ctx.saved_tensors) - detached_inputs = detach_variable(inputs) - else: - inputs = ctx.saved_tensors - detached_inputs = detach_variable(inputs) - - # Store the current states. - bwd_cpu_rng_state = torch.get_rng_state() - bwd_cuda_rng_state = torch.cuda.get_rng_state() - bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states() - - # Set the states to what it used to be before the forward pass. - torch.set_rng_state(ctx.fwd_cpu_rng_state) - _set_cuda_rng_state(ctx.fwd_cuda_rng_state) - get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker) - - if PARTITION_ACTIVATIONS: - current_stream = torch.cuda.current_stream() - current_stream.wait_stream(transport_stream) - - with torch.enable_grad(): - outputs = ctx.run_function(*detached_inputs) - - # Set the states back to what it was at the start of this function. - torch.set_rng_state(bwd_cpu_rng_state) - _set_cuda_rng_state(bwd_cuda_rng_state) - get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker) - - if isinstance(outputs, torch.Tensor): - outputs = (outputs, ) - torch.autograd.backward(outputs, args) - return (None, ) + tuple(inp.grad for inp in detached_inputs) - - -def checkpoint(function, *args): - """Checkpoint a model or part of the model. - This has been directly copied from torch.utils.checkpoint.""" - return CheckpointFunction.apply(function, *args) - - -def partition_activations_in_checkpoint(partition_activation): - global PARTITION_ACTIVATIONS - PARTITION_ACTIVATIONS = partition_activation - if dist.get_rank() == 0: - print( - f'**************Partition Activations {PARTITION_ACTIVATIONS}************' - ) diff --git a/modelscope/models/nlp/mglm/mpu/tests/commons.py b/modelscope/models/nlp/mglm/mpu/tests/commons.py deleted file mode 100644 index ecfd5e72..00000000 --- a/modelscope/models/nlp/mglm/mpu/tests/commons.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import os -import random - -import mpu -import numpy -import torch - - -class IdentityLayer(torch.nn.Module): - - def __init__(self, size, scale=1.0): - super(IdentityLayer, self).__init__() - self.weight = torch.nn.Parameter(scale * torch.randn(size)) - - def forward(self): - return self.weight - - -def set_random_seed(seed): - """Set random seed for reproducability.""" - random.seed(seed) - numpy.random.seed(seed) - torch.manual_seed(seed) - mpu.model_parallel_cuda_manual_seed(seed) - - -def initialize_distributed(backend='nccl'): - """Initialize torch.distributed.""" - # Get local rank in case it is provided. - parser = argparse.ArgumentParser() - parser.add_argument( - '--local_rank', - type=int, - default=None, - help='local rank passed from distributed launcher') - args = parser.parse_args() - local_rank = args.local_rank - - # Get rank and world size. - rank = int(os.getenv('RANK', '0')) - world_size = int(os.getenv('WORLD_SIZE', '1')) - - print('> initializing torch.distributed with local rank: {}, ' - 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) - - # Set the device id. - device = rank % torch.cuda.device_count() - if local_rank is not None: - device = local_rank - torch.cuda.set_device(device) - - # Call the init process. - init_method = 'tcp://' - master_ip = os.getenv('MASTER_ADDR', 'localhost') - master_port = os.getenv('MASTER_PORT', '6000') - init_method += master_ip + ':' + master_port - torch.distributed.init_process_group( - backend=backend, - world_size=world_size, - rank=rank, - init_method=init_method) - - -def print_separator(message): - torch.distributed.barrier() - filler_len = (78 - len(message)) // 2 - filler = '-' * filler_len - string = '\n' + filler + ' {} '.format(message) + filler - if torch.distributed.get_rank() == 0: - print(string, flush=True) - torch.distributed.barrier() diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py b/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py deleted file mode 100644 index 47fd1d7e..00000000 --- a/modelscope/models/nlp/mglm/mpu/tests/test_cross_entropy.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random -import sys - -import mpu -import torch -import torch.nn.functional as F -from commons import (IdentityLayer, initialize_distributed, print_separator, - set_random_seed) -from mpu.cross_entropy import vocab_parallel_cross_entropy - -sys.path.append('../..') - - -def torch_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, - seed): - set_random_seed(seed) - identity = IdentityLayer((batch_size, seq_length, vocab_size), - scale=logits_scale).cuda() - logits = identity() - target = torch.cuda.LongTensor(size=(batch_size, - seq_length)).random_(0, vocab_size) - loss = F.cross_entropy( - logits.view(-1, - logits.size()[-1]), target.view(-1), - reduction='none').view_as(target).mean() - loss.backward() - return loss, identity.weight.grad - - -def mpu_cross_entropy(batch_size, seq_length, vocab_size, logits_scale, seed): - set_random_seed(seed) - identity = IdentityLayer((batch_size, seq_length, vocab_size), - scale=logits_scale).cuda() - logits = identity() - logits_parallel = mpu.scatter_to_model_parallel_region(logits) - target = torch.cuda.LongTensor(size=(batch_size, - seq_length)).random_(0, vocab_size) - loss = vocab_parallel_cross_entropy(logits_parallel, target).mean() - loss.backward() - return loss, identity.weight.grad - - -def test_cross_entropy(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing cross entropy with model parallel size {} ...'.format( - model_parallel_size)) - - mpu.initialize_model_parallel(model_parallel_size) - model_parallel_size = mpu.get_model_parallel_world_size() - - batch_size = 13 - seq_length = 17 - vocab_size_per_partition = 11 - logits_scale = 1000.0 - vocab_size = vocab_size_per_partition * model_parallel_size - seed = 1234 - - loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length, - vocab_size, logits_scale, - seed) - loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length, vocab_size, - logits_scale, seed) - - error = loss_torch.sub_(loss_mpu).abs().max() - print(' max error in loss on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - error = grad_torch.sub_(grad_mpu).abs().max() - print(' max error in grad on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -if __name__ == '__main__': - - initialize_distributed() - world_size = torch.distributed.get_world_size() - - model_parallel_size = 1 - while model_parallel_size <= world_size: - print_separator('test cross entropy') - test_cross_entropy(model_parallel_size) - model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_data.py b/modelscope/models/nlp/mglm/mpu/tests/test_data.py deleted file mode 100644 index 66575300..00000000 --- a/modelscope/models/nlp/mglm/mpu/tests/test_data.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import operator -import sys - -import mpu -import torch -from commons import initialize_distributed, print_separator -from mpu import data as data_utils - -sys.path.append('../..') - - -def test_boradcast_data(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print( - '> testing boradcast_data with model parallel size {} ...'.format( - model_parallel_size)) - - mpu.initialize_model_parallel(model_parallel_size) - torch.manual_seed(1234 + mpu.get_data_parallel_rank()) - model_parallel_size = mpu.get_model_parallel_world_size() - - key_size_t = { - 'key1': [7, 11], - 'key2': [8, 2, 1], - 'key3': [13], - 'key4': [5, 1, 2], - 'key5': [5, 12] - } - keys = list(key_size_t.keys()) - - data = {} - data_t = {} - for key in key_size_t: - data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) - data_t[key] = data[key].clone() - data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) - data_t['keyX'] = data['keyX'].clone() - if mpu.get_model_parallel_rank() != 0: - data = None - - data_utils._check_data_types(keys, data_t, torch.int64) - key_size, key_numel, \ - total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) - for key in keys: - assert key_size[key] == key_size_t[key] - total_numel_t = 0 - for key in keys: - target_size = functools.reduce(operator.mul, key_size_t[key], 1) - assert key_numel[key] == target_size - total_numel_t += target_size - assert total_numel == total_numel_t - - data_b = data_utils.broadcast_data(keys, data, torch.int64) - for key in keys: - tensor = data_t[key].cuda() - assert data_b[key].sub(tensor).abs().max() == 0 - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -if __name__ == '__main__': - - initialize_distributed() - world_size = torch.distributed.get_world_size() - - model_parallel_size = 1 - while model_parallel_size <= world_size: - print_separator('test test boradcast data') - test_boradcast_data(model_parallel_size) - model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py b/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py deleted file mode 100644 index df62d213..00000000 --- a/modelscope/models/nlp/mglm/mpu/tests/test_initialize.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys - -import mpu -import torch -from commons import initialize_distributed, print_separator - -sys.path.append('../..') - - -def test_initialize_model_parallel(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing initialize_model_parallel with size {} ...'.format( - model_parallel_size)) - model_parallel_size_ = min(model_parallel_size, - torch.distributed.get_world_size()) - assert not mpu.model_parallel_is_initialized() - mpu.initialize_model_parallel(model_parallel_size_) - assert mpu.model_parallel_is_initialized() - - # Checks. - def check(group, world_size, rank): - assert world_size == torch.distributed.get_world_size(group=group) - assert rank == torch.distributed.get_rank(group=group) - - # Model parallel. - world_size = model_parallel_size_ - rank = torch.distributed.get_rank() % model_parallel_size_ - assert world_size == mpu.get_model_parallel_world_size() - assert rank == mpu.get_model_parallel_rank() - check(mpu.get_model_parallel_group(), world_size, rank) - - # Data parallel. - world_size = torch.distributed.get_world_size() // model_parallel_size_ - rank = torch.distributed.get_rank() // model_parallel_size - assert world_size == mpu.get_data_parallel_world_size() - assert rank == mpu.get_data_parallel_rank() - check(mpu.get_data_parallel_group(), world_size, rank) - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -def test_get_model_parallel_src_rank(model_parallel_size_): - - if torch.distributed.get_rank() == 0: - print('> testing get_model_parallel_src_rank with size {} ...'.format( - model_parallel_size_)) - model_parallel_size = min(model_parallel_size_, - torch.distributed.get_world_size()) - assert not mpu.model_parallel_is_initialized() - mpu.initialize_model_parallel(model_parallel_size) - assert mpu.model_parallel_is_initialized() - - # Checks - src_rank = torch.distributed.get_rank() - mpu.get_model_parallel_rank() - assert mpu.get_model_parallel_src_rank() == src_rank - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -if __name__ == '__main__': - - initialize_distributed() - world_size = torch.distributed.get_world_size() - model_parallel_size = 1 - while model_parallel_size <= world_size: - print_separator('test initialize model parallel') - test_initialize_model_parallel(model_parallel_size) - print_separator('test model parallel source rank') - test_get_model_parallel_src_rank(model_parallel_size) - model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_layers.py b/modelscope/models/nlp/mglm/mpu/tests/test_layers.py deleted file mode 100644 index 2dbc987a..00000000 --- a/modelscope/models/nlp/mglm/mpu/tests/test_layers.py +++ /dev/null @@ -1,533 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import random -import sys - -import mpu -import torch -import torch.nn.init as init -from commons import initialize_distributed, print_separator, set_random_seed -from mpu import layers -from torch.nn.parameter import Parameter - -sys.path.append('../..') - - -def test_parallel_embedding(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing parallel embedding with model parallel size {} ...'. - format(model_parallel_size)) - - mpu.initialize_model_parallel(model_parallel_size) - model_parallel_size = mpu.get_model_parallel_world_size() - - batch_size = 17 - seq_length = 23 - vocab_size = 48 - hidden_size = 16 - seed = 1236 - - set_random_seed(123) - input_data = torch.LongTensor(size=(batch_size, seq_length)).random_( - 0, vocab_size).cuda() - loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda() - - set_random_seed(seed) - embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda() - - output = embedding_original(input_data) - loss_original = torch.mul(output, loss_weight).sum() - loss_original.backward() - - set_random_seed(seed) - embedding_parallel = layers.ParallelEmbedding( - vocab_size, hidden_size, init_method=init.normal_).cuda() - output = embedding_parallel(input_data) - loss_parallel = torch.mul(output, loss_weight).sum() - loss_parallel.backward() - - set_random_seed(seed) - embedding_vocab_parallel = layers.VocabParallelEmbedding( - vocab_size, hidden_size, init_method=init.normal_).cuda() - output = embedding_vocab_parallel(input_data) - loss_vocab_parallel = torch.mul(output, loss_weight).sum() - loss_vocab_parallel.backward() - - torch.distributed.barrier() - error = loss_parallel.sub(loss_original).abs() - print(' error in loss (parallel) on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-12, 'error: {}'.format(error) - - torch.distributed.barrier() - error = loss_vocab_parallel.sub(loss_original).abs() - print(' error in loss (vocab parallel) on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-12, 'error: {}'.format(error) - - weight_grad_orig = torch.split(embedding_original.weight.grad, - hidden_size // model_parallel_size, - 1)[mpu.get_model_parallel_rank()] - error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max() - print(' error in grad (parallel) on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-12, 'error: {}'.format(error) - - weight_grad_orig = torch.split(embedding_original.weight.grad, - vocab_size // model_parallel_size, - 0)[mpu.get_model_parallel_rank()] - error = embedding_vocab_parallel.weight.grad.sub( - weight_grad_orig).abs().max() - print(' error in grad (vocab parallel) on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-12, 'error: {}'.format(error) - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -def test_initialize_affine_weight(model_parallel_size): - - mpu.initialize_model_parallel(model_parallel_size) - if torch.distributed.get_rank() == 0: - print('> testing initialize_affine_weight with model parallel ' - 'size: {}'.format(model_parallel_size)) - model_parallel_size = mpu.get_model_parallel_world_size() - - seed = 12345 - input_size_coeff = 13 - input_size = input_size_coeff * model_parallel_size - output_size_coeff = 17 - output_size = output_size_coeff * model_parallel_size - - # 
--------------- - # Column parallel - # --------------- - weight = torch.empty(output_size_coeff, input_size) - set_random_seed(seed) - layers._initialize_affine_weight(weight, output_size, input_size, - output_size_coeff, 0, - torch.nn.init.normal_) - # Target. - set_random_seed(seed) - master_weight = torch.empty(output_size, input_size) - torch.nn.init.normal_(master_weight) - rank = mpu.get_model_parallel_rank() - my_weight = torch.split( - master_weight, output_size_coeff, dim=0)[rank].contiguous().clone() - - # Compare. - error = weight.sub(my_weight).abs().max() - torch.distributed.barrier() - print(' column parallel max error (should be zero) on global rank ' - '{}: {}'.format(torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - # ------------ - # Row parallel - # ------------ - weight = torch.empty(output_size, input_size_coeff) - set_random_seed(seed) - mpu.layers._initialize_affine_weight(weight, output_size, input_size, - input_size_coeff, 1, - torch.nn.init.normal_) - # Target. - set_random_seed(seed) - master_weight = torch.empty(output_size, input_size) - torch.nn.init.normal_(master_weight) - rank = mpu.get_model_parallel_rank() - my_weight = torch.split( - master_weight, input_size_coeff, dim=1)[rank].contiguous().clone() - - # Compare. - error = weight.sub(my_weight).abs().max() - torch.distributed.barrier() - print(' row parallel max error (should be zero) on global rank ' - '{}: {}'.format(torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print(' >> passed the test :-)') - - -class IdentityLayer2D(torch.nn.Module): - - def __init__(self, m, n): - super(IdentityLayer2D, self).__init__() - self.weight = Parameter(torch.Tensor(m, n)) - torch.nn.init.xavier_normal_(self.weight) - - def forward(self): - return self.weight - - -def test_column_parallel_linear(model_parallel_size): - - mpu.initialize_model_parallel(model_parallel_size) - if torch.distributed.get_rank() == 0: - print('> testing ColumnParallelLinear with model parallel ' - 'size: {}'.format(model_parallel_size)) - model_parallel_size = mpu.get_model_parallel_world_size() - - seed = 12345 - set_random_seed(seed) - input_size_coeff = 13 - input_size = input_size_coeff * model_parallel_size - output_size_coeff = 17 - output_size = output_size_coeff * model_parallel_size - batch_size = 7 - - # Network - identity_layer = IdentityLayer2D(batch_size, input_size).cuda() - linear_layer = mpu.ColumnParallelLinear( - input_size, output_size, keep_master_weight_for_test=True).cuda() - loss_weight = torch.randn([batch_size, output_size]).cuda() - # Forward - input_ = identity_layer() - output = linear_layer(input_) - loss = torch.mul(output, loss_weight).sum() - # Backward - loss.backward() - - # Values. 
- dLdY = loss_weight - X = identity_layer.weight - A = linear_layer.master_weight.cuda() - dLdA = torch.matmul(dLdY.t(), X) - dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) - dLdX = torch.matmul(dLdY, A) - - rank = mpu.get_model_parallel_rank() - my_dLdA = torch.split( - dLdA, output_size_coeff, dim=0)[rank].contiguous().clone() - error = my_dLdA.sub(linear_layer.weight.grad).abs().max() - torch.distributed.barrier() - print(' error in dLdA on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - my_dLdb = torch.split( - dLdb, output_size_coeff, dim=0)[rank].contiguous().clone() - error = my_dLdb.sub(linear_layer.bias.grad).abs().max() - torch.distributed.barrier() - print(' error in dLdb on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - error = dLdX.sub(identity_layer.weight.grad).abs().max() - torch.distributed.barrier() - print(' error in dLdX on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print(' >> passed the test :-)') - - -def test_row_parallel_linear(model_parallel_size): - - mpu.initialize_model_parallel(model_parallel_size) - if torch.distributed.get_rank() == 0: - print('> testing RowParallelLinear with model parallel ' - 'size: {}'.format(model_parallel_size)) - model_parallel_size = mpu.get_model_parallel_world_size() - - seed = 12345 - set_random_seed(seed) - input_size_coeff = 13 - input_size = input_size_coeff * model_parallel_size - output_size_coeff = 17 - output_size = output_size_coeff * model_parallel_size - batch_size = 7 - - # Network - identity_layer = IdentityLayer2D(batch_size, input_size).cuda() - linear_layer = mpu.RowParallelLinear( - input_size, output_size, keep_master_weight_for_test=True).cuda() - loss_weight = torch.randn([batch_size, output_size]).cuda() - # Forward - input_ = identity_layer() - output = linear_layer(input_) - loss = torch.mul(output, loss_weight).sum() - # Backward - loss.backward() - - # Values. 
- dLdY = loss_weight - X = identity_layer.weight - A = linear_layer.master_weight.cuda() - dLdA = torch.matmul(dLdY.t(), X) - dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1) - dLdX = torch.matmul(dLdY, A) - - rank = mpu.get_model_parallel_rank() - my_dLdA = torch.split( - dLdA, input_size_coeff, dim=1)[rank].contiguous().clone() - error = my_dLdA.sub(linear_layer.weight.grad).abs().max() - torch.distributed.barrier() - print(' error in dLdA on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - error = dLdb.sub(linear_layer.bias.grad).abs().max() - torch.distributed.barrier() - print(' error in dLdb on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - error = dLdX.sub(identity_layer.weight.grad).abs().max() - torch.distributed.barrier() - print(' error in dLdX on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print(' >> passed the test :-)') - - -class IdentityLayer3D(torch.nn.Module): - - def __init__(self, m, n, k): - super(IdentityLayer3D, self).__init__() - self.weight = Parameter(torch.Tensor(m, n, k)) - torch.nn.init.xavier_normal_(self.weight) - - def forward(self): - return self.weight - - -def parallel_self_attention(model_parallel_size, num_att_heads_per_partition, - hidden_size_per_att_head, dropout_prob, batch_size, - sequence_length): - mpu.initialize_model_parallel(model_parallel_size) - model_parallel_size = mpu.get_model_parallel_world_size() - - seed = 12345 - set_random_seed(seed) - - num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( - ) # noqa - hidden_size = hidden_size_per_att_head * num_att_heads - - # Network - identity_layer = IdentityLayer3D(batch_size, sequence_length, - hidden_size).cuda() - attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads, - dropout_prob).cuda() - loss_weight = torch.randn([batch_size, sequence_length, - hidden_size]).cuda() - attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() - # Forward - input_ = identity_layer() - output = attention_layer(input_, attention_mask) - loss = torch.mul(output, loss_weight).sum() - # Backward - loss.backward() - - rank = mpu.get_model_parallel_rank() - mpu.destroy_model_parallel() - return rank, hidden_size, model_parallel_size, loss, \ - attention_layer, identity_layer - - -def test_parallel_self_attention(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing ParallelSelfAttention with model parallel ' - 'size: {}'.format(model_parallel_size)) - - num_att_heads_per_partition = 3 - hidden_size_per_att_head = 7 - dropout_prob = 0.0 # has to be zero - batch_size = 5 - sequence_length = 13 - - rank_1, hideen_size_1, model_parallel_size_1, loss_1, \ - attention_layer_1, identity_layer_1 = parallel_self_attention( - 1, num_att_heads_per_partition, - hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) - - rank, hidden_size, model_parallel_size, loss, \ - attention_layer, identity_layer = parallel_self_attention( - model_parallel_size, num_att_heads_per_partition, - hidden_size_per_att_head, dropout_prob, batch_size, sequence_length) - assert hideen_size_1 == hidden_size - - error = loss_1.sub(loss).abs().max() - torch.distributed.barrier() - print(' loss error on global rank {}: {}'.format( - 
torch.distributed.get_rank(), error)) - assert error < 5.0e-6 - - my_lin_grad_list = torch.split( - attention_layer_1.query_key_value.weight.grad, - hidden_size // model_parallel_size, 0)[rank::model_parallel_size] - my_lin_grad = torch.cat(my_lin_grad_list, dim=0) - error = my_lin_grad.sub( - attention_layer.query_key_value.weight.grad).abs().max() - torch.distributed.barrier() - print(' weight gradient error on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 5.0e-6 - - error = identity_layer_1.weight.grad.sub( - identity_layer.weight.grad).abs().max() - torch.distributed.barrier() - print(' input gradient error on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 5.0e-6 - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print(' >> passed the test :-)') - - -def parallel_transformer(model_parallel_size, num_att_heads_per_partition, - hidden_size_per_att_head, batch_size, - sequence_length): - - mpu.initialize_model_parallel(model_parallel_size) - model_parallel_size = mpu.get_model_parallel_world_size() - - seed = 12345 - set_random_seed(seed) - - num_att_heads = num_att_heads_per_partition * torch.distributed.get_world_size( - ) - hidden_size = hidden_size_per_att_head * num_att_heads - intermediate_size = 4 * hidden_size - - # Network - identity_layer = IdentityLayer3D(batch_size, sequence_length, - hidden_size).cuda() - transformer_layer = mpu.BertParallelTransformerLayer( - hidden_size, intermediate_size, num_att_heads, 0.0, 0.0, - torch.nn.functional.relu, 1.0e-5).cuda() - - loss_weight = torch.randn([batch_size, sequence_length, - hidden_size]).cuda() - attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda() - # Forward - input_ = identity_layer() - output = transformer_layer(input_, attention_mask) - loss = torch.mul(output, loss_weight).sum() - # Backward - loss.backward() - - rank = mpu.get_model_parallel_rank() - mpu.destroy_model_parallel() - return rank, hidden_size, model_parallel_size, loss, \ - transformer_layer, identity_layer - - -def test_parallel_transformer_layer(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing ParallelTransformerLayer with model parallel ' - 'size: {}'.format(model_parallel_size)) - - num_att_heads_per_partition = 3 - hidden_size_per_att_head = 7 - batch_size = 5 - sequence_length = 13 - - rank_1, hidden_size_1, model_parallel_size_1, loss_1, \ - transformer_layer_1, identity_layer_1 = parallel_transformer( - 1, num_att_heads_per_partition, - hidden_size_per_att_head, batch_size, sequence_length) - - rank, hidden_size, model_parallel_size, loss, \ - transformer_layer, identity_layer = parallel_transformer( - model_parallel_size, num_att_heads_per_partition, - hidden_size_per_att_head, batch_size, sequence_length) - - error = loss_1.sub(loss).abs().max() - torch.distributed.barrier() - print(' loss error on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 5.0e-5, 'error: {}'.format(error) - - error = identity_layer_1.weight.grad.sub( - identity_layer.weight.grad).abs().max() - torch.distributed.barrier() - print(' input gradient error on global rank {}: {}'.format( - torch.distributed.get_rank(), error)) - assert error < 5.0e-5, 'error: {}'.format(error) - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print(' >> passed the test :-)') - - -if __name__ == '__main__': - - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = 
False - - initialize_distributed() - world_size = torch.distributed.get_world_size() - - print_separator('test initialize affine weight') - model_parallel_size = 1 - while model_parallel_size <= world_size: - test_initialize_affine_weight(model_parallel_size) - model_parallel_size *= 2 - - model_parallel_size = 1 - while model_parallel_size <= world_size: - print_separator('test parallel embedding') - test_parallel_embedding(model_parallel_size) - model_parallel_size *= 2 - - print_separator('test column-parallel linear') - model_parallel_size = 1 - while model_parallel_size <= world_size: - test_column_parallel_linear(model_parallel_size) - model_parallel_size *= 2 - - print_separator('test row-parallel linear') - model_parallel_size = 1 - while model_parallel_size <= world_size: - test_row_parallel_linear(model_parallel_size) - model_parallel_size *= 2 - - print_separator('test parallel self-attention') - model_parallel_size = 1 - while model_parallel_size <= world_size: - test_parallel_self_attention(model_parallel_size) - model_parallel_size *= 2 - - print_separator('test parallel transformer') - model_parallel_size = 1 - while model_parallel_size <= world_size: - test_parallel_transformer_layer(model_parallel_size) - model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/tests/test_random.py b/modelscope/models/nlp/mglm/mpu/tests/test_random.py deleted file mode 100644 index 55cc2351..00000000 --- a/modelscope/models/nlp/mglm/mpu/tests/test_random.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys - -import mpu -import torch -from commons import initialize_distributed, print_separator - -sys.path.append('../..') - - -def test_set_cuda_rng_state(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing set_rng_state with size {} ...'.format( - model_parallel_size)) - - mpu.initialize_model_parallel(model_parallel_size) - model_parallel_size = mpu.get_model_parallel_world_size() - - size = 123 - seed = 1234 - torch.cuda.manual_seed(seed) - tensor = torch.cuda.FloatTensor(size) - - # Get the state - rng_state = torch.cuda.get_rng_state() - rng_state_copy = rng_state.clone() - - # Do some stuff. - for _ in range(5): - torch.randn(size, out=tensor) - result_1 = tensor.clone() - - assert rng_state.sub(rng_state_copy).max() == 0 - assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0 - - # State should be different. - new_rng_state = torch.cuda.get_rng_state() - max_diff = new_rng_state.sub(rng_state).max() - print( - ' max diff in rng state (should be non-zero) on global rank {}: {}'. - format(torch.distributed.get_rank(), max_diff)) - assert max_diff > 0 - - # Reset the rng state and do the same stuff. 
- mpu.random._set_cuda_rng_state(rng_state) - for _ in range(5): - torch.randn(size, out=tensor) - mpu.random._set_cuda_rng_state(rng_state) - for _ in range(5): - torch.randn(size, out=tensor) - result_2 = tensor.clone() - - # Results should be the same - error = result_2.sub(result_1).abs().max() - print(' max error in generated tensors (should be zero) on ' - 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - # Input state should have remained intact. - error = rng_state.sub(rng_state_copy).max() - print(' max error in rng state (should be zero) on global rank {}: {}'. - format(torch.distributed.get_rank(), error)) - assert error == 0 - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -def test_cuda_rng_tracker(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing cuda rng tracker with size {} ...'.format( - model_parallel_size)) - - mpu.initialize_model_parallel(model_parallel_size) - model_parallel_size = mpu.get_model_parallel_world_size() - - seed_1 = 1234 - seed_2 = 4321 - size = [12, 21] - tensor = torch.cuda.FloatTensor(size) - - # Set to seed_1 and generate two tensors. - torch.cuda.manual_seed(seed_1) - torch.randn(size, out=tensor) - target_11 = tensor.clone() - torch.randn(size, out=tensor) - target_12 = tensor.clone() - - # Set to seed_2 and generate two tensors. - torch.cuda.manual_seed(seed_2) - torch.randn(size, out=tensor) - target_21 = tensor.clone() - torch.randn(size, out=tensor) - target_22 = tensor.clone() - - # Now if we interleave seed_1 and seed_2, - # we should still get the same tensors - torch.cuda.manual_seed(seed_1) - mpu.get_cuda_rng_tracker().add('test', seed_2) - - torch.randn(size, out=tensor) - result_11 = tensor.clone() - - with mpu.get_cuda_rng_tracker().fork('test'): - torch.randn(size, out=tensor) - result_21 = tensor.clone() - - torch.randn(size, out=tensor) - result_12 = tensor.clone() - - with mpu.get_cuda_rng_tracker().fork('test'): - torch.randn(size, out=tensor) - result_22 = tensor.clone() - - diff = result_11.sub(result_21).abs().max() - diff = min(diff, result_12.sub(result_22).abs().max()) - print(' max diff in generated tensors (should be non-zero) on ' - 'global rank {}: {}'.format(torch.distributed.get_rank(), diff)) - assert diff > 1.0e-6 - error = max( - result_11.sub(target_11).abs().max(), - result_12.sub(target_12).abs().max()) - error = max(error, result_21.sub(target_21).abs().max()) - error = max(error, result_22.sub(target_22).abs().max()) - print(' max error in generated tensors (should be zero) on ' - 'global rank {}: {}'.format(torch.distributed.get_rank(), error)) - assert error < 1.0e-6 - - # Reset the tracker - mpu.get_cuda_rng_tracker().reset() - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -def test_model_parallel_cuda_manual_seed(model_parallel_size): - - if torch.distributed.get_rank() == 0: - print('> testing model parallel cuda manual seed with size {} ...'. 
- format(model_parallel_size)) - - mpu.initialize_model_parallel(model_parallel_size) - model_parallel_size = mpu.get_model_parallel_world_size() - - mpu.model_parallel_cuda_manual_seed(12345) - assert torch.cuda.initial_seed() == 12345 - with mpu.get_cuda_rng_tracker().fork(): - assert torch.cuda.initial_seed() == (12345 + 2718 - + mpu.get_model_parallel_rank()) - - # Reset the tracker - mpu.get_cuda_rng_tracker().reset() - - # Reset groups - mpu.destroy_model_parallel() - - torch.distributed.barrier() - if torch.distributed.get_rank() == 0: - print('>> passed the test :-)') - - -if __name__ == '__main__': - - initialize_distributed() - world_size = torch.distributed.get_world_size() - - model_parallel_size = 1 - while model_parallel_size <= world_size: - print_separator('test set rng state') - test_set_cuda_rng_state(model_parallel_size) - model_parallel_size *= 2 - - model_parallel_size = 1 - while model_parallel_size <= world_size: - print_separator('test cuda rng tracker') - test_cuda_rng_tracker(model_parallel_size) - model_parallel_size *= 2 - - model_parallel_size = 1 - while model_parallel_size <= world_size: - print_separator('test model parallel cuda manual seed') - test_model_parallel_cuda_manual_seed(model_parallel_size) - model_parallel_size *= 2 diff --git a/modelscope/models/nlp/mglm/mpu/utils.py b/modelscope/models/nlp/mglm/mpu/utils.py deleted file mode 100644 index 76c37a2b..00000000 --- a/modelscope/models/nlp/mglm/mpu/utils.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -def ensure_divisibility(numerator, denominator): - """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, '{} is not divisible by {}'.format( - numerator, denominator) - - -def divide(numerator, denominator): - """Ensure that numerator is divisible by the denominator and return - the division value.""" - ensure_divisibility(numerator, denominator) - return numerator // denominator - - -def split_tensor_along_last_dim(tensor, - num_partitions, - contiguous_split_chunks=False): - """Split a tensor along its last dimension. - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = divide(tensor.size()[last_dim], num_partitions) - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. 
- if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class VocabUtility: - """Split the vocabulary into `world_size` chunks amd return the - first and last index of the vocabulary belonging to the `rank` - partition: Note that indecies in [fist, last)""" - - @staticmethod - def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, - rank, world_size): - index_f = rank * per_partition_vocab_size - index_l = index_f + per_partition_vocab_size - return index_f, index_l - - @staticmethod - def vocab_range_from_global_vocab_size(global_vocab_size, rank, - world_size): - per_partition_vocab_size = divide(global_vocab_size, world_size) - return VocabUtility.vocab_range_from_per_partition_vocab_size( - per_partition_vocab_size, rank, world_size) diff --git a/modelscope/models/nlp/mglm/tasks/data_utils.py b/modelscope/models/nlp/mglm/tasks/data_utils.py index 179d304e..8792d080 100644 --- a/modelscope/models/nlp/mglm/tasks/data_utils.py +++ b/modelscope/models/nlp/mglm/tasks/data_utils.py @@ -21,10 +21,9 @@ import json import numpy as np import torch import torch.utils.data +from megatron_util import mpu from torch.utils.data.dataloader import default_collate -from modelscope.models.nlp.mglm import mpu - def clean_text(text): """Remove new lines and multiple spaces and adjust end of sentence dot.""" diff --git a/modelscope/models/nlp/mglm/tasks/eval_utils.py b/modelscope/models/nlp/mglm/tasks/eval_utils.py index da23a884..306bf33a 100644 --- a/modelscope/models/nlp/mglm/tasks/eval_utils.py +++ b/modelscope/models/nlp/mglm/tasks/eval_utils.py @@ -20,9 +20,9 @@ import time from collections import OrderedDict from typing import List -import mpu import torch from finetune_glm import process_batch +from megatron_util import mpu from sklearn.metrics import f1_score from tasks.data_utils import InputExample, build_data_loader from utils import debug_finetune_data, get_spare_port, print_rank_0 diff --git a/modelscope/models/nlp/mglm/tasks/language_model/finetune.py b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py index b6089e6f..35c590b6 100644 --- a/modelscope/models/nlp/mglm/tasks/language_model/finetune.py +++ b/modelscope/models/nlp/mglm/tasks/language_model/finetune.py @@ -16,15 +16,14 @@ import functools import math -import mpu import torch from finetune_glm import finetune +from megatron_util import mpu, print_rank_0 from pretrain_glm import get_batch from tasks.data_utils import build_data_loader from tasks.language_model.dataset import (build_lambada_dataset, build_lm_dataset, build_wikitext103_dataset) -from utils import print_rank_0 global_tokenizer = None diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py index 5fd28b89..3669051d 100644 --- a/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py +++ b/modelscope/models/nlp/mglm/tasks/seq2seq/evaluate.py @@ -4,14 +4,13 @@ import datetime import random import string -import mpu import torch import torch.nn.functional as F from generation_utils import (BeamSearchScorer, LogitsProcessorList, MinLengthLogitsProcessor, NoRepeatNGramLogitsProcessor) +from megatron_util import mpu, print_rank_0 from rouge_score import rouge_scorer -from utils import print_rank_0 def _is_digit(w): diff --git a/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py index 4c0c28e7..20ce3ad0 100644 --- a/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py +++ 
b/modelscope/models/nlp/mglm/tasks/seq2seq/finetune.py @@ -15,9 +15,9 @@ import functools from collections import OrderedDict -import mpu import torch from finetune_glm import finetune +from megatron_util import mpu from pretrain_glm import get_batch from tasks.eval_utils import accuracy_func_provider from tasks.seq2seq.dataset import (BlankLMDataset, ExtractionDataset, diff --git a/modelscope/models/nlp/mglm/train_utils.py b/modelscope/models/nlp/mglm/train_utils.py index c9c0de8e..4f8bf8b1 100644 --- a/modelscope/models/nlp/mglm/train_utils.py +++ b/modelscope/models/nlp/mglm/train_utils.py @@ -3,10 +3,10 @@ import deepspeed import torch from apex.optimizers import FusedAdam as Adam +from megatron_util import mpu +from megatron_util.fp16 import DynamicLossScaler, FP16_Module, FP16_Optimizer from torch import distributed as dist -from . import mpu -from .fp16 import DynamicLossScaler, FP16_Module, FP16_Optimizer from .model import DistributedDataParallel as LocalDDP from .model import (GLMForMultiTokenCloze, GLMForMultiTokenClozeFast, GLMForSequenceClassification, GLMForSingleTokenCloze, diff --git a/modelscope/models/nlp/mglm/utils.py b/modelscope/models/nlp/mglm/utils.py index 0e781189..2ecec588 100644 --- a/modelscope/models/nlp/mglm/utils.py +++ b/modelscope/models/nlp/mglm/utils.py @@ -21,9 +21,8 @@ import time import json import numpy as np import torch - -from . import mpu -from .fp16 import FP16_Optimizer +from megatron_util import mpu, print_rank_0 +from megatron_util.fp16 import FP16_Optimizer SUMMARY_WRITER_DIR_NAME = 'runs' @@ -32,14 +31,6 @@ def get_log_dir(name, base): return os.path.join(base, SUMMARY_WRITER_DIR_NAME, name) -def print_rank_0(message): - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - print(message, flush=True) - else: - print(message, flush=True) - - def get_hostname(): hostname_cmd = ['hostname -I'] result = subprocess.check_output(hostname_cmd, shell=True) diff --git a/modelscope/models/nlp/plug/backbone.py b/modelscope/models/nlp/plug/backbone.py index 8daeda6a..6f7d594b 100644 --- a/modelscope/models/nlp/plug/backbone.py +++ b/modelscope/models/nlp/plug/backbone.py @@ -18,12 +18,10 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) import logging import math -import os import torch import torch.nn.functional as F -from deepspeed.utils.timer import SynchronizedWallClockTimer -from megatron import mpu +from megatron_util import mpu from torch import nn from modelscope.utils.nlp.distributed import (normal_init_method, @@ -468,7 +466,6 @@ class BertLMPredictionHead(nn.Module): self.type_converter = convert_to_type self.converted = False - self.timers = SynchronizedWallClockTimer() def forward(self, hidden_states): if not self.converted: @@ -478,9 +475,7 @@ class BertLMPredictionHead(nn.Module): if self.fp32_layernorm: self.transform.LayerNorm.float() hidden_states = self.transform(self.type_converter(hidden_states)) - self.timers('final linear gather').start() hidden_states = mpu.copy_to_model_parallel_region(hidden_states) - self.timers('final linear gather').stop() hidden_states = F.linear( self.type_converter(hidden_states), self.type_converter(self.decoder_weight), @@ -1009,118 +1004,6 @@ class PlugModel(torch.nn.Module): sequence_output=sequence_output, parallel_output=parallel_output) - @staticmethod - def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): - # This function has been mostly taken from huggingface conversational ai code at - # 
https://medium.com/huggingface/how-to-build-a-state-of-the-art- - # conversational-ai-with-transfer-learning-2d818ac26313 - - if top_k > 0: - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, - None] - logits[indices_to_remove] = filter_value - - if top_p > 0.0: - # convert to 1D - logits = logits.view(logits.size()[1]).contiguous() - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum( - F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold - sorted_indices_to_remove = cumulative_probs > top_p - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ - ..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - indices_to_remove = sorted_indices[sorted_indices_to_remove] - logits[indices_to_remove] = filter_value - # going back to 2D - logits = logits.view(1, -1).contiguous() - return logits - - def generate(self, input, out_length=128, model_cfg=None, *kwargs): - device = torch.cuda.current_device() - batch_size = input['input_ids'].shape[0] - tokens = input['input_ids'].view(1, -1).contiguous().to(device) - dec_input_ids = input['dec_input_ids'].to(device) - attention_mask = input['attention_mask'].to(device) - self.model.eval() - with torch.no_grad(): - # Only supports batch_size=1 - all_generate_tokens = [] - generate_tokens = [] - counter = 0 - sequence_output = None - vocab_size = self.config.original_vocab_size - sep_token_idx = 102 # index of [SEP] token in BertTokenizer - while counter < out_length: - if counter % 128 == 0 and counter != 0: - # Sliding window - generate_tokens.append(sep_token_idx) - start = (tokens == sep_token_idx).nonzero( - as_tuple=True)[-1] - if start + len(generate_tokens) >= 512: - tokens = torch.cat([ - tokens[:start], - torch.cuda.LongTensor(generate_tokens) - ], -1)[-512:] - else: - tokens[0][start:start + len(generate_tokens - )] = torch.cuda.LongTensor( - generate_tokens) - - attention_mask = (tokens != 0) - dec_input_ids = input['dec_input_ids'].to(device) - generate_tokens = [] - sequence_output = None - - position_ids = torch.full([batch_size, 1], - len(generate_tokens), - dtype=torch.long, - device=device) - _, logits, sequence_output = self.model( - tokens, - None, - attention_mask, - dec_input_ids, - attention_mask, - position_ids, - is_infer=True, - sequence_output=sequence_output, - parallel_output=False) - logits = logits[:, -1, :] - logits = logits / model_cfg['temperature'] - logits = self.top_k_logits( - logits, top_k=model_cfg['top_k'], top_p=model_cfg['top_p']) - log_probs = F.softmax(logits, dim=-1) - prev = torch.argmax(log_probs, 1).unsqueeze(1) - # prev = torch.multinomial(log_probs, num_samples=1) - prev_token = prev[0].item() - if prev_token >= vocab_size: - prev_token = 100 - prev[0] = 100 - if prev_token == 102 and len(all_generate_tokens) > int( - max(1, out_length) * 0.8): - break - if prev_token == 102: - counter += 1 - continue - dec_input_ids = torch.cat([dec_input_ids, prev], dim=1) - generate_tokens.append(prev_token) - all_generate_tokens.append(prev_token) - counter += 1 - - generate_context = [] - for token in all_generate_tokens: - if generate_context and generate_context[ - -1] == 100 and token == 100: - continue - else: - generate_context.append(token) - return {'generate_context': generate_context} - def state_dict(self, 
destination=None, prefix='', keep_vars=False): return self.model.state_dict( destination=destination, prefix=prefix, keep_vars=keep_vars) diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py index 679bfc1b..6d1af8f4 100644 --- a/modelscope/models/nlp/plug/distributed_plug.py +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -1,19 +1,16 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os from typing import Dict import torch -import torch.nn.functional as F -from megatron import mpu -from megatron.fp16 import FP16_Module -from megatron.utils import print_rank_0 +from megatron_util import mpu, print_rank_0 +from megatron_util.fp16 import FP16_Module +from torch.nn import functional as F from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.utils.logger import get_logger -from modelscope.utils.nlp.distributed import initialize_distributed +from modelscope.utils.megatron_utils import init_megatron_util from modelscope.utils.nlp.load_checkpoint import pre_load -from modelscope.utils.torch_utils import set_random_seed_mpu from . import PlugModel from .configuration import PlugNLGConfig @@ -69,11 +66,9 @@ class DistributedPlug(TorchModel): self.rank = rank self.model_cfg = kwargs self.config = PlugNLGConfig.from_pretrained(model_dir) - initialize_distributed(rank, mpu, kwargs['world_size'], - kwargs['model_parallel_size'], - kwargs['master_ip'], kwargs['master_port']) - seed = 42 if 'seed' not in kwargs else kwargs['seed'] - set_random_seed_mpu(seed) + + init_megatron_util(model_dir=model_dir, rank=rank) + self.iteration = 0 self.model = self.initialize_model(path_load_tag='model') @@ -85,7 +80,7 @@ class DistributedPlug(TorchModel): if mpu.get_data_parallel_rank() == 0: logger.info( ' > number of parameters on model parallel rank {}: {}'.format( - mpu.get_model_parallel_rank(), + mpu.get_tensor_model_parallel_rank(), sum([p.nelement() for p in model.parameters()]))) if self.config.deepspeed and self.config.fp16: @@ -111,7 +106,9 @@ class DistributedPlug(TorchModel): _module.float() load_model = pre_load( - mpu.get_model_parallel_rank(), self.model_dir, tag=path_load_tag) + mpu.get_tensor_model_parallel_rank(), + self.model_dir, + tag=path_load_tag) model_dict = model.module.model.state_dict() for key in load_model: if key not in model_dict.keys(): @@ -121,6 +118,37 @@ class DistributedPlug(TorchModel): model.module.model.load_state_dict(load_model, strict=False) return model + @staticmethod + def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + # This function has been mostly taken from huggingface conversational ai code at + # https://medium.com/huggingface/how-to-build-a-state-of-the-art- + # conversational-ai-with-transfer-learning-2d818ac26313 + + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # convert to 1D + logits = logits.view(logits.size()[1]).contiguous() + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = 
sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + indices_to_remove = sorted_indices[sorted_indices_to_remove] + logits[indices_to_remove] = filter_value + # going back to 2D + logits = logits.view(1, -1).contiguous() + return logits + def forward(self, input_tokens, token_type_ids=None, @@ -145,4 +173,83 @@ class DistributedPlug(TorchModel): parallel_output=parallel_output) def generate(self, input: Dict[str, Tensor], out_length=128, *kwargs): - return self.model.generate(input, out_length, self.model_cfg, *kwargs) + device = torch.cuda.current_device() + batch_size = input['input_ids'].shape[0] + tokens = input['input_ids'].view(1, -1).contiguous().to(device) + dec_input_ids = input['dec_input_ids'].to(device) + attention_mask = input['attention_mask'].to(device) + self.model.eval() + with torch.no_grad(): + # Only supports batch_size=1 + all_generate_tokens = [] + generate_tokens = [] + counter = 0 + sequence_output = None + vocab_size = self.config.original_vocab_size + sep_token_idx = 102 # index of [SEP] token in BertTokenizer + while counter < out_length: + if counter % 128 == 0 and counter != 0: + # Sliding window + generate_tokens.append(sep_token_idx) + start = (tokens == sep_token_idx).nonzero( + as_tuple=True)[-1] + if start + len(generate_tokens) >= 512: + tokens = torch.cat([ + tokens[:start], + torch.cuda.LongTensor(generate_tokens) + ], -1)[-512:] + else: + tokens[0][start:start + len(generate_tokens + )] = torch.cuda.LongTensor( + generate_tokens) + + attention_mask = (tokens != 0) + dec_input_ids = input['dec_input_ids'].to(device) + generate_tokens = [] + sequence_output = None + + position_ids = torch.full([batch_size, 1], + len(generate_tokens), + dtype=torch.long, + device=device) + _, logits, sequence_output = self.model( + tokens, + None, + attention_mask, + dec_input_ids, + attention_mask, + position_ids, + is_infer=True, + sequence_output=sequence_output, + parallel_output=False) + logits = logits[:, -1, :] + logits = logits / self.model_cfg['temperature'] + logits = self.top_k_logits( + logits, + top_k=self.model_cfg['top_k'], + top_p=self.model_cfg['top_p']) + log_probs = F.softmax(logits, dim=-1) + prev = torch.multinomial(log_probs, num_samples=1) + prev_token = prev[0].item() + if prev_token >= vocab_size: + prev_token = 100 + prev[0] = 100 + if prev_token == 102 and len(all_generate_tokens) > int( + max(1, out_length) * 0.8): + break + if prev_token == 102: + counter += 1 + continue + dec_input_ids = torch.cat([dec_input_ids, prev], dim=1) + generate_tokens.append(prev_token) + all_generate_tokens.append(prev_token) + counter += 1 + + generate_context = [] + for token in all_generate_tokens: + if generate_context and generate_context[ + -1] == 100 and token == 100: + continue + else: + generate_context.append(token) + return {'generate_context': generate_context} diff --git a/modelscope/models/nlp/space_T_cn/table_question_answering.py b/modelscope/models/nlp/space_T_cn/table_question_answering.py index 3d16f649..82345da6 100644 --- a/modelscope/models/nlp/space_T_cn/table_question_answering.py +++ b/modelscope/models/nlp/space_T_cn/table_question_answering.py @@ -52,21 +52,22 @@ class TableQuestionAnswering(Model): self.max_where_num = constant.max_where_num self.col_type_dict = constant.col_type_dict self.schema_link_dict = constant.schema_link_dict - n_cond_ops = len(self.cond_ops) - n_agg_ops = len(self.agg_ops) - n_action_ops = len(self.action_ops) + self.n_cond_ops = len(self.cond_ops) + self.n_agg_ops = 
len(self.agg_ops) + self.n_action_ops = len(self.action_ops) iS = self.backbone_config.hidden_size self.head_model = Seq2SQL( iS, 100, 2, 0.0, - n_cond_ops, - n_agg_ops, - n_action_ops, + self.n_cond_ops, + self.n_agg_ops, + self.n_action_ops, self.max_select_num, self.max_where_num, device=self._device_name) + self.device = self._device_name self.head_model.load_state_dict(state_dict['head_model'], strict=False) def to(self, device): diff --git a/modelscope/models/nlp/structbert/faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py index a37b8b2d..c5cd3061 100644 --- a/modelscope/models/nlp/structbert/faq_question_answering.py +++ b/modelscope/models/nlp/structbert/faq_question_answering.py @@ -192,7 +192,7 @@ class SbertForFaqQuestionAnswering(BaseTaskModel): self.metrics_layer = MetricsLayer(args) self.pooling = PoolingLayer(args) - def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: + def forward(self, input: Dict[str, Tensor]) -> FaqQuestionAnsweringOutput: """ Args: input (Dict[str, Tensor]): the preprocessed data, it contains the following keys: @@ -238,17 +238,13 @@ class SbertForFaqQuestionAnswering(BaseTaskModel): >>> } >>> result = model(preprocessor(param)) """ - assert not self.training query = input['query'] support = input['support'] - if isinstance(query, list): - query = torch.stack(query) - if isinstance(support, list): - support = torch.stack(support) + query_mask = input['query_attention_mask'] + support_mask = input['support_attention_mask'] + n_query = query.shape[0] n_support = support.shape[0] - query_mask = torch.ne(query, 0).view([n_query, -1]) - support_mask = torch.ne(support, 0).view([n_support, -1]) support_labels = input['support_labels'] num_cls = torch.max(support_labels) + 1 @@ -268,10 +264,26 @@ class SbertForFaqQuestionAnswering(BaseTaskModel): cls_n_support = torch.sum(onehot_labels, dim=-2) + 1e-5 protos = torch.matmul(onehot_labels.transpose(0, 1), z_support) / cls_n_support.unsqueeze(-1) - scores = self.metrics_layer(z_query, protos).view([n_query, num_cls]) + logits = self.metrics_layer(z_query, protos).view([n_query, num_cls]) if self.metrics_layer.name == 'relation': - scores = torch.sigmoid(scores) - return FaqQuestionAnsweringOutput(scores=scores) + scores = torch.sigmoid(logits) + else: + scores = logits + if 'labels' in input: + query_labels = input['labels'] + loss = self._compute_loss(logits, query_labels, num_cls) + _, pred_labels = torch.max(scores, dim=1) + return FaqQuestionAnsweringOutput( + loss=loss, logits=scores).to_dict() + else: + return FaqQuestionAnsweringOutput(scores=scores) + + def _compute_loss(self, logits, target, num_cls): + from torch.nn import CrossEntropyLoss + logits = logits.view([-1, num_cls]) + target = target.reshape(-1) + loss = CrossEntropyLoss(reduction='mean')(logits, target) + return loss def _get_onehot_labels(self, labels, support_size, num_cls): labels_ = labels.view(support_size, 1) diff --git a/modelscope/models/nlp/structbert/token_classification.py b/modelscope/models/nlp/structbert/token_classification.py index ab46fc83..8bfd46bc 100644 --- a/modelscope/models/nlp/structbert/token_classification.py +++ b/modelscope/models/nlp/structbert/token_classification.py @@ -220,16 +220,6 @@ class SbertForTokenClassification(SbertPreTrainedModel): with_attention_mask=attention_mask is not None, **outputs.kwargs) - if label_mask is not None: - mask = label_mask - masked_lengths = mask.sum(-1).long() - masked_logits = torch.zeros_like(logits) - for i in 
range(len(mask)): - masked_logits[ - i, :masked_lengths[i], :] = logits[i].masked_select( - mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) - logits = masked_logits - return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, diff --git a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py index ca2613d4..a7e27d9d 100644 --- a/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py +++ b/modelscope/models/nlp/task_models/nncrf_for_named_entity_recognition.py @@ -141,6 +141,18 @@ class SequenceLabelingForNamedEntityRecognition(TorchModel): predicts = self.model.decode(input) offset_mapping = input.get('offset_mapping') mask = input.get('label_mask') + + # revert predicts to original position with respect of label mask + masked_predict = torch.zeros_like(predicts) + for i in range(len(mask)): + masked_lengths = mask[i].sum(-1).long().cpu().item() + selected_predicts = torch.narrow( + predicts[i], 0, 0, + masked_lengths) # index_select only move loc, not resize + mask_position = mask[i].byte() + masked_predict[i][mask_position] = selected_predicts + predicts = masked_predict + return AttentionTokenClassificationModelOutput( loss=None, logits=None, diff --git a/modelscope/models/nlp/task_models/token_classification.py b/modelscope/models/nlp/task_models/token_classification.py index 0e216496..40543dc8 100644 --- a/modelscope/models/nlp/task_models/token_classification.py +++ b/modelscope/models/nlp/task_models/token_classification.py @@ -63,16 +63,6 @@ class TokenClassificationModel(SingleBackboneTaskModelBase): if labels in input: loss = self.compute_loss(outputs, labels) - if 'label_mask' in input: - mask = input['label_mask'] - masked_lengths = mask.sum(-1).long() - masked_logits = torch.zeros_like(logits) - for i in range(len(mask)): - masked_logits[ - i, :masked_lengths[i], :] = logits[i].masked_select( - mask[i].unsqueeze(-1)).view(masked_lengths[i], -1) - logits = masked_logits - return AttentionTokenClassificationModelOutput( loss=loss, logits=logits, diff --git a/modelscope/models/nlp/unite/configuration_unite.py b/modelscope/models/nlp/unite/configuration_unite.py index 81abd2db..b0a48585 100644 --- a/modelscope/models/nlp/unite/configuration_unite.py +++ b/modelscope/models/nlp/unite/configuration_unite.py @@ -6,7 +6,7 @@ from enum import Enum from modelscope.utils import logger as logging from modelscope.utils.config import Config -logger = logging.get_logger(__name__) +logger = logging.get_logger() class EvaluationMode(Enum): diff --git a/modelscope/models/nlp/use/__init__.py b/modelscope/models/nlp/use/__init__.py new file mode 100644 index 00000000..9e4ea599 --- /dev/null +++ b/modelscope/models/nlp/use/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
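A toy sketch of the mask-revert logic added to SequenceLabelingForNamedEntityRecognition above: predictions made over the compacted, label-masked positions are scattered back to their original token positions. The prediction and mask values below are made up for illustration.

    import torch

    # Left-aligned predictions over the kept positions, and the label mask that
    # records where those positions sit in the original sequence (values are toy).
    predicts = torch.tensor([[7, 8, 9, 0, 0]])
    mask = torch.tensor([[0, 1, 1, 0, 1]])

    masked_predict = torch.zeros_like(predicts)
    for i in range(len(mask)):
        masked_length = mask[i].sum(-1).long().item()
        # Take only the first `masked_length` predictions (they are left-aligned).
        selected = torch.narrow(predicts[i], 0, 0, masked_length)
        # Scatter them back onto the positions where the mask is set.
        masked_predict[i][mask[i].bool()] = selected

    assert masked_predict.tolist() == [[0, 7, 8, 0, 9]]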
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .user_satisfaction_estimation import UserSatisfactionEstimation +else: + _import_structure = { + 'user_satisfaction_estimation': ['UserSatisfactionEstimation'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/use/transformer.py b/modelscope/models/nlp/use/transformer.py new file mode 100644 index 00000000..d3918903 --- /dev/null +++ b/modelscope/models/nlp/use/transformer.py @@ -0,0 +1,148 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import math + +import torch +import torch.nn as nn + + +def gelu(x): + return 0.5 * x * (1 + torch.tanh( + math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +class PositionwiseFeedForward(nn.Module): + + def __init__(self, d_model, d_ff, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.actv = gelu + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x): + inter = self.dropout_1(self.actv(self.w_1(self.layer_norm(x)))) + output = self.dropout_2(self.w_2(inter)) + return output + x + + +class MultiHeadedAttention(nn.Module): + + def __init__(self, head_count, model_dim, dropout=0.1): + assert model_dim % head_count == 0 + self.dim_per_head = model_dim // head_count + self.model_dim = model_dim + + super(MultiHeadedAttention, self).__init__() + self.head_count = head_count + + self.linear_k = nn.Linear(model_dim, head_count * self.dim_per_head) + self.linear_v = nn.Linear(model_dim, head_count * self.dim_per_head) + self.linear_q = nn.Linear(model_dim, head_count * self.dim_per_head) + self.softmax = nn.Softmax(dim=-1) + self.dropout = nn.Dropout(dropout) + self.linear = nn.Linear(model_dim, model_dim) + + def forward(self, key, value, query, mask=None): + batch_size = key.size(0) + dim_per_head = self.dim_per_head + head_count = self.head_count + + def shape(x): + """ projection """ + return x.view(batch_size, -1, head_count, dim_per_head) \ + .transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous() \ + .view(batch_size, -1, head_count * dim_per_head) + + key = self.linear_k(key).view(batch_size, -1, head_count, + dim_per_head).transpose(1, 2) + value = self.linear_v(value).view(batch_size, -1, head_count, + dim_per_head).transpose(1, 2) + query = self.linear_q(query).view(batch_size, -1, head_count, + dim_per_head).transpose(1, 2) + + query = query / math.sqrt(dim_per_head) + scores = torch.matmul(query, key.transpose(2, 3)) + + if mask is not None: + mask = mask.unsqueeze(1).expand_as(scores) + scores = scores.masked_fill(mask, -1e10) + + attn = self.softmax(scores) + + drop_attn = self.dropout(attn) + context = torch.matmul(drop_attn, + value).transpose(1, 2).contiguous().view( + batch_size, -1, head_count * dim_per_head) + output = self.linear(context) + return output + + +class PositionalEncoding(nn.Module): + + def __init__(self, dim, max_len=512): + super(PositionalEncoding, self).__init__() + pe = torch.zeros(max_len, dim) + position = torch.arange(0, max_len).unsqueeze(1) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) + * -(math.log(10000.0) / dim))) + pe[:, 0::2] = torch.sin(position.float() * 
div_term) + pe[:, 1::2] = torch.cos(position.float() * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, x): + L = x.size(1) + pos_emb = self.pe[:, :L] + x = x + pos_emb + return x + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, heads, d_ff, dropout): + super(TransformerEncoderLayer, self).__init__() + + self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout) + self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) + self.dropout = nn.Dropout(dropout) + + def forward(self, iter, query, inputs, mask): + if iter != 0: + input_norm = self.layer_norm(inputs) + else: + input_norm = inputs + + mask = mask.unsqueeze(1) + context = self.self_attn(input_norm, input_norm, input_norm, mask=mask) + out = self.dropout(context) + inputs + return self.feed_forward(out) + + +class TransformerEncoder(nn.Module): + + def __init__(self, d_model, d_ff, heads, layers, dropout=0.1): + super(TransformerEncoder, self).__init__() + + self.d_model = d_model + self.layers = layers + self.pos_emb = PositionalEncoding(d_model) + self.transformer_inter = nn.ModuleList([ + TransformerEncoderLayer(d_model, heads, d_ff, dropout) + for _ in range(layers) + ]) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, mask): + x = self.pos_emb(x) + x = self.dropout(x) + for i in range(self.layers): + x = self.transformer_inter[i](i, x, x, mask.eq(0)) + return x diff --git a/modelscope/models/nlp/use/user_satisfaction_estimation.py b/modelscope/models/nlp/use/user_satisfaction_estimation.py new file mode 100644 index 00000000..9fe47b74 --- /dev/null +++ b/modelscope/models/nlp/use/user_satisfaction_estimation.py @@ -0,0 +1,168 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import xavier_uniform_ +from transformers import BertModel + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.outputs import DialogueUserSatisfactionEstimationModelOutput +from modelscope.utils.constant import ModelFile, Tasks +from .transformer import TransformerEncoder + +__all__ = ['UserSatisfactionEstimation'] + + +@MODELS.register_module(Tasks.text_classification, module_name=Models.use) +class UserSatisfactionEstimation(TorchModel): + + def __init__(self, + model_dir: str, + bert_name: str = None, + device: str = None): + """initialize the user satisfaction estimation model from the `model_dir` path. The default preprocessor + for this task is DialogueClassificationUsePreprocessor. + + Args: + model_dir: The model dir containing the model. 
+ bert_name: The pretrained model, default bert-base-chinese + device: The device of running model, default cpu + """ + super().__init__(model_dir) + self.model_dir = model_dir + self.bert_name = bert_name if bert_name is not None else 'bert-base-chinese' + self.device = 'cpu' + if device is not None and torch.cuda.is_available(): + self.device = device + self.model = self.init_model() + model_ckpt = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model.load_state_dict( + torch.load(model_ckpt, map_location=torch.device('cpu'))) + + def init_model(self): + configs = { + 'bert_name': self.bert_name, + 'cache_dir': self.model_dir, + 'dropout': 0.1 + } + model = USE(configs) + return model + + def forward( + self, input_ids: Tensor + ) -> Union[DialogueUserSatisfactionEstimationModelOutput, Dict[str, + Tensor]]: + """Compute the logits of satisfaction polarities for a dialogue. + + Args: + input_ids (Tensor): the preprocessed dialogue input + Returns: + output (Dict[str, Any] or DialogueUserSatisfactionEstimationModelOutput): The results of user satisfaction. + Example: {'logits': tensor([[-2.1795, 1.1323, 1.8605]])} + """ + logits = self.model(input_ids) + return DialogueUserSatisfactionEstimationModelOutput(logits=logits) + + +def init_params(model): + for name, param in model.named_parameters(): + if param.data.dim() > 1: + xavier_uniform_(param.data) + else: + pass + + +def universal_sentence_embedding(sentences, mask, sqrt=True): + sentence_sums = torch.bmm( + sentences.permute(0, 2, 1), + mask.float().unsqueeze(-1)).squeeze(-1) + divisor = (mask.sum(dim=1).view(-1, 1).float()) + if sqrt: + divisor = divisor.sqrt() + sentence_sums /= divisor + return sentence_sums + + +class BERTBackbone(nn.Module): + + def __init__(self, **config): + super().__init__() + bert_name = config.get('bert_name', 'bert-base-chinese') + cache_dir = config.get('cache_dir') + self.bert = BertModel.from_pretrained(bert_name, cache_dir=cache_dir) + self.d_model = 768 * 2 + + def forward(self, input_ids): + attention_mask = input_ids.ne(0).detach() + outputs = self.bert(input_ids, attention_mask) + h = universal_sentence_embedding(outputs[0], attention_mask) + cls = outputs[1] + out = torch.cat([cls, h], dim=-1) + return out + + +class MLP(nn.Module): + + def __init__(self, input_size, output_size, hidden_size): + super().__init__() + self.fc1 = nn.Linear(input_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, output_size) + + def forward(self, din): + dout = F.relu(self.fc1(din)) + dout = F.relu(self.fc2(dout)) + return dout + + +class USE(nn.Module): + + def __init__(self, args): + super().__init__() + self.drop_out = nn.Dropout(args['dropout']) + self.private = BERTBackbone( + bert_name=args['bert_name'], cache_dir=args['cache_dir']) + d_model = self.private.d_model + self.encoder = TransformerEncoder(d_model, d_model * 2, 8, 2, 0.1) + self.content_gru = nn.GRU( + d_model, + d_model, + num_layers=1, + bidirectional=False, + batch_first=True) + self.sat_classifier = nn.Linear(d_model, 3) + + self.U_c = nn.Linear(d_model, d_model) + self.w_c = nn.Linear(d_model, 1, bias=False) + + init_params(self.encoder) + init_params(self.sat_classifier) + init_params(self.U_c) + init_params(self.w_c) + + def forward(self, input_ids): + self.content_gru.flatten_parameters() + batch_size, dialog_len, utt_len = input_ids.size() + attention_mask = input_ids[:, :, 0].squeeze(-1).ne(0).detach() + input_ids = input_ids.view(-1, utt_len) + + private_out = self.private(input_ids=input_ids) + private_out = 
private_out.view(batch_size, dialog_len, -1) + H = self.encoder(private_out, attention_mask) + H = self.drop_out(H) + + H, _ = self.content_gru(H) + att_c = self.w_c(torch.tanh(self.U_c(H))).squeeze(-1) + att_c = F.softmax( + att_c.masked_fill(mask=~attention_mask, value=-np.inf), dim=1) + hidden = torch.bmm(H.permute(0, 2, 1), att_c.unsqueeze(-1)).squeeze(-1) + + sat_res = self.sat_classifier(hidden) + return sat_res diff --git a/modelscope/msdatasets/auth/__init__.py b/modelscope/msdatasets/auth/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/msdatasets/auth/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/msdatasets/auth/auth_config.py b/modelscope/msdatasets/auth/auth_config.py new file mode 100644 index 00000000..576a6efd --- /dev/null +++ b/modelscope/msdatasets/auth/auth_config.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from http.cookiejar import CookieJar +from typing import Tuple + + +class BaseAuthConfig(object): + """Base authorization config class.""" + + def __init__(self, cookies: CookieJar, git_token: str, + user_info: Tuple[str, str]): + self.cookies = cookies + self.git_token = git_token + self.user_info = user_info + + +class OssAuthConfig(BaseAuthConfig): + """The authorization config for oss dataset.""" + + def __init__(self, cookies: CookieJar, git_token: str, + user_info: Tuple[str, str]): + super().__init__( + cookies=cookies, git_token=git_token, user_info=user_info) + + +class MaxComputeAuthConfig(BaseAuthConfig): + # TODO: MaxCompute dataset to be supported. + def __init__(self, cookies: CookieJar, git_token: str, + user_info: Tuple[str, str]): + super().__init__( + cookies=cookies, git_token=git_token, user_info=user_info) + + self.max_compute_grant_cmd = None diff --git a/modelscope/msdatasets/context/__init__.py b/modelscope/msdatasets/context/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/msdatasets/context/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/msdatasets/context/dataset_context_config.py b/modelscope/msdatasets/context/dataset_context_config.py new file mode 100644 index 00000000..26b05f7d --- /dev/null +++ b/modelscope/msdatasets/context/dataset_context_config.py @@ -0,0 +1,100 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
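A compact sketch, with made-up dimensions, of the masked attention pooling that USE.forward applies above: padded dialogue turns are filled with -inf before the softmax so they receive zero attention weight, and the remaining turns are combined into one vector per dialogue.

    import numpy as np
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    batch_size, dialog_len, d_model = 2, 4, 8           # illustrative sizes
    H = torch.randn(batch_size, dialog_len, d_model)    # per-turn dialogue states
    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]]).bool()  # True = real turn

    U_c = nn.Linear(d_model, d_model)
    w_c = nn.Linear(d_model, 1, bias=False)

    # Score every turn, then push padded turns to -inf so softmax ignores them.
    att_c = w_c(torch.tanh(U_c(H))).squeeze(-1)
    att_c = F.softmax(att_c.masked_fill(mask=~attention_mask, value=-np.inf), dim=1)

    # Attention-weighted sum over turns -> one pooled vector per dialogue.
    hidden = torch.bmm(H.permute(0, 2, 1), att_c.unsqueeze(-1)).squeeze(-1)
    assert hidden.shape == (batch_size, d_model)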
+ +from typing import Mapping, Sequence, Union + +from modelscope.msdatasets.auth.auth_config import BaseAuthConfig +from modelscope.msdatasets.download.download_config import DataDownloadConfig +from modelscope.msdatasets.meta.data_meta_config import DataMetaConfig +from modelscope.utils.constant import DownloadMode, Hubs + + +class DatasetContextConfig: + """Context configuration of dataset.""" + + def __init__(self, dataset_name: Union[str, list], namespace: str, + version: str, subset_name: str, split: Union[str, list], + target: str, hub: Hubs, data_dir: str, + data_files: Union[str, Sequence[str], + Mapping[str, Union[str, Sequence[str]]]], + download_mode: DownloadMode, cache_root_dir: str, + use_streaming: bool, **kwargs): + + self._download_config = None + self._data_meta_config = None + self._config_kwargs = kwargs + self._dataset_version_cache_root_dir = None + self._auth_config = None + + # The lock file path for meta-files and data-files + self._global_meta_lock_file_path = None + self._global_data_lock_file_path = None + + # General arguments for dataset + self.hub = hub + self.download_mode = download_mode + self.dataset_name = dataset_name + self.namespace = namespace + self.version = version + self.subset_name = subset_name + self.split = split + self.target = target + self.data_dir = data_dir + self.data_files = data_files + self.cache_root_dir = cache_root_dir + self.use_streaming = use_streaming + + @property + def config_kwargs(self) -> dict: + return self._config_kwargs + + @config_kwargs.setter + def config_kwargs(self, val: dict): + self._config_kwargs = val + + @property + def download_config(self) -> DataDownloadConfig: + return self._download_config + + @download_config.setter + def download_config(self, val: DataDownloadConfig): + self._download_config = val + + @property + def data_meta_config(self) -> DataMetaConfig: + return self._data_meta_config + + @data_meta_config.setter + def data_meta_config(self, val: DataMetaConfig): + self._data_meta_config = val + + @property + def dataset_version_cache_root_dir(self) -> str: + return self._dataset_version_cache_root_dir + + @dataset_version_cache_root_dir.setter + def dataset_version_cache_root_dir(self, val: str): + self._dataset_version_cache_root_dir = val + + @property + def global_meta_lock_file_path(self) -> str: + return self._global_meta_lock_file_path + + @global_meta_lock_file_path.setter + def global_meta_lock_file_path(self, val: str): + self._global_meta_lock_file_path = val + + @property + def global_data_lock_file_path(self) -> str: + return self._global_data_lock_file_path + + @global_data_lock_file_path.setter + def global_data_lock_file_path(self, val: str): + self._global_data_lock_file_path = val + + @property + def auth_config(self) -> BaseAuthConfig: + return self._auth_config + + @auth_config.setter + def auth_config(self, val: BaseAuthConfig): + self._auth_config = val diff --git a/modelscope/msdatasets/cv/object_detection/detection_dataset.py b/modelscope/msdatasets/cv/object_detection/detection_dataset.py index 2f6ad7d3..4a533d00 100644 --- a/modelscope/msdatasets/cv/object_detection/detection_dataset.py +++ b/modelscope/msdatasets/cv/object_detection/detection_dataset.py @@ -13,6 +13,8 @@ from modelscope.utils.constant import Tasks @TASK_DATASETS.register_module( group_key=Tasks.image_object_detection, module_name=Datasets.DetDataset) +@TASK_DATASETS.register_module( + group_key=Tasks.image_segmentation, module_name=Datasets.DetDataset) class DetDataset(EasyCVBaseDataset, _DetDataset): 
"""EasyCV dataset for object detection. For more details, please refer to https://github.com/alibaba/EasyCV/blob/master/easycv/datasets/detection/raw.py . diff --git a/modelscope/msdatasets/data_files/__init__.py b/modelscope/msdatasets/data_files/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/msdatasets/data_files/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/msdatasets/data_files/data_files_manager.py b/modelscope/msdatasets/data_files/data_files_manager.py new file mode 100644 index 00000000..a84876f6 --- /dev/null +++ b/modelscope/msdatasets/data_files/data_files_manager.py @@ -0,0 +1,114 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Union + +from datasets import DatasetBuilder + +from modelscope.hub.api import HubApi +from modelscope.msdatasets.context.dataset_context_config import \ + DatasetContextConfig +from modelscope.msdatasets.download.dataset_builder import ( + CsvDatasetBuilder, IterableDatasetBuilder, TaskSpecificDatasetBuilder) +from modelscope.msdatasets.download.download_config import DataDownloadConfig +from modelscope.msdatasets.download.download_manager import ( + DataDownloadManager, DataStreamingDownloadManager) +from modelscope.utils.constant import (DatasetPathName, DownloadMode, + MetaDataFields) + + +class DataFilesManager(object): + """The modelscope data-files manager.""" + + def __init__(self, dataset_context_config: DatasetContextConfig): + + # Get dataset config info + self.dataset_name = dataset_context_config.dataset_name + self.namespace = dataset_context_config.namespace + self.version = dataset_context_config.version + self.subset_name = dataset_context_config.subset_name + self.split = dataset_context_config.split + self.meta_data_files = dataset_context_config.data_meta_config.meta_data_files + self.meta_args_map = dataset_context_config.data_meta_config.meta_args_map + self.zip_data_files = dataset_context_config.data_meta_config.zip_data_files + self.download_mode = dataset_context_config.download_mode + self.use_streaming = dataset_context_config.use_streaming + self.input_config_kwargs = dataset_context_config.config_kwargs + + # Get download_config + download_config = dataset_context_config.download_config or DataDownloadConfig( + ) + download_config.dataset_name = dataset_context_config.dataset_name + download_config.namespace = dataset_context_config.namespace + download_config.version = dataset_context_config.version + download_config.split = dataset_context_config.split + download_config.cache_dir = os.path.join( + dataset_context_config.cache_root_dir, self.namespace, + self.dataset_name, self.version, DatasetPathName.DATA_FILES_NAME) + + is_force_download = dataset_context_config.download_mode == DownloadMode.FORCE_REDOWNLOAD + download_config.force_download = bool(is_force_download) + download_config.force_extract = bool(is_force_download) + download_config.use_etag = False + + # Get oss config + api = HubApi() + self.oss_config = api.get_dataset_access_config( + self.dataset_name, self.namespace, self.version) + + # Set context. Note: no need to update context_config. + download_config.oss_config = self.oss_config + dataset_context_config.download_config = download_config + self.dataset_context_config = dataset_context_config + os.makedirs(download_config.cache_dir, exist_ok=True) + + def get_data_files_builder(self) -> Union[DatasetBuilder, None]: + """ Build download manager. 
""" + + if self.use_streaming: + return IterableDatasetBuilder.get_builder_instance( + dataset_context_config=self.dataset_context_config) + + if not self.meta_data_files: + return None + + meta_data_file = next(iter(self.meta_data_files.values())) + meta_args_map_file = next(iter(self.meta_args_map.values())) + if meta_args_map_file is None: + meta_args_map_file = {} + + if not meta_data_file or meta_args_map_file.get( + MetaDataFields.ARGS_BIG_DATA): + meta_args_map_file.update(self.input_config_kwargs) + self.dataset_context_config.data_meta_config.meta_args_map = meta_args_map_file + + builder = TaskSpecificDatasetBuilder( + dataset_context_config=self.dataset_context_config) + elif meta_data_file.endswith('.csv'): + builder = CsvDatasetBuilder( + dataset_context_config=self.dataset_context_config) + else: + raise NotImplementedError( + f'Dataset meta file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet' + ) + return builder + + def fetch_data_files(self, builder): + """ Fetch the data-files from dataset-hub. """ + + if self.dataset_context_config.use_streaming: + dl_manager = DataStreamingDownloadManager( + download_config=self.dataset_context_config.download_config) + return builder.as_streaming_dataset(dl_manager) + else: + + self.dataset_context_config.download_config.meta_args_map = \ + self.dataset_context_config.data_meta_config.meta_args_map + + dl_manager = DataDownloadManager( + download_config=self.dataset_context_config.download_config) + builder.download_and_prepare( + dl_manager=dl_manager, + download_mode=self.download_mode.value, + try_from_hf_gcs=False) + return builder.as_dataset() diff --git a/modelscope/msdatasets/data_loader/__init__.py b/modelscope/msdatasets/data_loader/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/msdatasets/data_loader/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py new file mode 100644 index 00000000..c97151b0 --- /dev/null +++ b/modelscope/msdatasets/data_loader/data_loader.py @@ -0,0 +1,168 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from abc import ABC, abstractmethod +from typing import Optional, Union + +from datasets import (Dataset, DatasetBuilder, DatasetDict, IterableDataset, + IterableDatasetDict) +from datasets import load_dataset as hf_data_loader + +from modelscope.hub.api import ModelScopeConfig +from modelscope.msdatasets.auth.auth_config import OssAuthConfig +from modelscope.msdatasets.context.dataset_context_config import \ + DatasetContextConfig +from modelscope.msdatasets.data_files.data_files_manager import \ + DataFilesManager +from modelscope.msdatasets.meta.data_meta_manager import DataMetaManager +from modelscope.utils.constant import DatasetFormations + + +class BaseDataLoader(ABC): + """Base dataset loader to load data.""" + + def __init__(self, dataset_context_config: DatasetContextConfig): + self.dataset_context_config = dataset_context_config + + @abstractmethod + def process(self): + """The entity processing pipeline for fetching the data. """ + raise NotImplementedError( + f'No default implementation provided for {BaseDataLoader.__name__}.process.' + ) + + @abstractmethod + def _authorize(self): + raise NotImplementedError( + f'No default implementation provided for {BaseDataLoader.__name__}._authorize.' 
+ ) + + @abstractmethod + def _build(self): + raise NotImplementedError( + f'No default implementation provided for {BaseDataLoader.__name__}._build.' + ) + + @abstractmethod + def _prepare_and_download(self): + raise NotImplementedError( + f'No default implementation provided for {BaseDataLoader.__name__}._prepare_and_download.' + ) + + @abstractmethod + def _post_process(self): + raise NotImplementedError( + f'No default implementation provided for {BaseDataLoader.__name__}._post_process.' + ) + + +class OssDataLoader(BaseDataLoader): + + def __init__(self, dataset_context_config: DatasetContextConfig): + super().__init__(dataset_context_config) + + self.data_files_builder: Optional[DataFilesManager] = None + self.dataset: Optional[Union[Dataset, IterableDataset, DatasetDict, + IterableDatasetDict]] = None + self.builder: Optional[DatasetBuilder] = None + self.data_files_manager: Optional[DataFilesManager] = None + + def process(self) -> None: + """ Sequential data fetching process: authorize -> build -> prepare_and_download -> post_process, + to keep dataset_context_config updated. """ + + self._authorize() + self._build() + self._prepare_and_download() + self._post_process() + + def _authorize(self) -> None: + """ Authorization of target dataset. + Get credentials from cache and send to the modelscope-hub in the future. """ + # TODO: obtain credentials from loacl cache when available. + cookies = ModelScopeConfig.get_cookies() + git_token = ModelScopeConfig.get_token() + user_info = ModelScopeConfig.get_user_info() + + if not self.dataset_context_config.auth_config: + auth_config = OssAuthConfig( + cookies=cookies, git_token=git_token, user_info=user_info) + else: + auth_config = self.dataset_context_config.auth_config + auth_config.cookies = cookies + auth_config.git_token = git_token + auth_config.user_info = user_info + + self.dataset_context_config.auth_config = auth_config + + def _build(self) -> None: + """ Sequential data files building process: build_meta -> build_data_files , to keep context_config updated. """ + # Build meta data + meta_manager = DataMetaManager(self.dataset_context_config) + meta_manager.fetch_meta_files() + meta_manager.parse_dataset_structure() + self.dataset_context_config = meta_manager.dataset_context_config + + # Build data-files manager + self.data_files_manager = DataFilesManager( + dataset_context_config=self.dataset_context_config) + self.builder = self.data_files_manager.get_data_files_builder() + + def _prepare_and_download(self) -> None: + """ Fetch data-files from modelscope dataset-hub. """ + dataset_py_script = self.dataset_context_config.data_meta_config.dataset_py_script + dataset_formation = self.dataset_context_config.data_meta_config.dataset_formation + dataset_name = self.dataset_context_config.dataset_name + subset_name = self.dataset_context_config.subset_name + version = self.dataset_context_config.version + split = self.dataset_context_config.split + data_dir = self.dataset_context_config.data_dir + data_files = self.dataset_context_config.data_files + cache_dir = self.dataset_context_config.cache_root_dir + download_mode = self.dataset_context_config.download_mode + input_kwargs = self.dataset_context_config.config_kwargs + + if self.builder is None and not dataset_py_script: + raise f'meta-file: {dataset_name}.py not found on the modelscope hub.' 
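A sketch of the credential wiring performed in OssDataLoader._authorize above, assuming a ModelScope login has already populated the local cache; the calls mirror exactly what the loader does before attaching the auth config to the dataset context.

    from modelscope.hub.api import ModelScopeConfig
    from modelscope.msdatasets.auth.auth_config import OssAuthConfig

    # Read whatever credentials the local ModelScope cache holds (they may be
    # None if the user never logged in); these are the values _authorize uses.
    cookies = ModelScopeConfig.get_cookies()
    git_token = ModelScopeConfig.get_token()
    user_info = ModelScopeConfig.get_user_info()

    # Wrap them in the OSS auth config carried by the dataset context; an
    # existing auth_config would instead be refreshed field by field.
    auth_config = OssAuthConfig(
        cookies=cookies, git_token=git_token, user_info=user_info)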
+ + if dataset_py_script and dataset_formation == DatasetFormations.hf_compatible: + self.dataset = hf_data_loader( + dataset_py_script, + name=subset_name, + revision=version, + split=split, + data_dir=data_dir, + data_files=data_files, + cache_dir=cache_dir, + download_mode=download_mode.value, + ignore_verifications=True, + **input_kwargs) + else: + self.dataset = self.data_files_manager.fetch_data_files( + self.builder) + + def _post_process(self) -> None: + ... + + +class MaxComputeDataLoader(BaseDataLoader): + """Data loader for MaxCompute data source.""" + + # TODO: MaxCompute data source to be supported . + def __init__(self, dataset_context_config: DatasetContextConfig): + super().__init__(dataset_context_config) + self.dataset = None + + def process(self): + ... + + def _authorize(self): + ... + + def _build(self): + ... + + def _prepare_and_download(self): + ... + + def _post_process(self): + ... diff --git a/modelscope/msdatasets/data_loader/data_loader_manager.py b/modelscope/msdatasets/data_loader/data_loader_manager.py new file mode 100644 index 00000000..3c8a638a --- /dev/null +++ b/modelscope/msdatasets/data_loader/data_loader_manager.py @@ -0,0 +1,141 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import enum +import os +from abc import ABC, abstractmethod + +from datasets import load_dataset as hf_data_loader + +from modelscope.hub.api import HubApi +from modelscope.msdatasets.context.dataset_context_config import \ + DatasetContextConfig +from modelscope.msdatasets.data_loader.data_loader import OssDataLoader +from modelscope.utils.constant import EXTENSIONS_TO_LOAD +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class LocalDataLoaderType(enum.Enum): + """ Supported data loader types for local dataset: huggingface, PyTorch, Tensorflow """ + HF_DATA_LOADER = 'hf_data_loader' + TORCH_DATA_LOADER = 'torch_data_loader' + TF_DATA_LOADER = 'tf_data_loader' + + +class RemoteDataLoaderType(enum.Enum): + """ Supported data loader types for remote dataset: huggingface, modelscope """ + HF_DATA_LOADER = 'hf_data_loader' + MS_DATA_LOADER = 'ms_data_loader' + + +class DataLoaderManager(ABC): + """Data loader manager, base class.""" + + def __init__(self, dataset_context_config: DatasetContextConfig): + self.dataset_context_config = dataset_context_config + + @abstractmethod + def load_dataset(self, data_loader_type: enum.Enum): + ... 
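A minimal sketch of the DataLoaderManager contract just defined: a subclass only has to implement load_dataset and dispatch on a loader-type enum, with the context config available on self. DummyLoaderType and DummyDataLoaderManager are hypothetical names used purely for illustration.

    import enum

    from modelscope.msdatasets.data_loader.data_loader_manager import DataLoaderManager


    class DummyLoaderType(enum.Enum):
        # Hypothetical loader type, only to illustrate the dispatch contract.
        DUMMY = 'dummy'


    class DummyDataLoaderManager(DataLoaderManager):

        def load_dataset(self, data_loader_type: enum.Enum):
            # Return something dataset-like for the supported type, else fail loudly.
            if data_loader_type == DummyLoaderType.DUMMY:
                return {'dataset_name': self.dataset_context_config.dataset_name}
            raise ValueError(f'Unsupported loader type: {data_loader_type}')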
+ + +class LocalDataLoaderManager(DataLoaderManager): + """Data loader manager for loading local data.""" + + def __init__(self, dataset_context_config: DatasetContextConfig): + super().__init__(dataset_context_config=dataset_context_config) + + def load_dataset(self, data_loader_type: enum.Enum): + # Get args from context + dataset_name = self.dataset_context_config.dataset_name + subset_name = self.dataset_context_config.subset_name + version = self.dataset_context_config.version + split = self.dataset_context_config.split + data_dir = self.dataset_context_config.data_dir + data_files = self.dataset_context_config.data_files + cache_root_dir = self.dataset_context_config.cache_root_dir + download_mode = self.dataset_context_config.download_mode + use_streaming = self.dataset_context_config.use_streaming + input_config_kwargs = self.dataset_context_config.config_kwargs + + # load local single file + if os.path.isfile(dataset_name): + file_ext = os.path.splitext(dataset_name)[1].strip('.') + if file_ext in EXTENSIONS_TO_LOAD: + split = None + data_files = [dataset_name] + dataset_name = EXTENSIONS_TO_LOAD.get(file_ext) + + # Select local data loader + # TODO: more loaders to be supported. + if data_loader_type == LocalDataLoaderType.HF_DATA_LOADER: + # Build huggingface data loader and return dataset. + return hf_data_loader( + dataset_name, + name=subset_name, + revision=version, + split=split, + data_dir=data_dir, + data_files=data_files, + cache_dir=cache_root_dir, + download_mode=download_mode.value, + streaming=use_streaming, + ignore_verifications=True, + **input_config_kwargs) + raise f'Expected local data loader type: {LocalDataLoaderType.HF_DATA_LOADER.value}.' + + +class RemoteDataLoaderManager(DataLoaderManager): + """Data loader manager for loading remote data.""" + + def __init__(self, dataset_context_config: DatasetContextConfig): + super().__init__(dataset_context_config=dataset_context_config) + self.api = HubApi() + + def load_dataset(self, data_loader_type: enum.Enum): + # Get args from context + dataset_name = self.dataset_context_config.dataset_name + namespace = self.dataset_context_config.namespace + subset_name = self.dataset_context_config.subset_name + version = self.dataset_context_config.version + split = self.dataset_context_config.split + data_dir = self.dataset_context_config.data_dir + data_files = self.dataset_context_config.data_files + download_mode_val = self.dataset_context_config.download_mode.value + use_streaming = self.dataset_context_config.use_streaming + input_config_kwargs = self.dataset_context_config.config_kwargs + + # To use the huggingface data loader + if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER: + dataset_ret = hf_data_loader( + dataset_name, + name=subset_name, + revision=version, + split=split, + data_dir=data_dir, + data_files=data_files, + download_mode=download_mode_val, + streaming=use_streaming, + ignore_verifications=True, + **input_config_kwargs) + # download statistics + self.api.dataset_download_statistics( + dataset_name=dataset_name, + namespace=namespace, + use_streaming=use_streaming) + return dataset_ret + # To use the modelscope data loader + elif data_loader_type == RemoteDataLoaderType.MS_DATA_LOADER: + oss_data_loader = OssDataLoader( + dataset_context_config=self.dataset_context_config) + oss_data_loader.process() + # download statistics + self.api.dataset_download_statistics( + dataset_name=dataset_name, + namespace=namespace, + use_streaming=use_streaming) + return oss_data_loader.dataset + else: + 
raise f'Expected remote data loader type: {RemoteDataLoaderType.HF_DATA_LOADER.value}/' \ + f'{RemoteDataLoaderType.MS_DATA_LOADER.value}, but got {data_loader_type} .' diff --git a/modelscope/msdatasets/dataset_cls/__init__.py b/modelscope/msdatasets/dataset_cls/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/msdatasets/dataset_cls/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/msdatasets/dataset_cls/dataset.py b/modelscope/msdatasets/dataset_cls/dataset.py new file mode 100644 index 00000000..49313e90 --- /dev/null +++ b/modelscope/msdatasets/dataset_cls/dataset.py @@ -0,0 +1,101 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import copy +import os + +import datasets +import torchaudio +from datasets import IterableDataset +from PIL import Image + +from modelscope.utils.constant import EXTENSIONS_TO_LOAD +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class ExternalDataset(object): + + def __init__(self, split_path_dict, config_kwargs): + self.split_path_dict = split_path_dict + self.config_kwargs = copy.deepcopy(config_kwargs) + self.config_kwargs.update({'split_config': split_path_dict}) + self.ext_dataset = None + self.split_data_files = {k: [] for k, _ in split_path_dict.items()} + file_ext = '' + + for split_name, split_dir in split_path_dict.items(): + if isinstance(split_dir, str) and os.path.isdir(split_dir): + split_file_names = os.listdir(split_dir) + set_files_exts = set([ + os.path.splitext(file_name)[-1].strip('.') + for file_name in split_file_names + ]) + if '' in set_files_exts: + continue + # ensure these files have same extensions + if len(set_files_exts) != 1: + supported_exts = ','.join(EXTENSIONS_TO_LOAD.keys()) + logger.error( + f'Split-{split_name} has been ignored, please flatten your folder structure, ' + f'and make sure these files have same extensions. 
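A usage sketch for the remote loader manager defined above. All concrete values below are placeholders (real coordinates normally arrive via MsDataset callers), and the Hubs member name is assumed; DownloadMode.FORCE_REDOWNLOAD is the mode referenced elsewhere in this change.

    from modelscope.msdatasets.context.dataset_context_config import DatasetContextConfig
    from modelscope.msdatasets.data_loader.data_loader_manager import (
        RemoteDataLoaderManager, RemoteDataLoaderType)
    from modelscope.utils.constant import DownloadMode, Hubs

    # Placeholder dataset coordinates for illustration only.
    ctx = DatasetContextConfig(
        dataset_name='my_dataset',             # hypothetical dataset id
        namespace='my_namespace',              # hypothetical owner namespace
        version='1.0.0',
        subset_name=None,
        split='train',
        target=None,
        hub=Hubs.modelscope,                   # hub enum member assumed
        data_dir=None,
        data_files=None,
        download_mode=DownloadMode.FORCE_REDOWNLOAD,
        cache_root_dir='/tmp/modelscope_cache',
        use_streaming=False)

    # MS_DATA_LOADER drives the OSS pipeline above (authorize -> build -> download)
    # and reports download statistics; HF_DATA_LOADER delegates to datasets.load_dataset.
    manager = RemoteDataLoaderManager(ctx)
    dataset = manager.load_dataset(RemoteDataLoaderType.MS_DATA_LOADER)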
' + f'Supported extensions: {supported_exts} .') + continue + file_ext = list(set_files_exts)[0] + if file_ext not in EXTENSIONS_TO_LOAD: + continue + + split_file_paths = [ + os.path.join(split_dir, file_name) + for file_name in split_file_names + ] + self.split_data_files[split_name] = split_file_paths + + if file_ext: + file_ext = EXTENSIONS_TO_LOAD.get(file_ext) + self.ext_dataset = datasets.load_dataset( + file_ext, data_files=self.split_data_files, **config_kwargs) + + def __len__(self): + return len(self.split_path_dict + ) if not self.ext_dataset else self.ext_dataset.__len__() + + def __getitem__(self, item): + if not self.ext_dataset: + return self.split_path_dict.get(item) + else: + return self.ext_dataset.__getitem__(item) + + def __iter__(self): + if not self.ext_dataset: + for k, v in self.split_path_dict.items(): + yield k, v + else: + for k, v in self.ext_dataset.items(): + yield k, v + + +class NativeIterableDataset(IterableDataset): + """The modelscope iterable dataset class.""" + + def __init__(self, ex_iterable, info, split): + super().__init__(ex_iterable=ex_iterable, info=info, split=split) + + def __iter__(self): + for key, entity in self._iter(): + if isinstance(entity, dict): + ret = {} + for k, v in entity.items(): + ret[k] = v + if k.endswith(':FILE'): + dl_manager = self._ex_iterable.kwargs.get('dl_manager') + ex_cache_path = dl_manager.download_and_extract(v) + ret[k] = ex_cache_path + if k.endswith('Image:FILE'): + ret[k + ':Object'] = Image.open(fp=ex_cache_path) + if k.endswith('Audio:FILE'): + waveform_and_rate = torchaudio.load(ex_cache_path) + ret[k + ':Object'] = waveform_and_rate + entity = ret + + yield entity diff --git a/modelscope/msdatasets/download/__init__.py b/modelscope/msdatasets/download/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/msdatasets/download/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/msdatasets/download/dataset_builder.py b/modelscope/msdatasets/download/dataset_builder.py new file mode 100644 index 00000000..73a3a1a1 --- /dev/null +++ b/modelscope/msdatasets/download/dataset_builder.py @@ -0,0 +1,397 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os +from typing import Dict, Union + +import datasets +import pandas as pd +import pyarrow as pa +from datasets import (ArrowBasedBuilder, GeneratorBasedBuilder, + IterableDataset, IterableDatasetDict) +from datasets.filesystems import is_remote_filesystem +from datasets.info import DatasetInfo +from datasets.naming import camelcase_to_snakecase +from datasets.packaged_modules import csv +from datasets.utils.filelock import FileLock +from datasets.utils.py_utils import map_nested + +from modelscope.hub.api import HubApi +from modelscope.msdatasets.context.dataset_context_config import \ + DatasetContextConfig +from modelscope.msdatasets.dataset_cls.dataset import (ExternalDataset, + NativeIterableDataset) +from modelscope.msdatasets.download.download_manager import \ + DataStreamingDownloadManager +from modelscope.msdatasets.utils.dataset_utils import \ + get_subdir_hash_from_split +from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, + DatasetPathName, DownloadMode) +from modelscope.utils.logger import get_logger + +logger = get_logger() + +DELIMITER_NAME = 'delimiter' +DEFAULT_CSV_DELIMITER = ',' + + +class CsvDatasetBuilder(csv.Csv): + + def __init__(self, dataset_context_config: DatasetContextConfig): + # Init config args + self.dataset_name = dataset_context_config.dataset_name + self.cache_root_dir = dataset_context_config.cache_root_dir + self.namespace = dataset_context_config.namespace + self.version = dataset_context_config.version + self.subset_name = dataset_context_config.subset_name + self.split = dataset_context_config.split + self.meta_data_files = dataset_context_config.data_meta_config.meta_data_files + self.zip_data_files = dataset_context_config.data_meta_config.zip_data_files + self.input_config_kwargs = dataset_context_config.config_kwargs + + self.cache_build_dir = os.path.join(self.cache_root_dir, + self.namespace, self.dataset_name, + self.version, + DatasetPathName.META_NAME) + self.csv_delimiter = DEFAULT_CSV_DELIMITER + if DELIMITER_NAME in self.input_config_kwargs: + self.csv_delimiter = self.input_config_kwargs[DELIMITER_NAME] + + split = self.split or list(dataset_context_config.data_meta_config. 
+ target_dataset_structure.keys()) + sub_dir_hash = get_subdir_hash_from_split( + split=split, version=self.version) + + super().__init__( + cache_dir=self.cache_build_dir, + config_name=self.namespace, + hash=sub_dir_hash, + data_files=self.meta_data_files, + **self.input_config_kwargs) + + self.info.builder_name = self.dataset_name + self.name = camelcase_to_snakecase(self.dataset_name) + + def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE): + builder_data_dir = os.path.join( + self._cache_dir_root, + self._relative_data_dir( + with_version=False, with_hash=True, namespace=namespace)) + + return builder_data_dir + + def _relative_data_dir(self, + with_version=True, + with_hash=True, + namespace=DEFAULT_DATASET_NAMESPACE) -> str: + """Relative path of this dataset in cache_dir: + Will be: + self.name/self.config.version/self.hash/ + or if a namespace has been specified: + self.namespace___self.name/self.config.version/self.hash/ + """ + builder_data_dir = self.info.builder_name if namespace is None else f'{namespace}___{self.info.builder_name}' + builder_config = self.config + hash = self.hash + if builder_config: + builder_data_dir = os.path.join(builder_data_dir, self.config_id) + if with_version: + builder_data_dir = os.path.join(builder_data_dir, + str(self.config.version)) + if with_hash and hash and isinstance(hash, str): + builder_data_dir = os.path.join(builder_data_dir, hash) + return builder_data_dir + + def _split_generators(self, dl_manager): + if not self.config.data_files: + raise ValueError( + 'At least one data file must be specified, but got none.') + data_files = dl_manager.download_and_extract(self.config.data_files) + zip_data_files = dl_manager.download_and_extract(self.zip_data_files) + splits = [] + for split_name, files in data_files.items(): + if isinstance(files, str): + files = [files] + splits.append( + datasets.SplitGenerator( + name=split_name, + gen_kwargs={ + 'files': dl_manager.iter_files(files), + 'base_dir': zip_data_files.get(split_name) + })) + return splits + + def _generate_tables(self, files, base_dir): + schema = pa.schema(self.config.features.type + ) if self.config.features is not None else None + dtype = { + name: dtype.to_pandas_dtype() + for name, dtype in zip(schema.names, schema.types) + } if schema else None + for file_idx, file in enumerate(files): + csv_file_reader = pd.read_csv( + file, iterator=True, dtype=dtype, delimiter=self.csv_delimiter) + transform_fields = [] + for field_name in csv_file_reader._engine.names: + if field_name.endswith(':FILE'): + transform_fields.append(field_name) + try: + for batch_idx, df in enumerate(csv_file_reader): + for field_name in transform_fields: + if base_dir: + df[field_name] = df[field_name].apply( + lambda x: os.path.join(base_dir, x)) + pa_table = pa.Table.from_pandas(df, schema=schema) + yield (file_idx, batch_idx), pa_table + except ValueError as e: + logger.error( + f"Failed to read file '{file}' with error {type(e)}: {e}") + raise + + +class TaskSpecificDatasetBuilder(CsvDatasetBuilder): + + def __init__(self, dataset_context_config: DatasetContextConfig): + + # Init args + self.name = dataset_context_config.dataset_name + self.subset_name = dataset_context_config.subset_name + self.namespace = dataset_context_config.namespace + self.split = dataset_context_config.split + self.version = dataset_context_config.version + split = self.split or list(dataset_context_config.data_meta_config. 
+ target_dataset_structure.keys()) + self.hash = get_subdir_hash_from_split( + split=split, version=self.version) + self.data_files = dataset_context_config.data_meta_config.meta_data_files + self.zip_data_files = dataset_context_config.data_meta_config.zip_data_files + self.split_path_dict = None + self.config = None + self.info = DatasetInfo.from_dict( + {'builder_name': dataset_context_config.dataset_name}) + self._cache_dir_root = os.path.expanduser( + dataset_context_config.cache_root_dir) + self._cache_dir = self._build_cache_dir() + self._config_kwargs = dataset_context_config.data_meta_config.meta_args_map + + def download_and_prepare(self, download_mode, dl_manager, + **download_kwargs): + # Prevent parallel disk operations + lock_path = os.path.join( + self._cache_dir_root, + self._cache_dir.replace(os.sep, '_') + '.lock') + with FileLock(lock_path): + data_exists = os.path.exists(self._cache_dir) + if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: + logger.warning( + f'Reusing dataset {self.name} ({self._cache_dir})') + return + logger.info(f'Generating dataset {self.name} ({self._cache_dir})') + self._download_and_prepare(dl_manager=dl_manager) + + def _download_and_prepare(self, dl_manager): + self.split_path_dict = dl_manager.download_and_extract( + self.zip_data_files) + + def as_dataset(self): + return ExternalDataset(self.split_path_dict, self._config_kwargs) + + +class IterableDatasetBuilder(csv.Csv): + + def __init__(self, dataset_context_config: DatasetContextConfig): + # Init config args + self.dataset_name = dataset_context_config.dataset_name + self.cache_root_dir = dataset_context_config.cache_root_dir + self.namespace = dataset_context_config.namespace + self.version = dataset_context_config.version + self.subset_name = dataset_context_config.subset_name + self.split = dataset_context_config.split + self.meta_data_files = dataset_context_config.data_meta_config.meta_data_files + self.zip_data_files = dataset_context_config.data_meta_config.zip_data_files + self.input_config_kwargs = dataset_context_config.config_kwargs + + self.cache_build_dir = os.path.join(self.cache_root_dir, + self.namespace, self.dataset_name, + self.version, + DatasetPathName.META_NAME) + self.csv_delimiter = DEFAULT_CSV_DELIMITER + if DELIMITER_NAME in self.input_config_kwargs: + self.csv_delimiter = self.input_config_kwargs[DELIMITER_NAME] + + split = self.split or list(dataset_context_config.data_meta_config. 
+ target_dataset_structure.keys()) + sub_dir_hash = get_subdir_hash_from_split( + split=split, version=self.version) + + super().__init__( + cache_dir=self.cache_build_dir, + config_name=self.namespace, + hash=sub_dir_hash, + data_files=None, # TODO: self.meta_data_files, + **self.input_config_kwargs) + + self.info.builder_name = self.dataset_name + self.name = camelcase_to_snakecase(self.dataset_name) + + @staticmethod + def get_builder_instance( + dataset_context_config: DatasetContextConfig) -> csv.Csv: + builder_instance = IterableDatasetBuilder( + dataset_context_config=dataset_context_config) + return builder_instance + + def as_streaming_dataset( + self, dl_manager: DataStreamingDownloadManager + ) -> Union[Dict[str, IterableDataset], IterableDataset]: + + if not isinstance(self, (GeneratorBasedBuilder, ArrowBasedBuilder)): + raise ValueError(f'Builder {self.name} is not streamable.') + + is_local = not is_remote_filesystem(self._fs) + if not is_local: + raise NotImplementedError( + f'Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet.' + ) + + self._check_manual_download(dl_manager) + splits_generators = { + sg.name: sg + for sg in self._split_generators(dl_manager) + } + + # By default, return all splits + split = dl_manager.download_config.split + if split is None: + splits_generator = splits_generators + elif split in splits_generators: + splits_generator = splits_generators[split] + else: + raise ValueError( + f'Bad split: {split}. Available splits: {list(splits_generators)}' + ) + + # Create a dataset for each of the given splits + streaming_datasets = map_nested( + self._as_streaming_dataset_single, + splits_generator, + map_tuple=True, + ) + if isinstance(streaming_datasets, dict): + streaming_datasets = IterableDatasetDict(streaming_datasets) + return streaming_datasets + + def _split_generators(self, dl_manager: DataStreamingDownloadManager): + splits = [] + meta_data_file = '' + zip_data_file = '' + if self.meta_data_files: + meta_data_file = next(iter(self.meta_data_files.values())) + if self.zip_data_files: + zip_data_file = next(iter(self.zip_data_files.values())) + if meta_data_file and not zip_data_file: + for split_name, meta_file_url in self.meta_data_files.items(): + splits.append( + datasets.SplitGenerator( + name=split_name, + gen_kwargs={ + 'meta': meta_file_url, + 'files': [], + 'dl_manager': dl_manager, + })) + + elif meta_data_file and zip_data_file: + for split_name, files in self.zip_data_files.items(): + if isinstance(files, str): + files = [files] + meta_file_url = self.meta_data_files.get(split_name) + splits.append( + datasets.SplitGenerator( + name=split_name, + gen_kwargs={ + 'meta': meta_file_url, + 'files': files, + 'dl_manager': dl_manager, + })) + + elif not meta_data_file and zip_data_file: + for split_name, files in self.zip_data_files.items(): + if isinstance(files, str): + files = [files] + splits.append( + datasets.SplitGenerator( + name=split_name, + gen_kwargs={ + 'meta': '', + 'files': files, + 'dl_manager': dl_manager, + })) + + else: + raise f'Neither column meta nor data file found in {self.dataset_name}.json, specify at least one column.' 
+ + return splits + + def _as_streaming_dataset_single( + self, + splits_generator, + ) -> NativeIterableDataset: + + ex_iterable = self._get_examples_iterable_for_split(splits_generator) + return NativeIterableDataset( + ex_iterable, info=self.info, split=splits_generator.name) + + def _generate_tables(self, **gen_kwargs): + + meta_file_url = gen_kwargs.get('meta') + files = gen_kwargs.get('files') + dl_manager = gen_kwargs.get('dl_manager') + + hub_api = HubApi() + is_zip = False + zip_file_name = '' + + if files: + zip_file = str(next(iter(files))) + if zip_file.endswith('.zip'): + is_zip = True + zip_file_name = os.path.splitext(zip_file)[0] + + if meta_file_url and not files: + headers, texts = hub_api.fetch_single_csv_script(meta_file_url) + meta_csv_mapping = IterableDatasetBuilder.trans_data_to_mapping( + headers, texts, self.csv_delimiter) + pa_table = pa.Table.from_pydict(meta_csv_mapping) + yield 0, pa_table + + elif meta_file_url and files: + # Get meta file + headers, texts = hub_api.fetch_single_csv_script(meta_file_url) + meta_csv_mapping = IterableDatasetBuilder.trans_data_to_mapping( + headers, texts, self.csv_delimiter) + + if is_zip: + oss_config_for_unzipped = hub_api.get_dataset_access_config_for_unzipped( + self.dataset_name, self.namespace, self.version, + zip_file_name) + dl_manager.download_config.oss_config = oss_config_for_unzipped + + pa_table = pa.Table.from_pydict(meta_csv_mapping) + yield 0, pa_table + + elif not meta_file_url and files: + pa_table = pa.Table.from_pydict({'Input:FILE': files}) + yield 0, pa_table + + else: + raise f'Neither column meta nor data file found in {self.dataset_name}.json .' + + @staticmethod + def trans_data_to_mapping(headers: str, texts: list, delimiter: str): + res = {} + headers = headers.split(delimiter) + for idx in range(0, len(headers)): + col_list = [] + for line in texts: + col_list.append(line.split(delimiter)[idx]) + res[headers[idx]] = col_list + return res diff --git a/modelscope/msdatasets/download/download_config.py b/modelscope/msdatasets/download/download_config.py new file mode 100644 index 00000000..4af656e0 --- /dev/null +++ b/modelscope/msdatasets/download/download_config.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Optional, Union + +from datasets.download.download_config import DownloadConfig + + +class DataDownloadConfig(DownloadConfig): + + def __init__(self): + self.dataset_name: Optional[str] = None + self.namespace: Optional[str] = None + self.version: Optional[str] = None + self.split: Optional[Union[str, list]] = None + self.data_dir: Optional[str] = None + self.oss_config: Optional[dict] = {} + self.meta_args_map: Optional[dict] = {} + + def copy(self) -> 'DataDownloadConfig': + return self diff --git a/modelscope/msdatasets/download/download_manager.py b/modelscope/msdatasets/download/download_manager.py new file mode 100644 index 00000000..4799171a --- /dev/null +++ b/modelscope/msdatasets/download/download_manager.py @@ -0,0 +1,64 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
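For reference, `IterableDatasetBuilder.trans_data_to_mapping` above simply pivots a delimited header line plus row lines into the column mapping that `pa.Table.from_pydict` consumes. A worked example with made-up values (the same logic inlined, so it runs without modelscope):

```python
headers = 'Image:FILE,Label'
texts = ['train/001.jpg,cat', 'train/002.jpg,dog']
delimiter = ','

res = {}
cols = headers.split(delimiter)
for idx in range(len(cols)):
    # collect the idx-th field of every row under its header name
    res[cols[idx]] = [line.split(delimiter)[idx] for line in texts]

assert res == {
    'Image:FILE': ['train/001.jpg', 'train/002.jpg'],
    'Label': ['cat', 'dog'],
}
```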
+ +from datasets.download.download_manager import DownloadManager +from datasets.download.streaming_download_manager import \ + StreamingDownloadManager +from datasets.utils.file_utils import cached_path, is_relative_path + +from modelscope.msdatasets.download.download_config import DataDownloadConfig +from modelscope.msdatasets.utils.oss_utils import OssUtilities + + +class DataDownloadManager(DownloadManager): + + def __init__(self, download_config: DataDownloadConfig): + super().__init__( + dataset_name=download_config.dataset_name, + data_dir=download_config.data_dir, + download_config=download_config, + record_checksums=True) + + def _download(self, url_or_filename: str, + download_config: DataDownloadConfig) -> str: + url_or_filename = str(url_or_filename) + + oss_utilities = OssUtilities( + oss_config=download_config.oss_config, + dataset_name=download_config.dataset_name, + namespace=download_config.namespace, + revision=download_config.version) + + if is_relative_path(url_or_filename): + # fetch oss files + return oss_utilities.download( + url_or_filename, download_config=download_config) + else: + return cached_path( + url_or_filename, download_config=download_config) + + +class DataStreamingDownloadManager(StreamingDownloadManager): + """The data streaming download manager.""" + + def __init__(self, download_config: DataDownloadConfig): + super().__init__( + dataset_name=download_config.dataset_name, + data_dir=download_config.data_dir, + download_config=download_config, + base_path=download_config.cache_dir) + + def _download(self, url_or_filename: str) -> str: + url_or_filename = str(url_or_filename) + oss_utilities = OssUtilities( + oss_config=self.download_config.oss_config, + dataset_name=self.download_config.dataset_name, + namespace=self.download_config.namespace, + revision=self.download_config.version) + + if is_relative_path(url_or_filename): + # fetch oss files + return oss_utilities.download( + url_or_filename, download_config=self.download_config) + else: + return cached_path( + url_or_filename, download_config=self.download_config) diff --git a/modelscope/msdatasets/meta/__init__.py b/modelscope/msdatasets/meta/__init__.py new file mode 100644 index 00000000..b937315b --- /dev/null +++ b/modelscope/msdatasets/meta/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. diff --git a/modelscope/msdatasets/meta/data_meta_config.py b/modelscope/msdatasets/meta/data_meta_config.py new file mode 100644 index 00000000..401a8e14 --- /dev/null +++ b/modelscope/msdatasets/meta/data_meta_config.py @@ -0,0 +1,15 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + + +class DataMetaConfig(object): + """Modelscope data-meta config class.""" + + def __init__(self): + self.dataset_scripts = None + self.dataset_formation = None + self.meta_cache_dir = None + self.meta_data_files = None + self.zip_data_files = None + self.meta_args_map = None + self.target_dataset_structure = None + self.dataset_py_script = None diff --git a/modelscope/msdatasets/meta/data_meta_manager.py b/modelscope/msdatasets/meta/data_meta_manager.py new file mode 100644 index 00000000..bba46e84 --- /dev/null +++ b/modelscope/msdatasets/meta/data_meta_manager.py @@ -0,0 +1,175 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
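A hedged sketch of how the `DataDownloadConfig` above might be populated before being handed to one of the download managers; the field names come from the class definition, while the concrete values (`my_dataset`, `my_namespace`, the version string) are placeholders:

```python
from modelscope.msdatasets.download.download_config import DataDownloadConfig

download_config = DataDownloadConfig()
download_config.dataset_name = 'my_dataset'    # placeholder dataset name
download_config.namespace = 'my_namespace'     # placeholder namespace
download_config.version = '1.0.0'
download_config.split = 'train'
download_config.oss_config = {}                # filled in from the hub at runtime
download_config.meta_args_map = {}

# copy() deliberately returns the same instance, so the builder and the
# download manager share one mutable config object.
assert download_config.copy() is download_config
```

Note that both `DataDownloadManager._download` and `DataStreamingDownloadManager._download` dispatch on the path form: relative paths are treated as OSS objects and fetched through `OssUtilities`, while full URLs go through `cached_path`.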
+ +import os +import shutil +from collections import defaultdict + +import json +from datasets.utils.filelock import FileLock + +from modelscope.hub.api import HubApi +from modelscope.msdatasets.context.dataset_context_config import \ + DatasetContextConfig +from modelscope.msdatasets.meta.data_meta_config import DataMetaConfig +from modelscope.msdatasets.utils.dataset_utils import ( + get_dataset_files, get_target_dataset_structure) +from modelscope.utils.constant import (DatasetFormations, DatasetPathName, + DownloadMode) + + +class DataMetaManager(object): + """Data-meta manager.""" + + def __init__(self, dataset_context_config: DatasetContextConfig): + self.dataset_context_config = dataset_context_config + self.api = HubApi() + + def fetch_meta_files(self) -> None: + + # Init meta infos + dataset_name = self.dataset_context_config.dataset_name + namespace = self.dataset_context_config.namespace + download_mode = self.dataset_context_config.download_mode + version = self.dataset_context_config.version + cache_root_dir = self.dataset_context_config.cache_root_dir + subset_name = self.dataset_context_config.subset_name + split = self.dataset_context_config.split + + dataset_version_cache_root_dir = os.path.join(cache_root_dir, + namespace, dataset_name, + version) + meta_cache_dir = os.path.join(dataset_version_cache_root_dir, + DatasetPathName.META_NAME) + data_meta_config = self.dataset_context_config.data_meta_config or DataMetaConfig( + ) + + # Get lock file path + if not subset_name: + lock_subset_name = DatasetPathName.LOCK_FILE_NAME_ANY + else: + lock_subset_name = subset_name + if not split: + lock_split = DatasetPathName.LOCK_FILE_NAME_ANY + else: + lock_split = split + lock_file_name = f'{DatasetPathName.META_NAME}{DatasetPathName.LOCK_FILE_NAME_DELIMITER}{dataset_name}' \ + f'{DatasetPathName.LOCK_FILE_NAME_DELIMITER}{version}' \ + f'{DatasetPathName.LOCK_FILE_NAME_DELIMITER}' \ + f'{lock_subset_name}{DatasetPathName.LOCK_FILE_NAME_DELIMITER}{lock_split}.lock' + lock_file_path = os.path.join(dataset_version_cache_root_dir, + lock_file_name) + os.makedirs(dataset_version_cache_root_dir, exist_ok=True) + + # Fetch meta from cache or hub if reuse dataset + if download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: + if os.path.exists(meta_cache_dir) and os.listdir(meta_cache_dir): + dataset_scripts, dataset_formation = self._fetch_meta_from_cache( + meta_cache_dir) + else: + # Fetch meta-files from modelscope-hub if cache does not exist + with FileLock(lock_file=lock_file_path): + os.makedirs(meta_cache_dir, exist_ok=True) + dataset_scripts, dataset_formation = self._fetch_meta_from_hub( + dataset_name, namespace, version, meta_cache_dir) + # Fetch meta from hub if force download + elif download_mode == DownloadMode.FORCE_REDOWNLOAD: + # Clean meta-files + if os.path.exists(meta_cache_dir) and os.listdir(meta_cache_dir): + shutil.rmtree(meta_cache_dir) + # Re-download meta-files + with FileLock(lock_file=lock_file_path): + os.makedirs(meta_cache_dir, exist_ok=True) + dataset_scripts, dataset_formation = self._fetch_meta_from_hub( + dataset_name, namespace, version, meta_cache_dir) + else: + raise ValueError( + f'Expected values of download_mode: ' + f'{DownloadMode.REUSE_DATASET_IF_EXISTS.value} or ' + f'{DownloadMode.FORCE_REDOWNLOAD.value}, but got {download_mode} .' 
+ ) + + # Set data_meta_config + data_meta_config.meta_cache_dir = meta_cache_dir + data_meta_config.dataset_scripts = dataset_scripts + data_meta_config.dataset_formation = dataset_formation + + # Set dataset_context_config + self.dataset_context_config.data_meta_config = data_meta_config + self.dataset_context_config.dataset_version_cache_root_dir = dataset_version_cache_root_dir + self.dataset_context_config.global_meta_lock_file_path = lock_file_path + + def parse_dataset_structure(self): + # Get dataset_name.json + dataset_name = self.dataset_context_config.dataset_name + subset_name = self.dataset_context_config.subset_name + split = self.dataset_context_config.split + namespace = self.dataset_context_config.namespace + version = self.dataset_context_config.version + data_meta_config = self.dataset_context_config.data_meta_config or DataMetaConfig( + ) + + dataset_json = None + dataset_py_script = None + dataset_scripts = data_meta_config.dataset_scripts + if not dataset_scripts or len(dataset_scripts) == 0: + raise 'Cannot find dataset meta-files, please fetch meta from modelscope hub.' + if '.py' in dataset_scripts: + dataset_py_script = dataset_scripts['.py'][0] + for json_path in dataset_scripts['.json']: + if json_path.endswith(f'{dataset_name}.json'): + with open(json_path, encoding='utf-8') as dataset_json_file: + dataset_json = json.load(dataset_json_file) + break + if not dataset_json and not dataset_py_script: + raise f'File {dataset_name}.json and {dataset_name}.py not found, please specify at least one meta-file.' + + # Parse meta and get dataset structure + if dataset_py_script: + data_meta_config.dataset_py_script = dataset_py_script + else: + target_subset_name, target_dataset_structure = get_target_dataset_structure( + dataset_json, subset_name, split) + meta_map, file_map, args_map = get_dataset_files( + target_dataset_structure, dataset_name, namespace, version) + + data_meta_config.meta_data_files = meta_map + data_meta_config.zip_data_files = file_map + data_meta_config.meta_args_map = args_map + data_meta_config.target_dataset_structure = target_dataset_structure + + self.dataset_context_config.data_meta_config = data_meta_config + + def _fetch_meta_from_cache(self, meta_cache_dir): + local_paths = defaultdict(list) + dataset_type = None + for meta_file_name in os.listdir(meta_cache_dir): + file_ext = os.path.splitext(meta_file_name)[-1] + if file_ext == DatasetFormations.formation_mark_ext.value: + dataset_type = int(os.path.splitext(meta_file_name)[0]) + continue + local_paths[file_ext].append( + os.path.join(meta_cache_dir, meta_file_name)) + if not dataset_type: + raise FileNotFoundError( + f'{DatasetFormations.formation_mark_ext.value} file does not exist, ' + f'please use {DownloadMode.FORCE_REDOWNLOAD.value} .') + + return local_paths, DatasetFormations(dataset_type) + + def _fetch_meta_from_hub(self, dataset_name: str, namespace: str, + revision: str, meta_cache_dir: str): + + # Fetch id and type of dataset + dataset_id, dataset_type = self.api.get_dataset_id_and_type( + dataset_name, namespace) + + # Fetch meta file-list of dataset + file_list = self.api.get_dataset_meta_file_list( + dataset_name, namespace, dataset_id, revision) + + # Fetch urls of meta-files + local_paths, dataset_formation = self.api.get_dataset_meta_files_local_paths( + dataset_name, namespace, revision, meta_cache_dir, dataset_type, + file_list) + + return local_paths, dataset_formation diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 
5c8ea59f..dc0e1e48 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -1,34 +1,33 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import warnings from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Union) -import json import numpy as np import torch -from datasets import Dataset, DatasetDict -from datasets import load_dataset as hf_load_dataset -from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE +from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict from datasets.packaged_modules import _PACKAGED_DATASETS_MODULES -from datasets.utils.download_manager import DownloadConfig -from datasets.utils.file_utils import (is_relative_path, - relative_to_absolute_path) +from datasets.utils.file_utils import is_relative_path from modelscope.hub.repository import DatasetRepository +from modelscope.msdatasets.context.dataset_context_config import \ + DatasetContextConfig +from modelscope.msdatasets.data_loader.data_loader_manager import ( + LocalDataLoaderManager, LocalDataLoaderType, RemoteDataLoaderManager, + RemoteDataLoaderType) +from modelscope.msdatasets.dataset_cls.dataset import (ExternalDataset, + NativeIterableDataset) from modelscope.msdatasets.task_datasets.builder import build_task_dataset -from modelscope.msdatasets.utils.dataset_builder import ExternalDataset -from modelscope.msdatasets.utils.dataset_utils import ( - get_dataset_files, get_target_dataset_structure, load_dataset_builder) from modelscope.msdatasets.utils.delete_utils import DatasetDeleteManager -from modelscope.msdatasets.utils.download_utils import DatasetDownloadManager from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager from modelscope.utils.config import ConfigDict from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, - DEFAULT_DATASET_REVISION, - DatasetFormations, DownloadMode, Hubs, - UploadMode) + DEFAULT_DATASET_REVISION, DownloadMode, + Hubs, UploadMode) +from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger logger = get_logger() @@ -92,8 +91,10 @@ class MsDataset: # the underlying huggingface Dataset _hf_ds = None - def __init__(self, hf_ds: Dataset, target: Optional[str] = None): - self._hf_ds = hf_ds + def __init__(self, + ds_instance: Union[Dataset, IterableDataset, ExternalDataset], + target: Optional[str] = None): + self._hf_ds = ds_instance if target is not None and target not in self._hf_ds.features: raise TypeError( f'"target" must be a column of the dataset({list(self._hf_ds.features.keys())}, but got {target}' @@ -111,8 +112,18 @@ class MsDataset: return self._hf_ds[key] def __len__(self): + if isinstance(self._hf_ds, IterableDataset) or isinstance( + self._hf_ds, NativeIterableDataset): + logger.error( + f'object of type `{self._hf_ds.__class__.__name__}` has no __len__()' + ) + return None return len(self._hf_ds) + @property + def ds_instance(self): + return self._hf_ds + @property def config_kwargs(self): if isinstance(self._hf_ds, ExternalDataset): @@ -124,6 +135,13 @@ class MsDataset: def from_hf_dataset(cls, hf_ds: Union[Dataset, DatasetDict, ExternalDataset], target: str = None) -> Union[dict, 'MsDataset']: + r""" + @deprecated + This method is deprecated and may be removed in future releases, please use `to_ms_dataset()` instead. 
+ """ + warnings.warn( + 'from_hf_dataset is deprecated, please use to_ms_dataset instead.', + DeprecationWarning) if isinstance(hf_ds, Dataset): return cls(hf_ds, target) elif isinstance(hf_ds, DatasetDict): @@ -137,6 +155,34 @@ class MsDataset: f'"hf_ds" must be a Dataset or DatasetDict, but got {type(hf_ds)}' ) + @classmethod + def to_ms_dataset(cls, + ds_instance: Union[Dataset, DatasetDict, ExternalDataset, + NativeIterableDataset, + IterableDataset, IterableDatasetDict], + target: str = None) -> Union[dict, 'MsDataset']: + """Convert input to `MsDataset` instance.""" + if isinstance(ds_instance, Dataset): + return cls(ds_instance, target) + elif isinstance(ds_instance, DatasetDict): + if len(ds_instance.keys()) == 1: + return cls(next(iter(ds_instance.values())), target) + return {k: cls(v, target) for k, v in ds_instance.items()} + elif isinstance(ds_instance, ExternalDataset): + return cls(ds_instance) + elif isinstance(ds_instance, NativeIterableDataset): + return cls(ds_instance) + elif isinstance(ds_instance, IterableDataset): + return cls(ds_instance) + elif isinstance(ds_instance, IterableDatasetDict): + if len(ds_instance.keys()) == 1: + return cls(next(iter(ds_instance.values())), target) + return {k: cls(v, target) for k, v in ds_instance.items()} + else: + raise TypeError( + f'"ds_instance" must be a Dataset or DatasetDict, but got {type(ds_instance)}' + ) + @staticmethod def load( dataset_name: Union[str, list], @@ -152,14 +198,20 @@ class MsDataset: Sequence[str]]]]] = None, download_mode: Optional[DownloadMode] = DownloadMode. REUSE_DATASET_IF_EXISTS, + cache_dir: Optional[str] = MS_DATASETS_CACHE, + use_streaming: Optional[bool] = False, **config_kwargs, - ) -> Union[dict, 'MsDataset']: + ) -> Union[dict, 'MsDataset', NativeIterableDataset]: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. - Args: + Args: dataset_name (str): Path or name of the dataset. + The form of `namespace/dataset_name` is also supported. namespace(str, optional): Namespace of the dataset. It should not be None if you load a remote dataset from Hubs.modelscope, + namespace (str, optional): + Namespace of the dataset. It should not be None if you load a remote dataset + from Hubs.modelscope, target (str, optional): Name of the column to output. version (str, optional): Version of the dataset script to load: subset_name (str, optional): Defining the subset_name of the dataset. @@ -167,173 +219,83 @@ class MsDataset: data_files (str or Sequence or Mapping, optional): Path(s) to source data file(s). split (str, optional): Which split of the data to load. hub (Hubs or str, optional): When loading from a remote hub, where it is from. default Hubs.modelscope + download_mode (DownloadMode or str, optional): + How to treat existing datasets. default DownloadMode.REUSE_DATASET_IF_EXISTS + config_kwargs (additional keyword arguments): Keyword arguments to be passed download_mode (DownloadMode or str, optional): How to treat existing datasets. default DownloadMode.REUSE_DATASET_IF_EXISTS + cache_dir (str, Optional): User-define local cache directory. + use_streaming (bool, Optional): If set to True, no need to download all data files. + Instead, it streams the data progressively, and returns + NativeIterableDataset or a dict of NativeIterableDataset. **config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: - MsDataset (obj:`MsDataset`): MsDataset object for a certain dataset. + MsDataset (MsDataset): MsDataset object for a certain dataset. 
""" + download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS) hub = Hubs(hub or Hubs.modelscope) - if hub == Hubs.huggingface: - dataset = hf_load_dataset( - dataset_name, - name=subset_name, - revision=version, - split=split, - data_dir=data_dir, - data_files=data_files, - download_mode=download_mode.value, - **config_kwargs) - return MsDataset.from_hf_dataset(dataset, target=target) - elif hub == Hubs.modelscope: - return MsDataset._load_ms_dataset( - dataset_name, - namespace=namespace, - target=target, - subset_name=subset_name, - version=version, - split=split, - data_dir=data_dir, - data_files=data_files, - download_mode=download_mode, - **config_kwargs) - @staticmethod - def _load_ms_dataset(dataset_name: Union[str, list], - namespace: Optional[str] = None, - target: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - data_dir: Optional[str] = None, - data_files: Optional[Union[ - str, Sequence[str], - Mapping[str, Union[str, Sequence[str]]]]] = None, - download_mode: Optional[DownloadMode] = None, - **config_kwargs) -> Union[dict, 'MsDataset']: - from modelscope.hub.api import HubApi - api = HubApi() - download_dataset = '' - if isinstance(dataset_name, str): - dataset_formation = DatasetFormations.native - if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir( - dataset_name): - dataset_formation = DatasetFormations.hf_compatible - elif os.path.isfile(dataset_name) and dataset_name.endswith('.py'): - dataset_formation = DatasetFormations.hf_compatible - file_name = os.path.basename(dataset_name) - download_dataset = os.path.splitext(file_name)[0] - elif is_relative_path(dataset_name) and dataset_name.count( - '/') == 0: - download_dataset = dataset_name - dataset_scripts, dataset_formation, download_dir = api.fetch_dataset_scripts( - dataset_name, namespace, download_mode, version) - # dataset organized to be compatible with hf format - if dataset_formation == DatasetFormations.hf_compatible: - dataset_name = dataset_scripts['.py'][0] - else: - raise FileNotFoundError( - f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " - f'or any data file in the same directory.') + if not isinstance(dataset_name, str) and not isinstance( + dataset_name, list): + raise TypeError( + f'dataset_name must be `str` or `list`, but got {type(dataset_name)}' + ) - if dataset_formation == DatasetFormations.hf_compatible: - dataset = hf_load_dataset( - dataset_name, - name=subset_name, - revision=version, - split=split, - data_dir=data_dir, - data_files=data_files, - cache_dir=MS_DATASETS_CACHE, - download_mode=download_mode.value, - **config_kwargs) - else: - dataset = MsDataset._load_from_ms( - dataset_name, - dataset_scripts, - download_dir, - namespace=namespace, - version=version, - subset_name=subset_name, - split=split, - download_mode=download_mode, - **config_kwargs) - elif isinstance(dataset_name, list): + if isinstance(dataset_name, list): if target is None: target = 'target' - dataset = Dataset.from_dict({target: dataset_name}) - else: - raise TypeError('path must be a str or a list, but got' - f' {type(dataset_name)}') + dataset_inst = Dataset.from_dict({target: dataset_name}) + return MsDataset.to_ms_dataset(dataset_inst, target=target) - is_ci_test = os.getenv('CI_TEST') == 'True' - if download_dataset and not is_ci_test: - try: - api.on_dataset_download( - dataset_name=download_dataset, namespace=namespace) - api.dataset_download_uv( 
- dataset_name=download_dataset, namespace=namespace) - except Exception as e: - logger.error(e) + dataset_name = os.path.expanduser(dataset_name) + if is_relative_path(dataset_name) and dataset_name.count('/') == 1: + dataset_name_split = dataset_name.split('/') + namespace = dataset_name_split[0].strip() + dataset_name = dataset_name_split[1].strip() + if not namespace or not dataset_name: + raise 'The dataset_name should be in the form of `namespace/dataset_name` or `dataset_name`.' - return MsDataset.from_hf_dataset(dataset, target=target) - - @staticmethod - def _load_from_ms(dataset_name: str, - dataset_files: dict, - download_dir: str, - namespace: Optional[str] = None, - version: Optional[str] = DEFAULT_DATASET_REVISION, - subset_name: Optional[str] = None, - split: Optional[str] = None, - download_mode: Optional[DownloadMode] = None, - **config_kwargs) -> Union[Dataset, DatasetDict]: - for json_path in dataset_files['.json']: - if json_path.endswith(f'{dataset_name}.json'): - with open(json_path, encoding='utf-8') as dataset_json_file: - dataset_json = json.load(dataset_json_file) - break - target_subset_name, target_dataset_structure = get_target_dataset_structure( - dataset_json, subset_name, split) - meta_map, file_map, args_map = get_dataset_files( - target_dataset_structure, dataset_name, namespace, version) - builder = load_dataset_builder( - dataset_name, - subset_name, - namespace, - meta_data_files=meta_map, - zip_data_files=file_map, - args_map=args_map, - cache_dir=MS_DATASETS_CACHE, - version=version, - split=list(target_dataset_structure.keys()), - **config_kwargs) - - download_config = DownloadConfig( - cache_dir=download_dir, - force_download=bool( - download_mode == DownloadMode.FORCE_REDOWNLOAD), - force_extract=bool(download_mode == DownloadMode.FORCE_REDOWNLOAD), - use_etag=False, - ) - - dl_manager = DatasetDownloadManager( + # Init context config + dataset_context_config = DatasetContextConfig( dataset_name=dataset_name, namespace=namespace, version=version, - download_config=download_config, - data_dir=download_dir, - ) - builder.download_and_prepare( - dl_manager=dl_manager, - download_mode=download_mode.value, - try_from_hf_gcs=False) + subset_name=subset_name, + split=split, + target=target, + hub=hub, + data_dir=data_dir, + data_files=data_files, + download_mode=download_mode, + cache_root_dir=cache_dir, + use_streaming=use_streaming, + **config_kwargs) - ds = builder.as_dataset() - return ds + # Load from local disk + if dataset_name in _PACKAGED_DATASETS_MODULES or os.path.isdir( + dataset_name) or os.path.isfile(dataset_name): + dataset_inst = LocalDataLoaderManager( + dataset_context_config).load_dataset( + LocalDataLoaderType.HF_DATA_LOADER) + return MsDataset.to_ms_dataset(dataset_inst, target=target) + # Load from the huggingface hub + elif hub == Hubs.huggingface: + dataset_inst = RemoteDataLoaderManager( + dataset_context_config).load_dataset( + RemoteDataLoaderType.HF_DATA_LOADER) + return MsDataset.to_ms_dataset(dataset_inst, target=target) + # Load from the modelscope hub + elif hub == Hubs.modelscope: + dataset_inst = RemoteDataLoaderManager( + dataset_context_config).load_dataset( + RemoteDataLoaderType.MS_DATA_LOADER) + return MsDataset.to_ms_dataset(dataset_inst, target=target) + else: + raise 'Please adjust input args to specify a loading mode, we support following scenes: ' \ + 'loading from local disk, huggingface hub and modelscope hub.' 
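A usage sketch for the rewritten `MsDataset.load` above. The dataset id is a placeholder and resolving it requires access to the ModelScope hub; the `namespace/dataset_name` form and the `use_streaming` flag are the new entry points introduced here:

```python
from modelscope.msdatasets import MsDataset
from modelscope.utils.constant import DownloadMode

# Regular (non-streaming) load: data files are downloaded and cached.
ds = MsDataset.load(
    'some_namespace/some_dataset',   # placeholder 'namespace/dataset_name' id
    split='train',
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)

# Streaming load: nothing is fully downloaded up front; iteration yields
# dict samples, and ':FILE' columns are fetched lazily.
ds_stream = MsDataset.load(
    'some_namespace/some_dataset', split='train', use_streaming=True)
for sample in ds_stream.ds_instance:   # ds_instance property added earlier in this change
    print(sample)
    break
```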
def to_torch_dataset_with_processors( self, @@ -401,7 +363,7 @@ class MsDataset: :class:`tf.data.Dataset` """ - if not TORCH_AVAILABLE: + if not is_torch_available(): raise ImportError( 'The function to_torch_dataset requires pytorch to be installed' ) @@ -470,8 +432,9 @@ class MsDataset: ) return {key: output[i] for i, key in enumerate(sample_res)} + from tensorflow.data.experimental import AUTOTUNE tf_dataset = tf_dataset.map( - fetch_function, num_parallel_calls=tf.data.AUTOTUNE) + fetch_function, num_parallel_calls=AUTOTUNE) if label_cols: def split_features_and_labels(input_batch): @@ -494,7 +457,7 @@ class MsDataset: batch_size, drop_remainder=drop_remainder) if prefetch: - tf_dataset = tf_dataset.prefetch(tf.data.experimental.AUTOTUNE) + tf_dataset = tf_dataset.prefetch(AUTOTUNE) return tf_dataset def to_tf_dataset( @@ -533,7 +496,7 @@ class MsDataset: :class:`tf.data.Dataset` """ - if not TF_AVAILABLE: + if not is_tf_available(): raise ImportError( 'The function to_tf_dataset requires Tensorflow to be installed.' ) @@ -645,15 +608,16 @@ class MsDataset: auth_token: Optional[str] = None, git_path: Optional[str] = None) -> None: """Clone meta-file of dataset from the ModelScope Hub. + Args: dataset_work_dir (str): Current git working directory. dataset_id (str): Dataset id, in the form of your-namespace/your-dataset-name . - revision(`Optional[str]`): + revision (str, optional): revision of the model you want to clone from. Can be any of a branch, tag or commit hash - auth_token(`Optional[str]`): + auth_token (str, optional): token obtained when calling `HubApi.login()`. Usually you can safely ignore the parameter as the token is already saved when you login the first time, if None, we will use saved token. - git_path:(`Optional[str]`): + git_path (str, optional): The git command line path, if None, we use 'git' Returns: None diff --git a/modelscope/msdatasets/task_datasets/audio/__init__.py b/modelscope/msdatasets/task_datasets/audio/__init__.py index c62a8d9c..dc66bd8d 100644 --- a/modelscope/msdatasets/task_datasets/audio/__init__.py +++ b/modelscope/msdatasets/task_datasets/audio/__init__.py @@ -5,10 +5,12 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .kws_farfield_dataset import KWSDataset, KWSDataLoader + from .kws_nearfield_dataset import kws_nearfield_dataset else: _import_structure = { 'kws_farfield_dataset': ['KWSDataset', 'KWSDataLoader'], + 'kws_nearfield_dataset': ['kws_nearfield_dataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/audio/kws_nearfield_dataset.py b/modelscope/msdatasets/task_datasets/audio/kws_nearfield_dataset.py new file mode 100644 index 00000000..43f28e01 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/kws_nearfield_dataset.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 Binbin Zhang +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
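Going back to the `MsDataset` API changes above: the deprecated `from_hf_dataset()` and its replacement `to_ms_dataset()` can be exercised without any hub access, for example with an in-memory `datasets.Dataset`:

```python
from datasets import Dataset

from modelscope.msdatasets import MsDataset

hf_ds = Dataset.from_dict({'text': ['hello', 'world'], 'label': [0, 1]})
ms_ds = MsDataset.to_ms_dataset(hf_ds, target='text')

print(len(ms_ds))      # 2
print(ms_ds['text'])   # ['hello', 'world']
```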
+ +import random + +import torch +import torch.distributed as dist +from torch.utils.data import IterableDataset + +import modelscope.msdatasets.task_datasets.audio.kws_nearfield_processor as processor +from modelscope.trainers.audio.kws_utils.file_utils import (make_pair, + read_lists) +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class Processor(IterableDataset): + + def __init__(self, source, f, *args, **kw): + assert callable(f) + self.source = source + self.f = f + self.args = args + self.kw = kw + + def set_epoch(self, epoch): + self.source.set_epoch(epoch) + + def __iter__(self): + """ Return an iterator over the source dataset processed by the + given processor. + """ + assert self.source is not None + assert callable(self.f) + return self.f(iter(self.source), *self.args, **self.kw) + + def apply(self, f): + assert callable(f) + return Processor(self, f, *self.args, **self.kw) + + +class DistributedSampler: + + def __init__(self, shuffle=True, partition=True): + self.epoch = -1 + self.update() + self.shuffle = shuffle + self.partition = partition + + def update(self): + assert dist.is_available() + if dist.is_initialized(): + self.rank = dist.get_rank() + self.world_size = dist.get_world_size() + else: + self.rank = 0 + self.world_size = 1 + worker_info = torch.utils.data.get_worker_info() + if worker_info is None: + self.worker_id = 0 + self.num_workers = 1 + else: + self.worker_id = worker_info.id + self.num_workers = worker_info.num_workers + return dict( + rank=self.rank, + world_size=self.world_size, + worker_id=self.worker_id, + num_workers=self.num_workers) + + def set_epoch(self, epoch): + self.epoch = epoch + + def sample(self, data): + """ Sample data according to rank/world_size/num_workers + + Args: + data(List): input data list + + Returns: + List: data list after sample + """ + data = list(range(len(data))) + if self.partition: + if self.shuffle: + random.Random(self.epoch).shuffle(data) + data = data[self.rank::self.world_size] + data = data[self.worker_id::self.num_workers] + return data + + +class DataList(IterableDataset): + + def __init__(self, lists, shuffle=True, partition=True): + self.lists = lists + self.sampler = DistributedSampler(shuffle, partition) + + def set_epoch(self, epoch): + self.sampler.set_epoch(epoch) + + def __iter__(self): + sampler_info = self.sampler.update() + indexes = self.sampler.sample(self.lists) + for index in indexes: + # yield dict(src=src) + data = dict(src=self.lists[index]) + data.update(sampler_info) + yield data + + +def kws_nearfield_dataset(data_file, + trans_file, + conf, + symbol_table, + lexicon_table, + partition=True): + """ Construct dataset from arguments + + We have two shuffle stage in the Dataset. The first is global + shuffle at shards tar/raw file level. The second is global shuffle + at training samples level. 
+ + Args: + data_file (str): wave list with kaldi style + trans_file (str): transcription list with kaldi style + symbol_table (Dict): token list, [token_str, token_id] + lexicon_table (Dict): words list defined with basic tokens + partition (bool): whether to do data partition in terms of rank + """ + + lists = [] + filter_conf = conf.get('filter_conf', {}) + + wav_lists = read_lists(data_file) + trans_lists = read_lists(trans_file) + lists = make_pair(wav_lists, trans_lists) + + shuffle = conf.get('shuffle', True) + dataset = DataList(lists, shuffle=shuffle, partition=partition) + + dataset = Processor(dataset, processor.parse_wav) + dataset = Processor(dataset, processor.tokenize, symbol_table, + lexicon_table, conf.get('split_with_space', False)) + + dataset = Processor(dataset, processor.filter, **filter_conf) + + feature_extraction_conf = conf.get('feature_extraction_conf', {}) + if feature_extraction_conf['feature_type'] == 'mfcc': + dataset = Processor(dataset, processor.compute_mfcc, + **feature_extraction_conf) + elif feature_extraction_conf['feature_type'] == 'fbank': + dataset = Processor(dataset, processor.compute_fbank, + **feature_extraction_conf) + + spec_aug = conf.get('spec_aug', True) + if spec_aug: + spec_aug_conf = conf.get('spec_aug_conf', {}) + dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf) + + context_expansion = conf.get('context_expansion', False) + if context_expansion: + context_expansion_conf = conf.get('context_expansion_conf', {}) + dataset = Processor(dataset, processor.context_expansion, + **context_expansion_conf) + + frame_skip = conf.get('frame_skip', 1) + if frame_skip > 1: + dataset = Processor(dataset, processor.frame_skip, frame_skip) + + batch_conf = conf.get('batch_conf', {}) + dataset = Processor(dataset, processor.batch, **batch_conf) + dataset = Processor(dataset, processor.padding) + return dataset diff --git a/modelscope/msdatasets/task_datasets/audio/kws_nearfield_processor.py b/modelscope/msdatasets/task_datasets/audio/kws_nearfield_processor.py new file mode 100644 index 00000000..d27c9e38 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/kws_nearfield_processor.py @@ -0,0 +1,427 @@ +# Copyright (c) 2021 Binbin Zhang +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
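A hypothetical configuration for `kws_nearfield_dataset()` above; the keys mirror the ones the function reads via `conf.get(...)`, while the file paths and the tiny symbol/lexicon tables are made up for illustration:

```python
from modelscope.msdatasets.task_datasets.audio import kws_nearfield_dataset

conf = {
    'shuffle': True,
    'split_with_space': False,
    'filter_conf': {'max_length': 1200, 'min_length': 40},
    'feature_extraction_conf': {
        'feature_type': 'fbank',
        'num_mel_bins': 23,
        'frame_length': 25,
        'frame_shift': 10,
        'dither': 0.0,
    },
    'spec_aug': True,
    'spec_aug_conf': {'num_t_mask': 2, 'num_f_mask': 2, 'max_t': 50, 'max_f': 10},
    'context_expansion': True,
    'context_expansion_conf': {'left': 2, 'right': 2},
    'frame_skip': 3,
    'batch_conf': {'batch_size': 16},
}

symbol_table = {'': 0, 'ni3': 1, 'hao3': 2}      # {token: id}; '' is the OOV fallback
lexicon_table = {'你': ['ni3'], '好': ['hao3']}   # {word: [basic tokens]}

dataset = kws_nearfield_dataset(
    data_file='data/train/wav.scp',   # kaldi-style wave list (placeholder path)
    trans_file='data/train/text',     # kaldi-style transcriptions (placeholder path)
    conf=conf,
    symbol_table=symbol_table,
    lexicon_table=lexicon_table,
    partition=True)

# Each yielded item is a batch tuple (keys, padded_feats, padded_labels,
# feat_lengths, label_lengths), as produced by the final padding() stage.
```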
+ +import logging +import random + +import json +import kaldiio +import numpy as np +import torch +import torchaudio +import torchaudio.compliance.kaldi as kaldi +from torch.nn.utils.rnn import pad_sequence + +# torch.set_printoptions(profile="full") + + +def parse_wav(data): + """ Parse key/wav/txt from dict line + + Args: + data: Iterable[dict()], dict has key/wav/txt/sample_rate keys + + Returns: + Iterable[{key, wav, label, sample_rate}] + """ + for sample in data: + assert 'src' in sample + obj = sample['src'] + assert 'key' in obj + assert 'wav' in obj + assert 'txt' in obj + key = obj['key'] + wav_file = obj['wav'] + txt = obj['txt'] + + try: + sample_rate, kaldi_waveform = kaldiio.load_mat(wav_file) + waveform = torch.tensor(kaldi_waveform, dtype=torch.float32) + waveform = waveform.unsqueeze(0) + example = dict( + key=key, label=txt, wav=waveform, sample_rate=sample_rate) + yield example + except Exception: + logging.warning('Failed to read {}'.format(wav_file)) + + +def tokenize(data, token_table, lexicon_table, split_with_space=False): + """ Decode text to chars + Inplace operation + + Args: + data: Iterable[{key, wav, txt, sample_rate}] + token_table (Dict): token list, [token_str, token_id] + lexicon_table (Dict): words list defined with basic tokens + split_with_space (bool): if transciption split with space or not + + Returns: + Iterable[{key, wav, txt, tokens, label, sample_rate}] + """ + for sample in data: + assert 'label' in sample + txt = sample['label'].strip() + + if token_table is None or lexicon_table is None: + # to compatible with hard token map for max-pooling loss + label = int(txt) + else: + parts = [txt] + tokens = [] + for part in parts: + if split_with_space: + part = part.split(' ') + for ch in part: + if ch == ' ': + ch = '▁' + tokens.append(ch) + + label = [] + for ch in tokens: + if ch in lexicon_table: + for sub_ch in lexicon_table[ch]: + if sub_ch in token_table: + label.append(token_table[sub_ch]) + else: + label.append(token_table['']) + else: + label.append(token_table['']) + + sample['tokens'] = tokens + sample['label'] = label + yield sample + + +def filter(data, max_length=10240, min_length=10): + """ Filter sample according to feature and label length + Inplace operation. + + Args:: + data: Iterable[{key, wav, label, sample_rate}] + max_length: drop utterance which is greater than max_length(10ms) + min_length: drop utterance which is less than min_length(10ms) + + Returns: + Iterable[{key, wav, label, sample_rate}] + """ + for sample in data: + assert 'sample_rate' in sample + assert 'wav' in sample or 'feat' in sample + num_frames = -1 + if 'wav' in sample: + # sample['wav'] is torch.Tensor, we have 100 frames every second + num_frames = int(sample['wav'].size(1) / sample['sample_rate'] + * 100) + elif 'feat' in sample: + num_frames = sample['feat'].size(0) + + # print("{} num frames is {}".format(sample['key'], num_frames)) + if num_frames < min_length: + logging.warning('{} is discard for too short: {} frames'.format( + sample['key'], num_frames)) + continue + if num_frames > max_length: + logging.warning('{} is discard for too long: {} frames'.format( + sample['key'], num_frames)) + continue + yield sample + + +def resample(data, resample_rate=16000): + """ Resample data. + Inplace operation. 
+ + Args: + data: Iterable[{key, wav, label, sample_rate}] + resample_rate: target resample rate + + Returns: + Iterable[{key, wav, label, sample_rate}] + """ + for sample in data: + assert 'sample_rate' in sample + if 'wav' in sample: + sample_rate = sample['sample_rate'] + waveform = sample['wav'] + if sample_rate != resample_rate: + sample['sample_rate'] = resample_rate + sample['wav'] = torchaudio.transforms.Resample( + orig_freq=sample_rate, new_freq=resample_rate)( + waveform) + yield sample + + +def speed_perturb(data, speeds=None): + """ Apply speed perturb to the data. + Inplace operation. + + Args: + data: Iterable[{key, wav, label, sample_rate}] + speeds(List[float]): optional speed + + Returns: + Iterable[{key, wav, label, sample_rate}] + """ + if speeds is None: + speeds = [0.9, 1.0, 1.1] + for sample in data: + assert 'sample_rate' in sample + assert 'wav' in sample + sample_rate = sample['sample_rate'] + waveform = sample['wav'] + speed = random.choice(speeds) + if speed != 1.0: + wav, _ = torchaudio.sox_effects.apply_effects_tensor( + waveform, sample_rate, + [['speed', str(speed)], ['rate', str(sample_rate)]]) + sample['wav'] = wav + + yield sample + + +def compute_mfcc( + data, + feature_type='mfcc', + num_ceps=80, + num_mel_bins=80, + frame_length=25, + frame_shift=10, + dither=0.0, +): + """Extract mfcc + + Args: + data: Iterable[{key, wav, label, sample_rate}] + + Returns: + Iterable[{key, feat, label}] + """ + for sample in data: + assert 'sample_rate' in sample + assert 'wav' in sample + assert 'key' in sample + assert 'label' in sample + sample_rate = sample['sample_rate'] + waveform = sample['wav'] + # waveform = waveform * (1 << 15) + # Only keep key, feat, label + mat = kaldi.mfcc( + waveform, + num_ceps=num_ceps, + num_mel_bins=num_mel_bins, + frame_length=frame_length, + frame_shift=frame_shift, + dither=dither, + energy_floor=0.0, + sample_frequency=sample_rate, + ) + yield dict(key=sample['key'], label=sample['label'], feat=mat) + + +def compute_fbank(data, + feature_type='fbank', + num_mel_bins=23, + frame_length=25, + frame_shift=10, + dither=0.0): + """ Extract fbank + + Args: + data: Iterable[{key, wav, label, sample_rate}] + + Returns: + Iterable[{key, feat, label}] + """ + for sample in data: + assert 'sample_rate' in sample + assert 'wav' in sample + assert 'key' in sample + assert 'label' in sample + sample_rate = sample['sample_rate'] + waveform = sample['wav'] + # waveform = waveform * (1 << 15) + # Only keep key, feat, label + mat = kaldi.fbank( + waveform, + num_mel_bins=num_mel_bins, + frame_length=frame_length, + frame_shift=frame_shift, + dither=dither, + energy_floor=0.0, + window_type='hamming', + sample_frequency=sample_rate) + yield dict(key=sample['key'], label=sample['label'], feat=mat) + + +def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10): + """ Do spec augmentation + Inplace operation + + Args: + data: Iterable[{key, feat, label}] + num_t_mask: number of time mask to apply + num_f_mask: number of freq mask to apply + max_t: max width of time mask + max_f: max width of freq mask + + Returns + Iterable[{key, feat, label}] + """ + for sample in data: + assert 'feat' in sample + x = sample['feat'] + assert isinstance(x, torch.Tensor) + y = x.clone().detach() + max_frames = y.size(0) + max_freq = y.size(1) + # time mask + for i in range(num_t_mask): + start = random.randint(0, max_frames - 1) + length = random.randint(1, max_t) + end = min(max_frames, start + length) + y[start:end, :] = 0 + # freq mask + for i in 
range(num_f_mask): + start = random.randint(0, max_freq - 1) + length = random.randint(1, max_f) + end = min(max_freq, start + length) + y[:, start:end] = 0 + sample['feat'] = y + yield sample + + +def shuffle(data, shuffle_size=1000): + """ Local shuffle the data + + Args: + data: Iterable[{key, feat, label}] + shuffle_size: buffer size for shuffle + + Returns: + Iterable[{key, feat, label}] + """ + buf = [] + for sample in data: + buf.append(sample) + if len(buf) >= shuffle_size: + random.shuffle(buf) + for x in buf: + yield x + buf = [] + # The sample left over + random.shuffle(buf) + for x in buf: + yield x + + +def context_expansion(data, left=1, right=1): + """ expand left and right frames + Args: + data: Iterable[{key, feat, label}] + left (int): feature left context frames + right (int): feature right context frames + + Returns: + data: Iterable[{key, feat, label}] + """ + for sample in data: + index = 0 + feats = sample['feat'] + ctx_dim = feats.shape[0] + ctx_frm = feats.shape[1] * (left + right + 1) + feats_ctx = torch.zeros(ctx_dim, ctx_frm, dtype=torch.float32) + for lag in range(-left, right + 1): + feats_ctx[:, index:index + feats.shape[1]] = torch.roll( + feats, -lag, 0) + index = index + feats.shape[1] + + # replication pad left margin + for idx in range(left): + for cpx in range(left - idx): + feats_ctx[idx, cpx * feats.shape[1]:(cpx + 1) + * feats.shape[1]] = feats_ctx[left, :feats.shape[1]] + + feats_ctx = feats_ctx[:feats_ctx.shape[0] - right] + sample['feat'] = feats_ctx + yield sample + + +def frame_skip(data, skip_rate=1): + """ skip frame + Args: + data: Iterable[{key, feat, label}] + skip_rate (int): take every N-frames for model input + + Returns: + data: Iterable[{key, feat, label}] + """ + for sample in data: + feats_skip = sample['feat'][::skip_rate, :] + sample['feat'] = feats_skip + yield sample + + +def batch(data, batch_size=16): + """ Static batch the data by `batch_size` + + Args: + data: Iterable[{key, feat, label}] + batch_size: batch size + + Returns: + Iterable[List[{key, feat, label}]] + """ + buf = [] + for sample in data: + buf.append(sample) + if len(buf) >= batch_size: + yield buf + buf = [] + if len(buf) > 0: + yield buf + + +def padding(data): + """ Padding the data into training data + + Args: + data: Iterable[List[{key, feat, label}]] + + Returns: + Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)] + """ + for sample in data: + assert isinstance(sample, list) + feats_length = torch.tensor([x['feat'].size(0) for x in sample], + dtype=torch.int32) + order = torch.argsort(feats_length, descending=True) + feats_lengths = torch.tensor( + [sample[i]['feat'].size(0) for i in order], dtype=torch.int32) + sorted_feats = [sample[i]['feat'] for i in order] + sorted_keys = [sample[i]['key'] for i in order] + + assert type(sample[0]['label']) is list + + sorted_labels = [ + torch.tensor(sample[i]['label'], dtype=torch.int32) for i in order + ] + label_lengths = torch.tensor([len(sample[i]['label']) for i in order], + dtype=torch.int32) + + padded_feats = pad_sequence( + sorted_feats, batch_first=True, padding_value=0) + padded_labels = pad_sequence( + sorted_labels, batch_first=True, padding_value=-1) + yield (sorted_keys, padded_feats, padded_labels, feats_lengths, + label_lengths) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/__init__.py b/modelscope/msdatasets/task_datasets/damoyolo/__init__.py new file mode 100644 index 00000000..2a3bccdb --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/__init__.py @@ -0,0 
+1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .build import build_dataloader, build_dataset diff --git a/modelscope/msdatasets/task_datasets/damoyolo/build.py b/modelscope/msdatasets/task_datasets/damoyolo/build.py new file mode 100644 index 00000000..7592fe6b --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/build.py @@ -0,0 +1,141 @@ +# Copyright © Alibaba, Inc. and its affiliates. +import bisect +import copy +import math + +import torch.utils.data + +from modelscope.utils.torch_utils import get_world_size +from . import datasets as D +from .collate_batch import BatchCollator +from .datasets import MosaicWrapper +from .samplers import DistributedSampler, IterationBasedBatchSampler +from .transforms import build_transforms + + +def build_dataset(cfg, + image_dir, + ann_file, + is_train=True, + mosaic_mixup=None, + dataset_format='COCODataset'): + + factory = getattr(D, dataset_format) + args = dict(root=image_dir, ann_file=ann_file) + args['transforms'] = None + # make dataset from factory + dataset = factory(**args) + # mosaic wrapped + if is_train and mosaic_mixup is not None: + dataset = MosaicWrapper( + dataset=dataset, + img_size=mosaic_mixup.mosaic_size, + mosaic_prob=mosaic_mixup.mosaic_prob, + mixup_prob=mosaic_mixup.mixup_prob, + transforms=None, + degrees=mosaic_mixup.degrees, + translate=mosaic_mixup.translate, + shear=mosaic_mixup.shear, + mosaic_scale=mosaic_mixup.mosaic_scale, + mixup_scale=mosaic_mixup.mixup_scale) + + return [ + dataset, + ] + + +def make_data_sampler(dataset, shuffle, distributed=False): + if distributed: + return DistributedSampler(dataset, shuffle=shuffle) + else: + return torch.utils.data.RandomSampler(dataset) + + +def _quantize(x, bins): + bins = copy.copy(bins) + bins = sorted(bins) + quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) + return quantized + + +def _compute_aspect_ratios(dataset): + aspect_ratios = [] + for i in range(len(dataset)): + img_info = dataset.get_img_info(i) + aspect_ratio = float(img_info['height']) / float(img_info['width']) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def make_batch_sampler(dataset, + sampler, + images_per_batch, + num_iters=None, + start_iter=0, + mosaic_warpper=False): + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, images_per_batch, drop_last=False) + if num_iters is not None: + batch_sampler = IterationBasedBatchSampler( + batch_sampler, num_iters, start_iter, enable_mosaic=mosaic_warpper) + return batch_sampler + + +def build_dataloader(datasets, + augment, + batch_size=128, + start_epoch=None, + total_epochs=None, + no_aug_epochs=0, + is_train=True, + num_workers=8, + size_div=32, + distributed=False): + + num_gpus = get_world_size() + assert (batch_size % num_gpus == 0), 'training_imgs_per_batch ({}) must be' \ + 'divisible by the number of GPUs ({}) used.'.format(batch_size, num_gpus) + images_per_gpu = batch_size // num_gpus + + if is_train: + iters_per_epoch = math.ceil(len(datasets[0]) / batch_size) + shuffle = True + num_iters = total_epochs * iters_per_epoch + start_iter = start_epoch * iters_per_epoch + else: + iters_per_epoch = math.ceil(len(datasets[0]) / batch_size) + shuffle = False + num_iters = None + start_iter = 0 + + transforms = augment.transform + enable_mosaic_mixup = 'mosaic_mixup' in augment + + transforms = build_transforms(start_epoch, total_epochs, no_aug_epochs, + iters_per_epoch, num_workers, batch_size, + num_gpus, **transforms) + + for dataset in datasets: + dataset._transforms = 
transforms + if hasattr(dataset, '_dataset'): + dataset._dataset._transforms = transforms + + data_loaders = [] + for dataset in datasets: + sampler = make_data_sampler(dataset, shuffle) + batch_sampler = make_batch_sampler(dataset, sampler, images_per_gpu, + num_iters, start_iter, + enable_mosaic_mixup) + collator = BatchCollator(size_div) + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=collator, + ) + data_loaders.append(data_loader) + if is_train: + assert len( + data_loaders) == 1, 'multi-training set is not supported yet!' + return data_loaders[0] + return data_loaders diff --git a/modelscope/msdatasets/task_datasets/damoyolo/collate_batch.py b/modelscope/msdatasets/task_datasets/damoyolo/collate_batch.py new file mode 100644 index 00000000..ac14a3c4 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/collate_batch.py @@ -0,0 +1,32 @@ +# Copyright © Alibaba, Inc. and its affiliates. +from modelscope.models.cv.tinynas_detection.damo.structures.image_list import \ + to_image_list + + +class BatchCollator(object): + """ + From a list of samples from the dataset, + returns the batched images and targets. + This should be passed to the DataLoader + """ + + def __init__(self, size_divisible=0): + self.size_divisible = size_divisible + + def __call__(self, batch): + transposed_batch = list(zip(*batch)) + images = to_image_list(transposed_batch[0], self.size_divisible) + targets = transposed_batch[1] + img_ids = transposed_batch[2] + return images, targets, img_ids + + +class TTACollator(object): + """ + From a list of samples from the dataset, + returns the images and targets. + Images should be converted to batched images in `im_detect_bbox_aug` + """ + + def __call__(self, batch): + return list(zip(*batch)) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/datasets/__init__.py b/modelscope/msdatasets/task_datasets/damoyolo/datasets/__init__.py new file mode 100644 index 00000000..7dc2de4a --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/datasets/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .coco import COCODataset +from .mosaic_wrapper import MosaicWrapper + +__all__ = [ + 'COCODataset', + 'MosaicWrapper', +] diff --git a/modelscope/msdatasets/task_datasets/damoyolo/datasets/coco.py b/modelscope/msdatasets/task_datasets/damoyolo/datasets/coco.py new file mode 100644 index 00000000..99999eb1 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/datasets/coco.py @@ -0,0 +1,112 @@ +# Copyright © Alibaba, Inc. and its affiliates. 
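The _quantize and _compute_aspect_ratios helpers above bucket images by aspect ratio so a grouped sampler can batch similarly shaped images together. A standalone sketch of that bucketing, mirroring _quantize; the bin boundary and ratios are made up.

```python
import bisect
import copy

# Mirror of _quantize above: map each aspect ratio (height / width) to the
# index of the bin it falls into, via bisect_right over sorted boundaries.
bins = [1.0]                        # a single boundary at ratio 1.0
ratios = [0.75, 1.0, 1.33, 2.0]     # per-image aspect ratios (made up)
sorted_bins = sorted(copy.copy(bins))
group_ids = [bisect.bisect_right(sorted_bins, r) for r in ratios]
print(group_ids)                    # [0, 1, 1, 1]
```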
+import cv2 +import numpy as np +import torch +from torchvision.datasets.coco import CocoDetection + +from modelscope.models.cv.tinynas_detection.damo.structures.bounding_box import \ + BoxList + +cv2.setNumThreads(0) + + +class COCODataset(CocoDetection): + + def __init__(self, ann_file, root, transforms=None): + super(COCODataset, self).__init__(root, ann_file) + # sort indices for reproducible results + self.ids = sorted(self.ids) + + self.json_category_id_to_contiguous_id = { + v: i + 1 + for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k + for k, v in self.json_category_id_to_contiguous_id.items() + } + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + self._transforms = transforms + + def __getitem__(self, inp): + if type(inp) is tuple: + idx = inp[1] + else: + idx = inp + img, anno = super(COCODataset, self).__getitem__(idx) + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj['iscrowd'] == 0] + + boxes = [obj['bbox'] for obj in anno] + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode='xywh').convert('xyxy') + + classes = [obj['category_id'] for obj in anno] + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + + classes = torch.tensor(classes) + target.add_field('labels', classes) + + if anno and 'keypoints' in anno[0]: + keypoints = [obj['keypoints'] for obj in anno] + target.add_field('keypoints', keypoints) + + target = target.clip_to_image(remove_empty=True) + + # PIL to numpy array + img = np.asarray(img) # rgb + + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target, idx + + def pull_item(self, idx): + img, anno = super(COCODataset, self).__getitem__(idx) + + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj['iscrowd'] == 0] + + boxes = [obj['bbox'] for obj in anno] + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode='xywh').convert('xyxy') + target = target.clip_to_image(remove_empty=True) + + classes = [obj['category_id'] for obj in anno] + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + + obj_masks = [] + for obj in anno: + obj_mask = [] + if 'segmentation' in obj: + for mask in obj['segmentation']: + obj_mask += mask + if len(obj_mask) > 0: + obj_masks.append(obj_mask) + seg_masks = [ + np.array(obj_mask, dtype=np.float32).reshape(-1, 2) + for obj_mask in obj_masks + ] + + res = np.zeros((len(target.bbox), 5)) + for idx in range(len(target.bbox)): + res[idx, 0:4] = target.bbox[idx] + res[idx, 4] = classes[idx] + + img = np.asarray(img) # rgb + + return img, res, seg_masks, idx + + def load_anno(self, idx): + _, anno = super(COCODataset, self).__getitem__(idx) + anno = [obj for obj in anno if obj['iscrowd'] == 0] + classes = [obj['category_id'] for obj in anno] + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + + return classes + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.coco.imgs[img_id] + return img_data diff --git a/modelscope/msdatasets/task_datasets/damoyolo/datasets/mosaic_wrapper.py b/modelscope/msdatasets/task_datasets/damoyolo/datasets/mosaic_wrapper.py new file mode 100644 index 00000000..318c46e6 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/datasets/mosaic_wrapper.py @@ -0,0 
+1,424 @@ +# Copyright © Alibaba, Inc. and its affiliates. + +import math +import random + +import cv2 +import numpy as np +import torch + +from modelscope.models.cv.tinynas_detection.damo.structures.bounding_box import \ + BoxList +from modelscope.models.cv.tinynas_detection.damo.utils import adjust_box_anns + + +def xyn2xy(x, scale, padw=0, padh=0): + # Convert normalized segments into pixel segments, shape (n,2) + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[:, 0] = scale * x[:, 0] + padw # top left x + y[:, 1] = scale * x[:, 1] + padh # top left y + return y + + +def resample_segments(segments, n=1000): + # Up-sample an (n,2) segment + for i, s in enumerate(segments): + x = np.linspace(0, len(s) - 1, n) + xp = np.arange(len(s)) + segments[i] = np.concatenate([ + np.interp(x, xp, s[:, i]) for i in range(2) + ]).reshape(2, -1).T # segment xy + return segments + + +def segment2box(segment, width=640, height=640): + # Convert 1 segment label to 1 box label, applying inside-image constraint, + # i.e. (xy1, xy2, ...) to (xyxy) + x, y = segment.T # segment xy + inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) + x, y, = x[inside], y[inside] + return np.array([x.min(), y.min(), x.max(), + y.max()]) if any(x) else np.zeros((1, 4)) # xyxy + + +def get_aug_params(value, center=0): + if isinstance(value, float): + return random.uniform(center - value, center + value) + elif len(value) == 2: + return random.uniform(value[0], value[1]) + else: + raise ValueError( + 'Affine params should be either a sequence containing two values\ + or single float values. Got {}'.format(value)) + + +def box_candidates(box1, + box2, + wh_thr=2, + ar_thr=20, + area_thr=0.1, + eps=1e-16): # box1(4,n), box2(4,n) + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps)) # aspect ratio + valid_w = w2 > wh_thr + valid_h = h2 > wh_thr + valid_ar = (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr) + return valid_w & valid_h & valid_ar + + +def get_transform_matrix(img_shape, new_shape, degrees, scale, shear, + translate): + new_height, new_width = new_shape + # Center + C = np.eye(3) + C[0, 2] = -img_shape[1] / 2 # x translation (pixels) + C[1, 2] = -img_shape[0] / 2 # y translation (pixels) + # Rotation and Scale + R = np.eye(3) + a = random.uniform(-degrees, degrees) + s = get_aug_params(scale, center=1.0) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi + / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi + / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform( + 0.5 - translate, 0.5 + translate) * new_width # x translation (pixels) + T[1, + 2] = random.uniform(0.5 - translate, 0.5 + + translate) * new_height # y transla ion (pixels) + + # Combined rotation matrix + M = T @ S @ R @ C # order of operations (right to left) is IMPORTANT + return M, s + + +def random_affine( + img, + targets=(), + segments=None, + target_size=(640, 640), + degrees=10, + translate=0.1, + scales=0.1, + shear=10, +): + M, scale = get_transform_matrix(img.shape[:2], target_size, degrees, + scales, shear, translate) + + if (M != np.eye(3)).any(): # image changed + img = cv2.warpAffine( + img, M[:2], dsize=target_size, borderValue=(114, 114, 114)) + + # Transform label coordinates + n = len(targets) + if (n and len(segments) == 0) or (len(segments) != 
len(targets)): + new = np.zeros((n, 4)) + + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + xy = xy[:, :2].reshape(n, 8) # perspective rescale or affine + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + new = np.concatenate( + (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # clip + new[:, [0, 2]] = new[:, [0, 2]].clip(0, target_size[0]) + new[:, [1, 3]] = new[:, [1, 3]].clip(0, target_size[1]) + + else: + segments = resample_segments(segments) # upsample + new = np.zeros((len(targets), 4)) + assert len(segments) <= len(targets) + for i, segment in enumerate(segments): + xy = np.ones((len(segment), 3)) + xy[:, :2] = segment + xy = xy @ M.T # transform + xy = xy[:, :2] # perspective rescale or affine + # clip + new[i] = segment2box(xy, target_size[0], target_size[1]) + + # filter candidates + i = box_candidates( + box1=targets[:, 0:4].T * scale, box2=new.T, area_thr=0.1) + targets = targets[i] + targets[:, 0:4] = new[i] + + return img, targets + + +def get_mosaic_coordinate(mosaic_image, mosaic_index, xc, yc, w, h, input_h, + input_w): + # TODO update doc + # index0 to top left part of image + if mosaic_index == 0: + x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc + small_coord = w - (x2 - x1), h - (y2 - y1), w, h + # index1 to top right part of image + elif mosaic_index == 1: + x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc + small_coord = 0, h - (y2 - y1), min(w, x2 - x1), h + # index2 to bottom left part of image + elif mosaic_index == 2: + x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) + small_coord = w - (x2 - x1), 0, w, min(y2 - y1, h) + # index2 to bottom right part of image + elif mosaic_index == 3: + x1, y1, x2, y2 = xc, yc, min(xc + w, + input_w * 2), min(input_h * 2, + yc + h) # noqa + small_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h) + return (x1, y1, x2, y2), small_coord + + +class MosaicWrapper(torch.utils.data.dataset.Dataset): + """Detection dataset wrapper that performs mixup for normal dataset.""" + + def __init__(self, + dataset, + img_size, + mosaic_prob=1.0, + mixup_prob=1.0, + transforms=None, + degrees=10.0, + translate=0.1, + mosaic_scale=(0.1, 2.0), + mixup_scale=(0.5, 1.5), + shear=2.0, + *args): + super().__init__() + self._dataset = dataset + self.input_dim = img_size + self._transforms = transforms + self.degrees = degrees + self.translate = translate + self.scale = mosaic_scale + self.shear = shear + self.mixup_scale = mixup_scale + self.mosaic_prob = mosaic_prob + self.mixup_prob = mixup_prob + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, inp): + if type(inp) is tuple: + enable_mosaic_mixup = inp[0] + idx = inp[1] + else: + enable_mosaic_mixup = False + idx = inp + img, labels, segments, img_id = self._dataset.pull_item(idx) + + if enable_mosaic_mixup: + if random.random() < self.mosaic_prob: + mosaic_labels = [] + mosaic_segments = [] + input_h, input_w = self.input_dim[0], self.input_dim[1] + + yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) + xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) + + # 3 additional image indices + indices = [idx] + [ + random.randint(0, + len(self._dataset) - 1) for _ in range(3) + ] + + for i_mosaic, index in enumerate(indices): + img, _labels, _segments, img_id = self._dataset.pull_item( + index) + h0, w0 = img.shape[:2] # orig hw + scale = min(1. * input_h / h0, 1. 
* input_w / w0) + img = cv2.resize( + img, (int(w0 * scale), int(h0 * scale)), + interpolation=cv2.INTER_LINEAR) + # generate output mosaic image + (h, w, c) = img.shape[:3] + if i_mosaic == 0: + mosaic_img = np.full((input_h * 2, input_w * 2, c), + 114, + dtype=np.uint8) # pad 114 + + (l_x1, l_y1, l_x2, + l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate( + mosaic_img, i_mosaic, xc, yc, w, h, input_h, input_w) + + mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, + s_x1:s_x2] + padw, padh = l_x1 - s_x1, l_y1 - s_y1 + + labels = _labels.copy() + # Normalized xywh to pixel xyxy format + if _labels.size > 0: + labels[:, 0] = scale * _labels[:, 0] + padw + labels[:, 1] = scale * _labels[:, 1] + padh + labels[:, 2] = scale * _labels[:, 2] + padw + labels[:, 3] = scale * _labels[:, 3] + padh + segments = [ + xyn2xy(x, scale, padw, padh) for x in _segments + ] + mosaic_segments.extend(segments) + mosaic_labels.append(labels) + + if len(mosaic_labels): + mosaic_labels = np.concatenate(mosaic_labels, 0) + np.clip( + mosaic_labels[:, 0], + 0, + 2 * input_w, + out=mosaic_labels[:, 0]) + np.clip( + mosaic_labels[:, 1], + 0, + 2 * input_h, + out=mosaic_labels[:, 1]) + np.clip( + mosaic_labels[:, 2], + 0, + 2 * input_w, + out=mosaic_labels[:, 2]) + np.clip( + mosaic_labels[:, 3], + 0, + 2 * input_h, + out=mosaic_labels[:, 3]) + + if len(mosaic_segments): + assert input_w == input_h + for x in mosaic_segments: + np.clip( + x, 0, 2 * input_w, + out=x) # clip when using random_perspective() + + img, labels = random_affine( + mosaic_img, + mosaic_labels, + mosaic_segments, + target_size=(input_w, input_h), + degrees=self.degrees, + translate=self.translate, + scales=self.scale, + shear=self.shear, + ) + + # ----------------------------------------------------------------- + # CopyPaste: https://arxiv.org/abs/2012.07177 + # ----------------------------------------------------------------- + if (not len(labels) == 0 and random.random() < self.mixup_prob): + img, labels = self.mixup(img, labels, self.input_dim) + + # transfer labels to BoxList + h_tmp, w_tmp = img.shape[:2] + boxes = np.array([label[:4] for label in labels]) + boxes = torch.as_tensor(boxes).reshape(-1, 4) + areas = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) + valid_idx = areas > 4 + + target = BoxList(boxes[valid_idx], (w_tmp, h_tmp), mode='xyxy') + + classes = [label[4] for label in labels] + classes = torch.tensor(classes)[valid_idx] + target.add_field('labels', classes.long()) + + if self._transforms is not None: + img, target = self._transforms(img, target) + + # ----------------------------------------------------------------- + # img_info and img_id are not used for training. + # They are also hard to be specified on a mosaic image. 
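random_affine above warps boxes by pushing all four corners of each xyxy box through the combined 3x3 matrix M = T @ S @ R @ C and re-taking per-box min/max. A standalone numpy sketch of that corner trick, using a made-up translation-only matrix so the expected result is easy to verify by hand.

```python
import numpy as np

# Same corner trick as random_affine above: transform all four corners of
# an xyxy box with a 3x3 affine M, then re-take min/max per box.
M = np.array([[1.0, 0.0, 10.0],    # made-up affine: translate +10 px in x
              [0.0, 1.0,  5.0],    # and +5 px in y
              [0.0, 0.0,  1.0]])
targets = np.array([[20.0, 30.0, 60.0, 80.0]])   # one xyxy box
n = len(targets)
xy = np.ones((n * 4, 3))
xy[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)
xy = (xy @ M.T)[:, :2].reshape(n, 8)             # transformed corners
x, y = xy[:, [0, 2, 4, 6]], xy[:, [1, 3, 5, 7]]
new = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
print(new)                                       # [[30. 35. 70. 85.]]
```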
+ # ----------------------------------------------------------------- + return img, target, img_id + + else: + return self._dataset.__getitem__(idx) + + def mixup(self, origin_img, origin_labels, input_dim): + jit_factor = random.uniform(*self.mixup_scale) + FLIP = random.uniform(0, 1) > 0.5 + cp_labels = [] + while len(cp_labels) == 0: + cp_index = random.randint(0, self.__len__() - 1) + cp_labels = self._dataset.load_anno(cp_index) + img, cp_labels, _, _ = self._dataset.pull_item(cp_index) + + if len(img.shape) == 3: + cp_img = np.ones((input_dim[0], input_dim[1], 3), + dtype=np.uint8) * 114 # pad 114 + else: + cp_img = np.ones(input_dim, dtype=np.uint8) * 114 # pad 114 + + cp_scale_ratio = min(input_dim[0] / img.shape[0], + input_dim[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * cp_scale_ratio), + int(img.shape[0] * cp_scale_ratio)), + interpolation=cv2.INTER_LINEAR, + ) + + cp_img[:int(img.shape[0] + * cp_scale_ratio), :int(img.shape[1] + * cp_scale_ratio)] = resized_img + + cp_img = cv2.resize( + cp_img, + (int(cp_img.shape[1] * jit_factor), + int(cp_img.shape[0] * jit_factor)), + ) + cp_scale_ratio *= jit_factor + + if FLIP: + cp_img = cp_img[:, ::-1, :] + + origin_h, origin_w = cp_img.shape[:2] + target_h, target_w = origin_img.shape[:2] + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, target_w), 3), + dtype=np.uint8) + padded_img[:origin_h, :origin_w] = cp_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + cp_bboxes_origin_np = adjust_box_anns(cp_labels[:, :4].copy(), + cp_scale_ratio, 0, 0, origin_w, + origin_h) + if FLIP: + cp_bboxes_origin_np[:, 0::2] = ( + origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) + cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() + cp_bboxes_transformed_np[:, 0::2] = np.clip( + cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w) + cp_bboxes_transformed_np[:, 1::2] = np.clip( + cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h) + + cls_labels = cp_labels[:, 4:5].copy() + box_labels = cp_bboxes_transformed_np + labels = np.hstack((box_labels, cls_labels)) + origin_labels = np.vstack((origin_labels, labels)) + origin_img = origin_img.astype(np.float32) + origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype( + np.float32) + + return origin_img.astype(np.uint8), origin_labels + + def get_img_info(self, index): + return self._dataset.get_img_info(index) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/evaluation/__init__.py b/modelscope/msdatasets/task_datasets/damoyolo/evaluation/__init__.py new file mode 100644 index 00000000..b121b80b --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/evaluation/__init__.py @@ -0,0 +1,28 @@ +# Copyright © Alibaba, Inc. and its affiliates. + +from modelscope.msdatasets.task_datasets.damoyolo import datasets +from .coco import coco_evaluation + + +def evaluate(dataset, predictions, output_folder, **kwargs): + """evaluate dataset using different methods based on dataset type. + Args: + dataset: Dataset object + predictions(list[BoxList]): each item in the list represents the + prediction results for one image. + output_folder: output folder, to save evaluation files or results. + **kwargs: other args. 
+ Returns: + evaluation result + """ + args = dict( + dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **kwargs) + if isinstance(dataset, datasets.COCODataset): + return coco_evaluation(**args) + else: + dataset_name = dataset.__class__.__name__ + raise NotImplementedError( + 'Unsupported dataset type {}.'.format(dataset_name)) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/__init__.py b/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/__init__.py new file mode 100644 index 00000000..e9f4030f --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/__init__.py @@ -0,0 +1,23 @@ +# Copyright © Alibaba, Inc. and its affiliates. + +from .coco_eval import do_coco_evaluation + + +def coco_evaluation( + dataset, + predictions, + output_folder, + box_only, + iou_types, + expected_results, + expected_results_sigma_tol, +): + return do_coco_evaluation( + dataset=dataset, + predictions=predictions, + box_only=box_only, + output_folder=output_folder, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + ) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/coco_eval.py b/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/coco_eval.py new file mode 100644 index 00000000..b42648fc --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/evaluation/coco/coco_eval.py @@ -0,0 +1,338 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright © Alibaba, Inc. and its affiliates. + +import os +import tempfile +from collections import OrderedDict + +import torch + +from modelscope.models.cv.tinynas_detection.damo.structures.bounding_box import \ + BoxList +from modelscope.models.cv.tinynas_detection.damo.structures.boxlist_ops import \ + boxlist_iou +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def do_coco_evaluation( + dataset, + predictions, + box_only, + output_folder, + iou_types, + expected_results, + expected_results_sigma_tol, +): + + if box_only: + logger.info('Evaluating bbox proposals') + areas = {'all': '', 'small': 's', 'medium': 'm', 'large': 'l'} + res = COCOResults('box_proposal') + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = evaluate_box_proposals( + predictions, dataset, area=area, limit=limit) + key = 'AR{}@{:d}'.format(suffix, limit) + res.results['box_proposal'][key] = stats['ar'].item() + logger.info(res) + check_expected_results(res, expected_results, + expected_results_sigma_tol) + if output_folder: + torch.save(res, os.path.join(output_folder, 'box_proposals.pth')) + return + logger.info('Preparing results for COCO format') + coco_results = {} + if 'bbox' in iou_types: + logger.info('Preparing bbox results') + coco_results['bbox'] = prepare_for_coco_detection(predictions, dataset) + + results = COCOResults(*iou_types) + logger.info('Evaluating predictions') + for iou_type in iou_types: + with tempfile.NamedTemporaryFile() as f: + file_path = f.name + if output_folder: + file_path = os.path.join(output_folder, iou_type + '.json') + res = evaluate_predictions_on_coco(dataset.coco, + coco_results[iou_type], + file_path, iou_type) + results.update(res) + logger.info(results) + check_expected_results(results, expected_results, + expected_results_sigma_tol) + if output_folder: + torch.save(results, os.path.join(output_folder, 'coco_results.pth')) + return results, coco_results + + +def prepare_for_coco_detection(predictions, 
dataset): + # assert isinstance(dataset, COCODataset) + coco_results = [] + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + if len(prediction) == 0: + continue + + img_info = dataset.get_img_info(image_id) + image_width = img_info['width'] + image_height = img_info['height'] + prediction = prediction.resize((image_width, image_height)) + prediction = prediction.convert('xywh') + + boxes = prediction.bbox.tolist() + scores = prediction.get_field('scores').tolist() + labels = prediction.get_field('labels').tolist() + + mapped_labels = [ + dataset.contiguous_category_id_to_json_id[i] for i in labels + ] + coco_results.extend([{ + 'image_id': original_id, + 'category_id': mapped_labels[k], + 'bbox': box, + 'score': scores[k], + } for k, box in enumerate(boxes)]) + return coco_results + + +# inspired from Detectron +def evaluate_box_proposals(predictions, + dataset, + thresholds=None, + area='all', + limit=None): + """Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official COCO API recall evaluation code. + However, it produces slightly different results. + """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + 'all': 0, + 'small': 1, + 'medium': 2, + 'large': 3, + '96-128': 4, + '128-256': 5, + '256-512': 6, + '512-inf': 7, + } + area_ranges = [ + [0**2, 1e5**2], # all + [0**2, 32**2], # small + [32**2, 96**2], # medium + [96**2, 1e5**2], # large + [96**2, 128**2], # 96-128 + [128**2, 256**2], # 128-256 + [256**2, 512**2], # 256-512 + [512**2, 1e5**2], + ] # 512-inf + assert area in areas, 'Unknown area range: {}'.format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = [] + num_pos = 0 + + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + + img_info = dataset.get_img_info(image_id) + image_width = img_info['width'] + image_height = img_info['height'] + prediction = prediction.resize((image_width, image_height)) + # prediction = prediction.resize((image_height, image_width)) + + # sort predictions in descending order + # TODO maybe remove this and make it explicit in the documentation + inds = prediction.get_field('objectness').sort(descending=True)[1] + prediction = prediction[inds] + + ann_ids = dataset.coco.getAnnIds(imgIds=original_id) + anno = dataset.coco.loadAnns(ann_ids) + gt_boxes = [obj['bbox'] for obj in anno if obj['iscrowd'] == 0] + gt_boxes = torch.as_tensor(gt_boxes).reshape( + -1, 4) # guard against no boxes + gt_boxes = BoxList( + gt_boxes, (image_width, image_height), mode='xywh').convert('xyxy') + gt_areas = torch.as_tensor( + [obj['area'] for obj in anno if obj['iscrowd'] == 0]) + + if len(gt_boxes) == 0: + continue + + valid_gt_inds = (gt_areas >= area_range[0]) & ( + gt_areas <= area_range[1]) + gt_boxes = gt_boxes[valid_gt_inds] + + num_pos += len(gt_boxes) + + if len(gt_boxes) == 0: + continue + + if len(prediction) == 0: + continue + + if limit is not None and len(prediction) > limit: + prediction = prediction[:limit] + + overlaps = boxlist_iou(prediction, gt_boxes) + + _gt_overlaps = torch.zeros(len(gt_boxes)) + for j in range(min(len(prediction), len(gt_boxes))): + # find which proposal box maximally covers each gt box + # and get the iou amount of coverage for each gt box + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # find which gt box is 'best' covered (i.e. 
'best' = most iou) + gt_ovr, gt_ind = max_overlaps.max(dim=0) + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + + # append recorded iou coverage level + gt_overlaps.append(_gt_overlaps) + gt_overlaps = torch.cat(gt_overlaps, dim=0) + gt_overlaps, _ = torch.sort(gt_overlaps) + + if thresholds is None: + step = 0.05 + thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) + recalls = torch.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return { + 'ar': ar, + 'recalls': recalls, + 'thresholds': thresholds, + 'gt_overlaps': gt_overlaps, + 'num_pos': num_pos, + } + + +def evaluate_predictions_on_coco(coco_gt, + coco_results, + json_result_file, + iou_type='bbox'): + import json + + with open(json_result_file, 'w') as f: + json.dump(coco_results, f) + + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + coco_dt = coco_gt.loadRes( + str(json_result_file)) if coco_results else COCO() + + # coco_dt = coco_gt.loadRes(coco_results) + coco_eval = COCOeval(coco_gt, coco_dt, iou_type) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + # compute_thresholds_for_classes(coco_eval) + + return coco_eval + + +def compute_thresholds_for_classes(coco_eval): + ''' + The function is used to compute the thresholds corresponding to best + f-measure. The resulting thresholds are used in fcos_demo.py. 
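evaluate_box_proposals above finishes by turning the best IoU recorded for each ground-truth box into a recall curve over IoU thresholds and averaging it into AR. A standalone sketch of that final reduction; the overlap values are made up.

```python
import torch

# Sketch of the tail of evaluate_box_proposals: recall at threshold t is
# the fraction of gt boxes whose best proposal IoU is >= t, and AR is the
# mean recall over thresholds 0.5:0.05:0.95.
gt_overlaps = torch.tensor([0.92, 0.55, 0.40, 0.81])   # best IoU per gt box
num_pos = gt_overlaps.numel()
thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05, dtype=torch.float32)
recalls = torch.zeros_like(thresholds)
for i, t in enumerate(thresholds):
    recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
ar = recalls.mean()
print(recalls)   # starts at 0.7500 and decays as the threshold rises
print(ar)        # the value reported as stats['ar']
```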
+ ''' + import numpy as np + # dimension of precision: [TxRxKxAxM] + precision = coco_eval.eval['precision'] + # we compute thresholds with IOU being 0.5 + precision = precision[0, :, :, 0, -1] + scores = coco_eval.eval['scores'] + scores = scores[0, :, :, 0, -1] + + recall = np.linspace(0, 1, num=precision.shape[0]) + recall = recall[:, None] + + f_measure = (2 * precision * recall) / ( + np.maximum(precision + recall, 1e-6)) + max_f_measure = f_measure.max(axis=0) + max_f_measure_inds = f_measure.argmax(axis=0) + scores = scores[max_f_measure_inds, range(len(max_f_measure_inds))] + + print('Maximum f-measures for classes:') + print(list(max_f_measure)) + print('Score thresholds for classes (used in demos for visualization):') + print(list(scores)) + + +class COCOResults(object): + METRICS = { + 'bbox': ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl'], + 'segm': ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl'], + 'box_proposal': [ + 'AR@100', + 'ARs@100', + 'ARm@100', + 'ARl@100', + 'AR@1000', + 'ARs@1000', + 'ARm@1000', + 'ARl@1000', + ], + 'keypoints': ['AP', 'AP50', 'AP75', 'APm', 'APl'], + } + + def __init__(self, *iou_types): + allowed_types = ('box_proposal', 'bbox', 'segm', 'keypoints') + assert all(iou_type in allowed_types for iou_type in iou_types) + results = OrderedDict() + for iou_type in iou_types: + results[iou_type] = OrderedDict([ + (metric, -1) for metric in COCOResults.METRICS[iou_type] + ]) + self.results = results + + def update(self, coco_eval): + if coco_eval is None: + return + from pycocotools.cocoeval import COCOeval + + assert isinstance(coco_eval, COCOeval) + s = coco_eval.stats + iou_type = coco_eval.params.iouType + res = self.results[iou_type] + metrics = COCOResults.METRICS[iou_type] + for idx, metric in enumerate(metrics): + res[metric] = s[idx] + + def __repr__(self): + # TODO make it pretty + return repr(self.results) + + +def check_expected_results(results, expected_results, sigma_tol): + if not expected_results: + return + + for task, metric, (mean, std) in expected_results: + actual_val = results.results[task][metric] + lo = mean - sigma_tol * std + hi = mean + sigma_tol * std + ok = (lo < actual_val) and (actual_val < hi) + msg = ('{} > {} sanity check (actual vs. expected): ' + '{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})' + ).format(task, metric, actual_val, mean, std, lo, hi) + if not ok: + msg = 'FAIL: ' + msg + logger.error(msg) + else: + msg = 'PASS: ' + msg + logger.info(msg) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/__init__.py b/modelscope/msdatasets/task_datasets/damoyolo/samplers/__init__.py new file mode 100644 index 00000000..3d824800 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/samplers/__init__.py @@ -0,0 +1,9 @@ +# Copyright © Alibaba, Inc. and its affiliates. + +from .distributed import DistributedSampler +from .grouped_batch_sampler import GroupedBatchSampler +from .iteration_based_batch_sampler import IterationBasedBatchSampler + +__all__ = [ + 'DistributedSampler', 'GroupedBatchSampler', 'IterationBasedBatchSampler' +] diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/distributed.py b/modelscope/msdatasets/task_datasets/damoyolo/samplers/distributed.py new file mode 100644 index 00000000..edc58c4e --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/samplers/distributed.py @@ -0,0 +1,70 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Code is copy-pasted exactly as in torch.utils.data.distributed. 
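check_expected_results above acts as a regression guard: each expected (task, metric, (mean, std)) entry passes only when the measured value sits inside mean plus or minus sigma_tol times std. A tiny numeric sketch of that check; all values are made up.

```python
# Made-up numbers; in practice the value comes from results.results[task][metric]
# and the (mean, std) pair from the expected_results configuration.
mean, std, sigma_tol = 0.380, 0.005, 4
actual_ap = 0.3875
lo, hi = mean - sigma_tol * std, mean + sigma_tol * std
print('PASS' if lo < actual_ap < hi else 'FAIL', round(lo, 3), round(hi, 3))
# -> PASS 0.36 0.4
```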
+# FIXME remove this once c10d fixes the bug it has +import math + +import torch +import torch.distributed as dist +from torch.utils.data.sampler import Sampler + + +class DistributedSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError( + 'Requires distributed package to be available') + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError( + 'Requires distributed package to be available') + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int( + math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + + def __iter__(self): + if self.shuffle: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/grouped_batch_sampler.py b/modelscope/msdatasets/task_datasets/damoyolo/samplers/grouped_batch_sampler.py new file mode 100644 index 00000000..27e45ad9 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/samplers/grouped_batch_sampler.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright © Alibaba, Inc. and its affiliates. + +import itertools + +import torch +from torch.utils.data.sampler import BatchSampler, Sampler + + +class GroupedBatchSampler(BatchSampler): + """ + Wraps another sampler to yield a mini-batch of indices. + It enforces elements from the same group appear in groups of batch_size. + It also tries to provide mini-batches which follows an ordering which is + as close as possible to the ordering from the original sampler. + Arguments: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. 
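The DistributedSampler just added pads the index list so every rank draws the same number of samples, then hands each rank a contiguous slice. A sketch with num_replicas and rank passed explicitly so no process group is needed; the import path assumes the package layout added in this diff is installed.

```python
from modelscope.msdatasets.task_datasets.damoyolo.samplers import \
    DistributedSampler

# 10 samples across 3 ranks: indices are padded to 12 (ceil(10/3) * 3),
# so the last rank re-sees indices 0 and 1.
dataset = list(range(10))            # stands in for a real map-style Dataset
for rank in range(3):
    sampler = DistributedSampler(
        dataset, num_replicas=3, rank=rank, shuffle=False)
    print(rank, list(iter(sampler)))
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
# 2 [8, 9, 0, 1]
```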
+ drop_uneven (bool): If ``True``, the sampler will drop the batches + whose size is less than ``batch_size`` + """ + + def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): + if not isinstance(sampler, Sampler): + raise ValueError( + 'sampler should be an instance of ' + 'torch.utils.data.Sampler, but got sampler={}'.format(sampler)) + self.sampler = sampler + self.group_ids = torch.as_tensor(group_ids) + assert self.group_ids.dim() == 1 + self.batch_size = batch_size + self.drop_uneven = drop_uneven + + self.groups = torch.unique(self.group_ids).sort(0)[0] + + self._can_reuse_batches = False + + def _prepare_batches(self): + dataset_size = len(self.group_ids) + # get the sampled indices from the sampler + sampled_ids = torch.as_tensor(list(self.sampler)) + # potentially not all elements of the dataset were sampled + # by the sampler (e.g., DistributedSampler). + # construct a tensor which contains -1 if the element was + # not sampled, and a non-negative number indicating the + # order where the element was sampled. + # for example. if sampled_ids = [3, 1] and dataset_size = 5, + # the order is [-1, 1, -1, 0, -1] + order = torch.full((dataset_size, ), -1, dtype=torch.int64) + order[sampled_ids] = torch.arange(len(sampled_ids)) + + # get a mask with the elements that were sampled + mask = order >= 0 + + # find the elements that belong to each individual cluster + clusters = [(self.group_ids == i) & mask for i in self.groups] + # get relative order of the elements inside each cluster + # that follows the order from the sampler + relative_order = [order[cluster] for cluster in clusters] + # with the relative order, find the absolute order in the + # sampled space + permutation_ids = [s[s.sort()[1]] for s in relative_order] + # permute each cluster so that they follow the order from + # the sampler + permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] + + # splits each cluster in batch_size, and merge as a list of tensors + splits = [c.split(self.batch_size) for c in permuted_clusters] + merged = tuple(itertools.chain.from_iterable(splits)) + + # now each batch internally has the right order, but + # they are grouped by clusters. Find the permutation between + # different batches that brings them as close as possible to + # the order that we have in the sampler. 
For that, we will consider the + # ordering as coming from the first element of each batch, and sort + # correspondingly + first_element_of_batch = [t[0].item() for t in merged] + # get and inverse mapping from sampled indices and the position where + # they occur (as returned by the sampler) + inv_sampled_ids_map = { + v: k + for k, v in enumerate(sampled_ids.tolist()) + } + # from the first element in each batch, get a relative ordering + first_index_of_batch = torch.as_tensor( + [inv_sampled_ids_map[s] for s in first_element_of_batch]) + + # permute the batches so that they approximately follow the order + # from the sampler + permutation_order = first_index_of_batch.sort(0)[1].tolist() + # finally, permute the batches + batches = [merged[i].tolist() for i in permutation_order] + + if self.drop_uneven: + kept = [] + for batch in batches: + if len(batch) == self.batch_size: + kept.append(batch) + batches = kept + return batches + + def __iter__(self): + if self._can_reuse_batches: + batches = self._batches + self._can_reuse_batches = False + else: + batches = self._prepare_batches() + self._batches = batches + return iter(batches) + + def __len__(self): + if not hasattr(self, '_batches'): + self._batches = self._prepare_batches() + self._can_reuse_batches = True + return len(self._batches) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/samplers/iteration_based_batch_sampler.py b/modelscope/msdatasets/task_datasets/damoyolo/samplers/iteration_based_batch_sampler.py new file mode 100644 index 00000000..0d8eeb65 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/samplers/iteration_based_batch_sampler.py @@ -0,0 +1,40 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright © Alibaba, Inc. and its affiliates. +from torch.utils.data.sampler import BatchSampler + + +class IterationBasedBatchSampler(BatchSampler): + """ + Wraps a BatchSampler, resampling from it until + a specified number of iterations have been sampled + """ + + def __init__(self, + batch_sampler, + num_iterations, + start_iter=0, + enable_mosaic=False): + self.batch_sampler = batch_sampler + self.num_iterations = num_iterations + self.start_iter = start_iter + self.enable_mosaic = enable_mosaic + + def __iter__(self): + iteration = self.start_iter + while iteration <= self.num_iterations: + # if the underlying sampler has a set_epoch method, like + # DistributedSampler, used for making each process see + # a different split of the dataset, then set it + if hasattr(self.batch_sampler.sampler, 'set_epoch'): + self.batch_sampler.sampler.set_epoch(iteration) + for batch in self.batch_sampler: + iteration += 1 + if iteration > self.num_iterations: + break + yield [(self.enable_mosaic, idx) for idx in batch] + + def __len__(self): + return self.num_iterations + + def set_mosaic(self, enable_mosaic): + self.enable_mosaic = enable_mosaic diff --git a/modelscope/msdatasets/task_datasets/damoyolo/transforms/__init__.py b/modelscope/msdatasets/task_datasets/damoyolo/transforms/__init__.py new file mode 100644 index 00000000..e4c1954d --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/transforms/__init__.py @@ -0,0 +1,5 @@ +# Copyright © Alibaba, Inc. and its affiliates. 
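IterationBasedBatchSampler above is the piece that turns an epoch-based BatchSampler into an iteration-budgeted one, yielding (enable_mosaic, index) pairs that COCODataset.__getitem__ and MosaicWrapper.__getitem__ unpack. A sketch wrapping a vanilla BatchSampler; the import path assumes the package layout added in this diff is installed.

```python
from torch.utils.data.sampler import BatchSampler, SequentialSampler

from modelscope.msdatasets.task_datasets.damoyolo.samplers import \
    IterationBasedBatchSampler

# Six samples, batches of two, a budget of five iterations: the wrapper
# keeps cycling the underlying BatchSampler until the budget is spent.
base = BatchSampler(
    SequentialSampler(range(6)), batch_size=2, drop_last=False)
sampler = IterationBasedBatchSampler(
    base, num_iterations=5, start_iter=0, enable_mosaic=True)
for step, batch in enumerate(sampler):
    print(step, batch)   # e.g. 0 [(True, 0), (True, 1)] ... wraps around
```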
+ +from .build import build_transforms +from .transforms import (Compose, Normalize, RandomHorizontalFlip, Resize, + ToTensor) diff --git a/modelscope/msdatasets/task_datasets/damoyolo/transforms/build.py b/modelscope/msdatasets/task_datasets/damoyolo/transforms/build.py new file mode 100644 index 00000000..29911078 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/transforms/build.py @@ -0,0 +1,35 @@ +# Copyright © Alibaba, Inc. and its affiliates. +from modelscope.models.cv.tinynas_detection.damo.augmentations.scale_aware_aug import \ + SA_Aug +from . import transforms as T + + +def build_transforms(start_epoch, + total_epochs, + no_aug_epochs, + iters_per_epoch, + num_workers, + batch_size, + num_gpus, + image_max_range=(640, 640), + flip_prob=0.5, + image_mean=[0, 0, 0], + image_std=[1., 1., 1.], + autoaug_dict=None): + + transform = [ + T.Resize(image_max_range), + T.RandomHorizontalFlip(flip_prob), + T.ToTensor(), + T.Normalize(mean=image_mean, std=image_std), + ] + + if autoaug_dict is not None: + transform += [ + SA_Aug(iters_per_epoch, start_epoch, total_epochs, no_aug_epochs, + batch_size, num_gpus, num_workers, autoaug_dict) + ] + + transform = T.Compose(transform) + + return transform diff --git a/modelscope/msdatasets/task_datasets/damoyolo/transforms/transforms.py b/modelscope/msdatasets/task_datasets/damoyolo/transforms/transforms.py new file mode 100644 index 00000000..58c5db54 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/damoyolo/transforms/transforms.py @@ -0,0 +1,90 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright © Alibaba, Inc. and its affiliates. +import random + +import cv2 +import numpy as np +import torch +from torchvision.transforms import functional as F + + +class Compose(object): + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target=None): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += ' {0}'.format(t) + format_string += '\n)' + return format_string + + +class Resize(object): + + def __init__(self, max_range): + if not isinstance(max_range, (list, tuple)): + max_range = (max_range, ) + self.max_range = max_range + + def get_size_ratio(self, image_size): + target_size = random.choice(self.max_range) + w, h = image_size + t_w, t_h = target_size, target_size + r = min(t_w / w, t_h / h) + o_w, o_h = int(w * r), int(h * r) + return (o_w, o_h) + + def __call__(self, image, target=None): + h, w = image.shape[:2] + size = self.get_size_ratio((w, h)) + + image = cv2.resize( + image, size, interpolation=cv2.INTER_LINEAR).astype(np.uint8) + image = image.transpose((2, 0, 1)) + image = np.ascontiguousarray(image, dtype=np.float32) + if isinstance(target, list): + target = [t.resize(size) for t in target] + elif target is None: + return image, target + else: + target = target.resize(size) + return image, target + + +class RandomHorizontalFlip(object): + + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + image = image[:, :, ::-1] + image = np.ascontiguousarray(image, dtype=np.float32) + if target is not None: + target = target.transpose(0) + return image, target + + +class ToTensor(object): + + def __call__(self, image, target): + return torch.from_numpy(image), target + + +class Normalize(object): + + def 
__init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + return image, target diff --git a/modelscope/msdatasets/task_datasets/gopro_image_deblurring_dataset.py b/modelscope/msdatasets/task_datasets/gopro_image_deblurring_dataset.py new file mode 100644 index 00000000..fb621551 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/gopro_image_deblurring_dataset.py @@ -0,0 +1,64 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import cv2 +import numpy as np + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.sidd_image_denoising.data_utils import ( + img2tensor, padding) +from modelscope.msdatasets.task_datasets.sidd_image_denoising.transforms import ( + augment, paired_random_crop) +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks + + +def default_loader(path): + return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 + + +@TASK_DATASETS.register_module( + Tasks.image_deblurring, module_name=Datasets.PairedDataset) +class GoproImageDeblurringDataset(TorchTaskDataset): + """Paired image dataset for image restoration. + """ + + def __init__(self, dataset, opt, is_train): + self.dataset = dataset + self.opt = opt + self.is_train = is_train + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load gt and lq images. Dimension order: HWC; channel order: BGR; + # image range: [0, 1], float32. + item_dict = self.dataset[index] + gt_path = item_dict['Sharp Image:FILE'] + img_gt = default_loader(gt_path) + lq_path = item_dict['Blur Image:FILE'] + img_lq = default_loader(lq_path) + + # augmentation for training + if self.is_train: + gt_size = self.opt.gt_size + # padding + img_gt, img_lq = padding(img_gt, img_lq, gt_size) + + # random crop + img_gt, img_lq = paired_random_crop( + img_gt, img_lq, gt_size, scale=1) + + # flip, rotation + img_gt, img_lq = augment([img_gt, img_lq], self.opt.use_flip, + self.opt.use_rot) + + # BGR to RGB, HWC to CHW, numpy to tensor + img_gt, img_lq = img2tensor([img_gt, img_lq], + bgr2rgb=True, + float32=True) + + return {'input': img_lq, 'target': img_gt} diff --git a/modelscope/msdatasets/task_datasets/mgeo_ranking_dataset.py b/modelscope/msdatasets/task_datasets/mgeo_ranking_dataset.py new file mode 100644 index 00000000..9adccd7c --- /dev/null +++ b/modelscope/msdatasets/task_datasets/mgeo_ranking_dataset.py @@ -0,0 +1,176 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
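build_transforms above assembles Resize, RandomHorizontalFlip, ToTensor and Normalize (plus an optional SA_Aug stage) into a Compose pipeline. A sketch applying that same chain, without SA_Aug, to a dummy HWC uint8 image with no box target; it assumes the modelscope package containing the files added in this diff is importable.

```python
import numpy as np
from modelscope.msdatasets.task_datasets.damoyolo.transforms import (
    Compose, Normalize, RandomHorizontalFlip, Resize, ToTensor)

# The chain build_transforms assembles (SA_Aug omitted), applied to a dummy
# 480x640 uint8 image; Resize accepts target=None and returns early for it.
pipeline = Compose([
    Resize((640, 640)),
    RandomHorizontalFlip(prob=0.5),
    ToTensor(),
    Normalize(mean=[0, 0, 0], std=[1., 1., 1.]),
])
image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
image, target = pipeline(image, None)
print(image.shape, image.dtype)   # torch.Size([3, 480, 640]) torch.float32
```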
+import random +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple, Union + +import json +import torch +from datasets import Dataset, IterableDataset, concatenate_datasets +from torch.utils.data import ConcatDataset +from transformers import DataCollatorWithPadding + +from modelscope.metainfo import Models +from modelscope.utils.constant import ModeKeys, Tasks +from .base import TaskDataset +from .builder import TASK_DATASETS +from .torch_base_dataset import TorchTaskDataset + + +@TASK_DATASETS.register_module( + group_key=Tasks.text_ranking, module_name=Models.mgeo) +class MGeoRankingDataset(TorchTaskDataset): + + def __init__(self, + datasets: Union[Any, List[Any]], + mode, + preprocessor=None, + *args, + **kwargs): + self.seed = kwargs.get('seed', 42) + self.permutation = None + self.datasets = None + self.dataset_config = kwargs + self.query_sequence = self.dataset_config.get('query_sequence', + 'query') + self.query_gis_sequence = self.dataset_config.get( + 'query_gis_sequence', 'query_gis') + self.pos_sequence = self.dataset_config.get('pos_sequence', + 'positive_passages') + self.neg_sequence = self.dataset_config.get('neg_sequence', + 'negative_passages') + self.text_fileds = self.dataset_config.get('text_fileds', + ['text', 'gis']) + self.qid_field = self.dataset_config.get('qid_field', 'query_id') + if mode == ModeKeys.TRAIN: + self.neg_samples = self.dataset_config.get('neg_sample', 4) + + super().__init__(datasets, mode, preprocessor, **kwargs) + + def __getitem__(self, index) -> Any: + if self.mode == ModeKeys.TRAIN: + return self.__get_train_item__(index) + else: + return self.__get_test_item__(index) + + def __get_test_item__(self, index): + group = self._inner_dataset[index] + labels = [] + + qry = group[self.query_sequence] + + pos_sequences = group[self.pos_sequence] + pos_sequences = [ele['text'] for ele in pos_sequences] + + labels.extend([1] * len(pos_sequences)) + + neg_sequences = group[self.neg_sequence] + neg_sequences = [ele['text'] for ele in neg_sequences] + labels.extend([0] * len(neg_sequences)) + qid = group[self.qid_field] + examples = pos_sequences + neg_sequences + + if 'gis' in self.text_fileds: + qry_gis = [json.loads(group[self.query_gis_sequence])] + pos_sequences_gis = [ + json.loads(ele['gis']) for ele in group[self.pos_sequence] + ] + neg_sequences_gis = [ + json.loads(ele['gis']) for ele in group[self.neg_sequence] + ] + examples_gis = pos_sequences_gis + neg_sequences_gis + else: + qry_gis = None + pos_sequences_gis = None + neg_sequences_gis = None + examples_gis = None + + sample = { + 'qid': torch.LongTensor([int(qid)] * len(labels)), + self.preprocessor.first_sequence: qry, + self.preprocessor.second_sequence: examples, + self.preprocessor.first_sequence_gis: qry_gis, + self.preprocessor.second_sequence_gis: examples_gis, + 'labels': torch.LongTensor(labels), + } + return self.prepare_sample(sample) + + def __get_train_item__(self, index): + group = self._inner_dataset[index] + + qry = group[self.query_sequence] + + pos_sequences = group[self.pos_sequence] + pos_sequences = [ele['text'] for ele in pos_sequences] + + neg_sequences = group[self.neg_sequence] + neg_sequences = [ele['text'] for ele in neg_sequences] + + pos_psg = random.choice(pos_sequences) + + if len(neg_sequences) < self.neg_samples: + negs = random.choices(neg_sequences, k=self.neg_samples) + else: + negs = random.sample(neg_sequences, k=self.neg_samples) + examples = [pos_psg] + negs + + if 'gis' in self.text_fileds: + qry_gis = 
[json.loads(group[self.query_gis_sequence])] + pos_sequences_gis = [ + json.loads(ele['gis']) for ele in group[self.pos_sequence] + ] + neg_sequences_gis = [ + json.loads(ele['gis']) for ele in group[self.neg_sequence] + ] + examples_gis = pos_sequences_gis + neg_sequences_gis + else: + qry_gis = None + pos_sequences_gis = None + neg_sequences_gis = None + examples_gis = None + + sample = { + self.preprocessor.first_sequence: qry, + self.preprocessor.second_sequence: examples, + self.preprocessor.first_sequence_gis: qry_gis, + self.preprocessor.second_sequence_gis: examples_gis, + } + return self.prepare_sample(sample) + + def __len__(self): + return len(self._inner_dataset) + + def prepare_dataset(self, datasets: Union[Any, List[Any]]) -> Any: + """Prepare a dataset. + + User can process the input datasets in a whole dataset perspective. + This method gives a default implementation of datasets merging, user can override this + method to write custom logics. + + Args: + datasets: The original dataset(s) + + Returns: A single dataset, which may be created after merging. + + """ + if isinstance(datasets, List): + if len(datasets) == 1: + return datasets[0] + elif len(datasets) > 1: + return ConcatDataset(datasets) + else: + return datasets + + def prepare_sample(self, data): + """Preprocess the data fetched from the inner_dataset. + + If the preprocessor is None, the original data will be returned, else the preprocessor will be called. + User can override this method to implement custom logics. + + Args: + data: The data fetched from the dataset. + + Returns: The processed data. + + """ + return self.preprocessor( + data) if self.preprocessor is not None else data diff --git a/modelscope/msdatasets/task_datasets/video_frame_interpolation/__init__.py b/modelscope/msdatasets/task_datasets/video_frame_interpolation/__init__.py new file mode 100644 index 00000000..b9a338c1 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/video_frame_interpolation/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .video_frame_interpolation_dataset import VideoFrameInterpolationDataset + +else: + _import_structure = { + 'video_frame_interpolation_dataset': + ['VideoFrameInterpolationDataset'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/video_frame_interpolation/data_utils.py b/modelscope/msdatasets/task_datasets/video_frame_interpolation/data_utils.py new file mode 100644 index 00000000..ae876b18 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/video_frame_interpolation/data_utils.py @@ -0,0 +1,41 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + +import cv2 +import torch +import torch.nn.functional as F + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. 
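The __get_train_item__ method above pairs one randomly chosen positive passage with neg_samples negatives per query, switching between sampling with and without replacement depending on how many negatives are available. A standalone sketch of that rule; the passages are toys and neg_samples uses the default of 4 from the dataset config.

```python
import random

neg_sequences = ['neg passage 1', 'neg passage 2']   # toy negatives
neg_samples = 4                                       # default neg_sample

# Too few negatives: sample with replacement; otherwise sample without.
if len(neg_sequences) < neg_samples:
    negs = random.choices(neg_sequences, k=neg_samples)
else:
    negs = random.sample(neg_sequences, k=neg_samples)
print(len(negs), negs)   # always 4 items, drawn from the 2 available
```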
+ """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def img_padding(img_tensor, height, width, pad_num=32): + ph = ((height - 1) // pad_num + 1) * pad_num + pw = ((width - 1) // pad_num + 1) * pad_num + padding = (0, pw - width, 0, ph - height) + img_tensor = F.pad(img_tensor, padding) + return img_tensor diff --git a/modelscope/msdatasets/task_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py b/modelscope/msdatasets/task_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py new file mode 100644 index 00000000..44b965a7 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/video_frame_interpolation/video_frame_interpolation_dataset.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from collections import defaultdict + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.msdatasets.task_datasets.video_frame_interpolation.data_utils import ( + img2tensor, img_padding) +from modelscope.utils.constant import Tasks + + +def default_loader(path): + return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) + + +@TASK_DATASETS.register_module( + Tasks.video_frame_interpolation, + module_name=Models.video_frame_interpolation) +class VideoFrameInterpolationDataset(TorchTaskDataset): + """Dataset for video frame-interpolation. + """ + + def __init__(self, dataset, opt): + self.dataset = dataset + self.opt = opt + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load frames. Dimension order: HWC; channel order: BGR; + # image range: [0, 1], float32 + item_dict = self.dataset[index] + img0 = default_loader(item_dict['Input1:FILE']) + img1 = default_loader(item_dict['Input2:FILE']) + img2 = default_loader(item_dict['Input3:FILE']) + img3 = default_loader(item_dict['Input4:FILE']) + gt = default_loader(item_dict['Output:FILE']) + + img0, img1, img2, img3, gt = img2tensor([img0, img1, img2, img3, gt], + bgr2rgb=False, + float32=True) + + imgs = torch.cat((img0, img1, img2, img3), dim=0) + height, width = imgs.size(1), imgs.size(2) + imgs = img_padding(imgs, height, width, pad_num=32) + return {'input': imgs, 'target': gt / 255.0} diff --git a/modelscope/msdatasets/task_datasets/video_stabilization/__init__.py b/modelscope/msdatasets/task_datasets/video_stabilization/__init__.py new file mode 100644 index 00000000..f1f43607 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/video_stabilization/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .video_stabilization_dataset import VideoStabilizationDataset + +else: + _import_structure = { + 'video_stabilization_dataset': ['VideoStabilizationDataset'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/video_stabilization/video_stabilization_dataset.py b/modelscope/msdatasets/task_datasets/video_stabilization/video_stabilization_dataset.py new file mode 100644 index 00000000..b0e6bdef --- /dev/null +++ b/modelscope/msdatasets/task_datasets/video_stabilization/video_stabilization_dataset.py @@ -0,0 +1,29 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + Tasks.video_stabilization, module_name=Models.video_stabilization) +class VideoStabilizationDataset(TorchTaskDataset): + """Paired video dataset for video stabilization. + """ + + def __init__(self, dataset, opt): + self.dataset = dataset + self.opt = opt + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load input video paths. + item_dict = self.dataset[index] + input_path = item_dict['input_video:FILE'] + + return {'input': input_path} diff --git a/modelscope/msdatasets/task_datasets/video_super_resolution/__init__.py b/modelscope/msdatasets/task_datasets/video_super_resolution/__init__.py new file mode 100644 index 00000000..c3283ebc --- /dev/null +++ b/modelscope/msdatasets/task_datasets/video_super_resolution/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .video_super_resolution_dataset import VideoSuperResolutionDataset + +else: + _import_structure = { + 'video_super_resolution_dataset': ['VideoSuperResolutionDataset'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/video_super_resolution/video_super_resolution_dataset.py b/modelscope/msdatasets/task_datasets/video_super_resolution/video_super_resolution_dataset.py new file mode 100644 index 00000000..69faa527 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/video_super_resolution/video_super_resolution_dataset.py @@ -0,0 +1,76 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from collections import defaultdict + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks + + +def default_loader(path): + return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. 
+ float32 (bool): Whether to change to float32. + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. + """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +@TASK_DATASETS.register_module( + Tasks.video_super_resolution, module_name=Models.real_basicvsr) +class VideoSuperResolutionDataset(TorchTaskDataset): + """single video dataset for video super-resolution. + """ + + def __init__(self, dataset): + frames_len = len(dataset) + self.dataset = defaultdict(list) + for i in range(frames_len): + item_dict = dataset[i] + frame_path = item_dict['LQ Frame:FILE'] + clip_num = item_dict['Clip Num'] + self.dataset[int(clip_num)].append(frame_path) + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load frames. Dimension order: HWC; channel order: BGR; + # image range: [0, 1], float32. + clip = self.dataset[index] + frames = [] + + for frame_path in clip: + frame = default_loader(frame_path) + # BGR to RGB, HWC to CHW, numpy to tensor + frames.append(img2tensor(frame, bgr2rgb=True, float32=True)) + + input = torch.stack(frames, dim=0) # (T, C, H, W) + return {'input': input, 'target': input} diff --git a/modelscope/msdatasets/utils/dataset_builder.py b/modelscope/msdatasets/utils/dataset_builder.py deleted file mode 100644 index e110a3e9..00000000 --- a/modelscope/msdatasets/utils/dataset_builder.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import copy -import os -from typing import Mapping, Sequence, Union - -import datasets -import pandas as pd -import pyarrow as pa -from datasets.info import DatasetInfo -from datasets.naming import camelcase_to_snakecase -from datasets.packaged_modules import csv -from datasets.utils.filelock import FileLock - -from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, - EXTENSIONS_TO_LOAD, DownloadMode) -from modelscope.utils.logger import get_logger - -logger = get_logger() - - -class MsCsvDatasetBuilder(csv.Csv): - - def __init__( - self, - dataset_name: str, - cache_dir: str, - namespace: str, - subset_name: str, - hash: str, - meta_data_files: Mapping[str, Union[str, Sequence[str]]], - zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, - **config_kwargs, - ): - super().__init__( - cache_dir=cache_dir, - name=subset_name, - hash=hash, - data_files=meta_data_files, - **config_kwargs) - - self.name = camelcase_to_snakecase(dataset_name) - self.info.builder_name = dataset_name - self._cache_dir = self._build_cache_dir(namespace=namespace) - lock_path = os.path.join( - self._cache_dir_root, - self._cache_dir.replace(os.sep, '_') + '.lock') - with FileLock(lock_path): - # check if data exist - if os.path.exists(self._cache_dir): - if len(os.listdir(self._cache_dir)) > 0: - logger.info( - f'Overwrite dataset info from restored data version, cache_dir is {self._cache_dir}' - ) - # dir exists but no data, remove the empty dir as data aren't available anymore - else: - logger.warning( - f'Old caching folder {self._cache_dir} for dataset {self.name} exists ' - f'but not data were found. Removing it. 
') - os.rmdir(self._cache_dir) - self.zip_data_files = zip_data_files - - def _relative_data_dir(self, - with_version=True, - with_hash=True, - namespace=DEFAULT_DATASET_NAMESPACE) -> str: - """Relative path of this dataset in cache_dir: - Will be: - self.name/self.config.version/self.hash/ - or if a namespace has been specified: - self.namespace___self.name/self.config.version/self.hash/ - """ - builder_data_dir = self.info.builder_name if namespace is None else f'{namespace}___{self.info.builder_name}' - builder_config = self.config - hash = self.hash - if builder_config: - builder_data_dir = os.path.join(builder_data_dir, self.config_id) - if with_version: - builder_data_dir = os.path.join(builder_data_dir, - str(self.config.version)) - if with_hash and hash and isinstance(hash, str): - builder_data_dir = os.path.join(builder_data_dir, hash) - return builder_data_dir - - def _build_cache_dir(self, namespace=DEFAULT_DATASET_NAMESPACE): - builder_data_dir = os.path.join( - self._cache_dir_root, - self._relative_data_dir( - with_version=False, with_hash=True, namespace=namespace)) - - return builder_data_dir - - def _split_generators(self, dl_manager): - if not self.config.data_files: - raise ValueError( - 'At least one data file must be specified, but got none.') - data_files = dl_manager.download_and_extract(self.config.data_files) - zip_data_files = dl_manager.download_and_extract(self.zip_data_files) - splits = [] - for split_name, files in data_files.items(): - if isinstance(files, str): - files = [files] - splits.append( - datasets.SplitGenerator( - name=split_name, - gen_kwargs={ - 'files': dl_manager.iter_files(files), - 'base_dir': zip_data_files.get(split_name) - })) - return splits - - def _generate_tables(self, files, base_dir): - schema = pa.schema(self.config.features.type - ) if self.config.features is not None else None - dtype = { - name: dtype.to_pandas_dtype() - for name, dtype in zip(schema.names, schema.types) - } if schema else None - for file_idx, file in enumerate(files): - csv_file_reader = pd.read_csv( - file, - iterator=True, - dtype=dtype, - **self.config.read_csv_kwargs) - transform_fields = [] - for field_name in csv_file_reader._engine.names: - if field_name.endswith(':FILE'): - transform_fields.append(field_name) - try: - for batch_idx, df in enumerate(csv_file_reader): - for field_name in transform_fields: - if base_dir: - df[field_name] = df[field_name].apply( - lambda x: os.path.join(base_dir, x)) - pa_table = pa.Table.from_pandas(df, schema=schema) - yield (file_idx, batch_idx), pa_table - except ValueError as e: - logger.error( - f"Failed to read file '{file}' with error {type(e)}: {e}") - raise - - -class TaskSpecificDatasetBuilder(MsCsvDatasetBuilder): - - def __init__( - self, - dataset_name: str, - cache_dir: str, - namespace: str, - subset_name: str, - hash: str, - meta_data_files: Mapping[str, Union[str, Sequence[str]]], - zip_data_files: Mapping[str, Union[str, Sequence[str]]] = None, - **config_kwargs, - ): - self.name = dataset_name - self.subset_name = subset_name - self.namespace = namespace - self.hash = hash - self.data_files = meta_data_files - self.zip_data_files = zip_data_files - self.split_path_dict = None - self.config = None - self.info = DatasetInfo.from_dict({'builder_name': dataset_name}) - self._cache_dir_root = os.path.expanduser(cache_dir) - self._cache_dir = self._build_cache_dir() - self._config_kwargs = config_kwargs - - def download_and_prepare(self, download_mode, dl_manager, - **download_kwargs): - # Prevent parallel disk 
operations - lock_path = os.path.join( - self._cache_dir_root, - self._cache_dir.replace(os.sep, '_') + '.lock') - with FileLock(lock_path): - data_exists = os.path.exists(self._cache_dir) - if data_exists and download_mode == DownloadMode.REUSE_DATASET_IF_EXISTS: - logger.warning( - f'Reusing dataset {self.name} ({self._cache_dir})') - return - logger.info(f'Generating dataset {self.name} ({self._cache_dir})') - self._download_and_prepare(dl_manager=dl_manager) - - def _download_and_prepare(self, dl_manager): - self.split_path_dict = dl_manager.download_and_extract( - self.zip_data_files) - - def as_dataset(self): - return ExternalDataset(self.split_path_dict, self._config_kwargs) - - -class ExternalDataset(object): - - def __init__(self, split_path_dict, config_kwargs): - self.split_path_dict = split_path_dict - self.config_kwargs = copy.deepcopy(config_kwargs) - self.config_kwargs.update({'split_config': split_path_dict}) - self.ext_dataset = None - self.split_data_files = {k: [] for k, _ in split_path_dict.items()} - file_ext = '' - - for split_name, split_dir in split_path_dict.items(): - if isinstance(split_dir, str) and os.path.isdir(split_dir): - split_file_names = os.listdir(split_dir) - set_files_exts = set([ - os.path.splitext(file_name)[-1].strip('.') - for file_name in split_file_names - ]) - if '' in set_files_exts: - continue - # ensure these files have same extensions - if len(set_files_exts) != 1: - supported_exts = ','.join(EXTENSIONS_TO_LOAD.keys()) - logger.error( - f'Split-{split_name} has been ignored, please flatten your folder structure, ' - f'and make sure these files have same extensions. ' - f'Supported extensions: {supported_exts} .') - continue - file_ext = list(set_files_exts)[0] - if file_ext not in EXTENSIONS_TO_LOAD: - continue - - split_file_paths = [ - os.path.join(split_dir, file_name) - for file_name in split_file_names - ] - self.split_data_files[split_name] = split_file_paths - - if file_ext: - file_ext = EXTENSIONS_TO_LOAD.get(file_ext) - self.ext_dataset = datasets.load_dataset( - file_ext, data_files=self.split_data_files, **config_kwargs) - - def __len__(self): - return len(self.split_path_dict - ) if not self.ext_dataset else self.ext_dataset.__len__() - - def __getitem__(self, item): - if not self.ext_dataset: - return self.split_path_dict.get(item) - else: - return self.ext_dataset.__getitem__(item) - - def __iter__(self): - if not self.ext_dataset: - for k, v in self.split_path_dict.items(): - yield k, v - else: - for k, v in self.ext_dataset.items(): - yield k, v diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index fc636126..785337eb 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -2,14 +2,11 @@ import os from collections import defaultdict -from typing import Any, Mapping, Optional, Sequence, Union - -from datasets.builder import DatasetBuilder +from typing import Optional, Union from modelscope.hub.api import HubApi from modelscope.utils.constant import DEFAULT_DATASET_REVISION, MetaDataFields from modelscope.utils.logger import get_logger -from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder logger = get_logger() @@ -105,6 +102,8 @@ def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool, for item in objects: object_key = item.get('Key') + if not object_key: + continue res.append(object_key) return res @@ -127,6 +126,22 @@ def contains_dir(file_map) -> bool: return res +def 
get_subdir_hash_from_split(split: Union[str, list], version: str) -> str: + if isinstance(split, str): + split = [split] + return os.path.join(version, '_'.join(split)) + + +def get_split_list(split: Union[str, list]) -> list: + """ Unify the split to list-format. """ + if isinstance(split, str): + return [split] + elif isinstance(split, list): + return split + else: + raise f'Expected format of split: str or list, but got {type(split)}.' + + def get_split_objects_map(file_map, objects): """ Get the map between dataset split and oss objects. @@ -187,7 +202,12 @@ def get_dataset_files(subset_split_into: dict, meta_csv_file_url) if not script_content: raise 'Meta-csv file cannot be empty when meta-args `big_data` is true.' - objects = [item.split(',')[0] for item in script_content] + for item in script_content: + if not item: + continue + item = item.strip().split(',')[0] + if item: + objects.append(item) file_map[split] = objects # More general but low-efficiency. if not objects: @@ -202,46 +222,3 @@ def get_dataset_files(subset_split_into: dict, file_map = get_split_objects_map(file_map, objects) return meta_map, file_map, args_map - - -def load_dataset_builder(dataset_name: str, subset_name: str, namespace: str, - meta_data_files: Mapping[str, Union[str, - Sequence[str]]], - zip_data_files: Mapping[str, Union[str, - Sequence[str]]], - args_map: Mapping[str, Any], cache_dir: str, - version: Optional[Union[str]], split: Sequence[str], - **config_kwargs) -> DatasetBuilder: - sub_dir = os.path.join(version, '_'.join(split)) - meta_data_file = next(iter(meta_data_files.values())) - args_map_value = next(iter(args_map.values())) - if args_map_value is None: - args_map_value = {} - - if not meta_data_file or args_map_value.get(MetaDataFields.ARGS_BIG_DATA): - args_map_value.update(config_kwargs) - builder_instance = TaskSpecificDatasetBuilder( - dataset_name=dataset_name, - namespace=namespace, - cache_dir=cache_dir, - subset_name=subset_name, - meta_data_files=meta_data_files, - zip_data_files=zip_data_files, - hash=sub_dir, - **args_map_value) - elif meta_data_file.endswith('.csv'): - builder_instance = MsCsvDatasetBuilder( - dataset_name=dataset_name, - namespace=namespace, - cache_dir=cache_dir, - subset_name=subset_name, - meta_data_files=meta_data_files, - zip_data_files=zip_data_files, - hash=sub_dir, - **config_kwargs) - else: - raise NotImplementedError( - f'Dataset mete file extensions "{os.path.splitext(meta_data_file)[-1]}" is not implemented yet' - ) - - return builder_instance diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py deleted file mode 100644 index ebe9b8f5..00000000 --- a/modelscope/msdatasets/utils/download_utils.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
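The dataset_utils changes above normalize the requested split to a list and, for big-data meta files, keep only the non-empty first CSV column of each line. A rough standalone sketch of that behaviour (hypothetical helper names; the real functions live in modelscope/msdatasets/utils/dataset_utils.py):

from typing import Union

def normalize_split(split: Union[str, list]) -> list:
    # Mirror get_split_list: wrap a single split name in a list, pass lists through.
    if isinstance(split, str):
        return [split]
    if isinstance(split, list):
        return split
    raise TypeError(f'Expected format of split: str or list, but got {type(split)}.')

def first_csv_column(script_content: list) -> list:
    # Mirror the big-data branch of get_dataset_files: skip empty lines and keep
    # the first comma-separated field of each remaining line.
    objects = []
    for item in script_content:
        if not item:
            continue
        item = item.strip().split(',')[0]
        if item:
            objects.append(item)
    return objects

print(normalize_split('train'))                          # ['train']
print(first_csv_column(['a.jpg,cat', '', 'b.jpg,dog']))  # ['a.jpg', 'b.jpg']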
- -from typing import Optional - -from datasets.utils.download_manager import DownloadConfig, DownloadManager -from datasets.utils.file_utils import cached_path, is_relative_path - -from .oss_utils import OssUtilities - - -class DatasetDownloadManager(DownloadManager): - - def __init__(self, - dataset_name: str, - namespace: str, - version: str, - data_dir: Optional[str] = None, - download_config: Optional[DownloadConfig] = None, - base_path: Optional[str] = None, - record_checksums=True): - super().__init__(dataset_name, data_dir, download_config, base_path, - record_checksums) - self._namespace = namespace - self._version = version - from modelscope.hub.api import HubApi - api = HubApi() - oss_config = api.get_dataset_access_config(self._dataset_name, - self._namespace, - self._version) - self.oss_utilities = OssUtilities( - oss_config=oss_config, - dataset_name=self._dataset_name, - namespace=self._namespace, - revision=self._version) - - def _download(self, url_or_filename: str, - download_config: DownloadConfig) -> str: - url_or_filename = str(url_or_filename) - if is_relative_path(url_or_filename): - # fetch oss files - return self.oss_utilities.download( - url_or_filename, download_config=download_config) - else: - return cached_path( - url_or_filename, download_config=download_config) diff --git a/modelscope/msdatasets/utils/oss_utils.py b/modelscope/msdatasets/utils/oss_utils.py index c16eab73..44ee5446 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -1,13 +1,17 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from __future__ import print_function +import multiprocessing import os import oss2 from datasets.utils.file_utils import hash_url_to_filename from modelscope.hub.api import HubApi -from modelscope.utils.constant import UploadMode +from modelscope.msdatasets.download.download_config import DataDownloadConfig +from modelscope.utils.config_ds import MS_CACHE_HOME +from modelscope.utils.constant import (DEFAULT_DATA_ACCELERATION_ENDPOINT, + MetaDataFields, UploadMode) from modelscope.utils.logger import get_logger logger = get_logger() @@ -29,24 +33,27 @@ class OssUtilities: self.namespace = namespace self.revision = revision - self.upload_resumable_store_root_path = '/tmp/modelscope/tmp_dataset/upload' - self.download_resumable_store_root_path = '/tmp/modelscope/tmp_dataset/download' - self.num_threads = 4 + self.resumable_store_root_path = os.path.join(MS_CACHE_HOME, + 'tmp/resumable_store') + self.num_threads = multiprocessing.cpu_count() self.part_size = 1 * 1024 * 1024 self.multipart_threshold = 50 * 1024 * 1024 self.max_retries = 3 - self.upload_resumable_store = oss2.ResumableStore( - root=self.upload_resumable_store_root_path) - self.download_resumable_store = oss2.ResumableDownloadStore( - root=self.download_resumable_store_root_path) + self.resumable_store_download = oss2.ResumableDownloadStore( + root=self.resumable_store_root_path) + self.resumable_store_upload = oss2.ResumableStore( + root=self.resumable_store_root_path) self.api = HubApi() def _do_init(self, oss_config): self.key = oss_config[ACCESS_ID] self.secret = oss_config[ACCESS_SECRET] self.token = oss_config[SECURITY_TOKEN] - self.endpoint = f"https://{oss_config['Region']}.aliyuncs.com" + if os.getenv('ENABLE_DATASET_ACCELERATION') == 'True': + self.endpoint = DEFAULT_DATA_ACCELERATION_ENDPOINT + else: + self.endpoint = f"https://{oss_config['Region']}.aliyuncs.com" self.bucket_name = oss_config[BUCKET] auth = oss2.StsAuth(self.key, self.secret, 
self.token) self.bucket = oss2.Bucket(auth, self.endpoint, self.bucket_name) @@ -54,11 +61,11 @@ class OssUtilities: self.oss_backup_dir = oss_config[BACK_DIR] def _reload_sts(self): - cookies = self.api.check_local_cookies(use_cookies=True) + logger.info('Reloading sts token automatically.') oss_config_refresh = self.api.get_dataset_access_config_session( - cookies=cookies, dataset_name=self.dataset_name, namespace=self.namespace, + check_cookie=True, revision=self.revision) self._do_init(oss_config_refresh) @@ -68,32 +75,46 @@ class OssUtilities: rate = int(100 * (float(consumed_bytes) / float(total_bytes))) print('\r{0}% '.format(rate), end='', flush=True) - def download(self, oss_file_name, download_config): + def download(self, oss_file_name: str, + download_config: DataDownloadConfig): cache_dir = download_config.cache_dir candidate_key = os.path.join(self.oss_dir, oss_file_name) candidate_key_backup = os.path.join(self.oss_backup_dir, oss_file_name) - file_oss_key = candidate_key if self.bucket.object_exists( - candidate_key) else candidate_key_backup - filename = hash_url_to_filename(file_oss_key, etag=None) - local_path = os.path.join(cache_dir, filename) + split = download_config.split + + big_data = False + if split: + args_dict = download_config.meta_args_map.get(split) + if args_dict: + big_data = args_dict.get(MetaDataFields.ARGS_BIG_DATA) + retry_count = 0 while True: try: retry_count += 1 + # big_data is True when the dataset contains large number of objects + if big_data: + file_oss_key = candidate_key + else: + file_oss_key = candidate_key if self.bucket.object_exists( + candidate_key) else candidate_key_backup + filename = hash_url_to_filename(file_oss_key, etag=None) + local_path = os.path.join(cache_dir, filename) + if download_config.force_download or not os.path.exists( local_path): oss2.resumable_download( self.bucket, file_oss_key, local_path, - store=self.download_resumable_store, + store=self.resumable_store_download, multiget_threshold=self.multipart_threshold, part_size=self.part_size, progress_callback=self._percentage, num_threads=self.num_threads) break except Exception as e: - if e.__getattribute__('status') == 403: + if e.__dict__.get('status') == 403: self._reload_sts() if retry_count >= self.max_retries: raise @@ -125,14 +146,14 @@ class OssUtilities: self.bucket, object_key, local_file_path, - store=self.upload_resumable_store, + store=self.resumable_store_upload, multipart_threshold=self.multipart_threshold, part_size=self.part_size, progress_callback=progress_callback, num_threads=self.num_threads) break except Exception as e: - if e.__getattribute__('status') == 403: + if e.__dict__.get('status') == 403: self._reload_sts() if retry_count >= self.max_retries: raise diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py index bbdcd9e9..b8a4cb51 100644 --- a/modelscope/msdatasets/utils/upload_utils.py +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -5,8 +5,8 @@ from multiprocessing.dummy import Pool as ThreadPool from tqdm import tqdm +from modelscope.msdatasets.utils.oss_utils import OssUtilities from modelscope.utils.constant import UploadMode -from .oss_utils import OssUtilities class DatasetUploadManager(object): @@ -14,11 +14,10 @@ class DatasetUploadManager(object): def __init__(self, dataset_name: str, namespace: str, version: str): from modelscope.hub.api import HubApi _hub_api = HubApi() - _cookies = _hub_api.check_local_cookies(use_cookies=True) _oss_config = 
_hub_api.get_dataset_access_config_session( - cookies=_cookies, dataset_name=dataset_name, namespace=namespace, + check_cookie=False, revision=version) self.oss_utilities = OssUtilities( diff --git a/modelscope/ops/__init__.py b/modelscope/ops/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/ops/quadtree_attention/__init__.py b/modelscope/ops/quadtree_attention/__init__.py new file mode 100644 index 00000000..9fe42a7e --- /dev/null +++ b/modelscope/ops/quadtree_attention/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from .modules.quadtree_attention import QTAttA, QTAttB diff --git a/modelscope/ops/quadtree_attention/functions/__init__.py b/modelscope/ops/quadtree_attention/functions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/ops/quadtree_attention/functions/quadtree_attention.py b/modelscope/ops/quadtree_attention/functions/quadtree_attention.py new file mode 100644 index 00000000..6a9010fd --- /dev/null +++ b/modelscope/ops/quadtree_attention/functions/quadtree_attention.py @@ -0,0 +1,83 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from pathlib import Path + +import torch +from einops.einops import rearrange +from torch.autograd import Function +from torch.utils.cpp_extension import load + +cur_dir = Path(__file__).parent.resolve() +score_computation_cuda = \ + load(name='score_computation_cuda', # noqa + sources=[str(cur_dir / '../src/score_computation.cpp'), # noqa + str(cur_dir / '../src/score_computation_kernal.cu')], # noqa + extra_cflags=['-g'], extra_cuda_cflags=['-O2']) # noqa + +value_aggregation_cuda = \ + load(name='value_aggregation_cuda', # noqa + sources=[str(cur_dir / '../src/value_aggregation.cpp'), # noqa + str(cur_dir / '../src/value_aggregation_kernel.cu')], # noqa + extra_cflags=['-g'], extra_cuda_cflags=['-O2']) # noqa + + +class ScoreComputation(Function): + + @staticmethod + def forward(ctx, query, key, index): + x = score_computation_cuda.score_forward(query, key, index) + ctx.save_for_backward(query, key, index) + return x[0] + + @staticmethod + def backward(ctx, grad_output): + input1, input2, index = ctx.saved_tensors + grad_output = grad_output.contiguous() + x = score_computation_cuda.score_backward(grad_output, input1, input2, + index) + return x[0], x[1], None + + +score_computation_op = ScoreComputation.apply + + +class value_aggregation(Function): + + @staticmethod + def forward(ctx, score, value, index): + ctx.save_for_backward(score, value, index) + f = score.shape[2] + score = rearrange( + score, + 'b n f K h -> b (n f) K h') # [b, N, 4, 4K, H] -> [b, 4N, 4K, H] + index = rearrange( + index, + 'b n f K h -> b (n f) K h') # [b, N, 4, 4K, H] -> [b, 4N, 4K, H] + b, N, _, H = score.shape + D = value.shape[-1] + # value [b, M, H, D] + output = score.new_zeros([b, N, H, D]).contiguous() # b, 4N, H, D + value_aggregation_cuda.value_aggregation_forward( + score, value, index, output) + output = rearrange(output, 'b (n f) h d -> b n f h d', f=f) + return output + + @staticmethod + def backward(ctx, grad_output): + score, value, index = ctx.saved_tensors + f = score.shape[2] + score = rearrange(score, 'b n f K h -> b (n f) K h') + index = rearrange(index, 'b n f K h -> b (n f) K h') + + grad_output = grad_output.contiguous() + + grad_score = score.new_zeros(score.shape).contiguous() + grad_value = value.new_zeros(value.shape).contiguous() + + value_aggregation_cuda.value_aggregation_backward( + grad_output, score, value, index, grad_score, 
grad_value) + grad_score = rearrange(grad_score, 'b (n f) K h -> b n f K h', f=f) + return grad_score, grad_value, None + + +value_aggregation_op = value_aggregation.apply diff --git a/modelscope/ops/quadtree_attention/modules/__init__.py b/modelscope/ops/quadtree_attention/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/ops/quadtree_attention/modules/quadtree_attention.py b/modelscope/ops/quadtree_attention/modules/quadtree_attention.py new file mode 100644 index 00000000..5d4288c1 --- /dev/null +++ b/modelscope/ops/quadtree_attention/modules/quadtree_attention.py @@ -0,0 +1,370 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.einops import rearrange + +from modelscope.ops.quadtree_attention.functions.quadtree_attention import ( + score_computation_op, value_aggregation_op) + + +class QTAttA(nn.Module): + + def __init__( + self, + nhead, + dim, + topks=[32, 32, 32, 32], + scale=None, + use_dropout=False, + attention_dropout=0.1, + ): + super().__init__() + self.use_dropout = use_dropout + self.topks = topks + self.nhead = nhead + self.dim = dim + + def process_coarse_level(self, query, key, value, topk): + bs, c, h, w = key.shape + cur_dim = key.shape[1] // self.nhead + + key = rearrange(key, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + value = rearrange(value, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + query = rearrange(query, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) + + QK = torch.einsum('nlhd,nshd->nlsh', query, key) + softmax_temp = 1.0 / cur_dim**0.5 # sqrt(D) + A = torch.softmax(softmax_temp * QK, dim=-2) + + # mask out top K tokens + topk_score, topk_idx = torch.topk(A, dim=-2, k=topk, largest=True) + mask = torch.ones_like(A) + mask = mask.scatter( + dim=-2, index=topk_idx, src=torch.zeros_like(topk_idx).float()) + + # message is only computed within the unmasked + message = torch.einsum( + 'nlsh,nshd->nlhd', A * mask, + value) # .reshape(bs, h, w, self.nhead, cur_dim) + + return A, message, topk_score, topk_idx + + def process_fine_level(self, + query, + key, + value, + topk_score, + topk_pos, + topk_prev, + topk, + final=False): + bs, c, h, w = key.shape + + cur_dim = key.shape[1] // self.nhead + key = rearrange(key, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + value = rearrange(value, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + + query = query.view(bs, c, h // 2, 2, w // 2, 2) + query = rearrange(query, 'b c h t1 w t2-> b (h w) (t1 t2) c ').view( + bs, -1, 4, self.nhead, cur_dim) + + # convert 2d coordinates to 1d index + idx_gather = [] + topk_pos = topk_pos * 2 + for x in [0, 1]: + for y in [0, 1]: + idx = (topk_pos[0] + + x) * w + topk_pos[1] + y # convert to index + idx_gather.append(idx) + + idx = torch.stack(idx_gather, dim=3) # [N, L, K, 4, H, D] + + # Compute score + # query: [b, N, 4, H, D] + # key: [b, 4N, H, D] + # idx: [b, N, K, 4, H] + # QK: [b, N, 4, 4K, H] + QK = score_computation_op(query, key.contiguous(), + idx.view(bs, -1, topk_prev * 4, self.nhead)) + QK = rearrange(QK, 'n l w (k f) h -> n l w k f h', k=topk_prev, f=4) + softmax_temp = 1.0 / cur_dim**0.5 # sqrt(D) + A = torch.softmax( + softmax_temp * QK, dim=-2) # [N, L//scale**i, K, 4, H] + # Score redistribution + topk_score = topk_score.unsqueeze(-2).unsqueeze(2) + A = (A * topk_score).reshape(bs, -1, 4, topk_prev * 
4, self.nhead) + idx = idx.view(bs, -1, 1, topk_prev * 4, + self.nhead).repeat(1, 1, 4, 1, 1) # [N, L,4, K*4, H] + topk_score, topk_idx = torch.topk(A, dim=-2, k=topk, largest=True) + + if not final: + mask = torch.ones_like(A) + mask = mask.scatter( + dim=-2, index=topk_idx, src=torch.zeros_like(topk_idx).float()) + message = value_aggregation_op(A * mask, value.contiguous(), idx) + else: + message = value_aggregation_op(A, value.contiguous(), idx) + + if not final: + topk_idx = torch.gather(idx, index=topk_idx, dim=-2) + topk_idx = rearrange( + topk_idx, + 'b (h w) (t1 t2) k nh -> b (h t1 w t2) k nh', + h=h // 2, + t1=2) # reshape back + topk_score = rearrange( + topk_score, + 'b (h w) (t1 t2) k nh -> b (h t1 w t2) k nh', + h=h // 2, + t1=2) # reshape back + + return A, message, topk_score, topk_idx + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): + """Multi-head quadtree attention + Args: + queries: Query pyramid [N, C, H, W] + keys: Key pyramid [N, C, H, W] + values: Value pyramid [N, C, H, W] + Returns: + message: (N, C, H, W) + """ + + bs = queries[0].shape[0] + messages = [] + topk = self.topks[0] + + for i, (query, key, value) in enumerate( + zip(reversed(queries), reversed(keys), reversed(values))): + bs, c, h, w = key.shape + if i == 0: + A, message, topk_score, topk_idx = self.process_coarse_level( + query, key, value, + topk) # Full attention for coarest level + else: + topk_prev = topk + topk = self.topks[i] + final = True if i == len(queries) - 1 else False + A, message, topk_score, topk_idx = self.process_fine_level( + query, key, value, topk_score, topk_pos, topk_prev, topk, + final) # Quadtree attention + + messages.append(message) + if topk_idx is not None: + topk_pos = torch.stack([ # noqa + topk_idx // w, topk_idx % w + ]) # convert to coordinate + + final_message = 0 + for i, m in enumerate(messages): + if i == 0: + final_message = m + else: + final_message = final_message.unsqueeze(2) + m + final_message = rearrange( + final_message, + 'b (H W) (t1 t2) h d -> b (H t1 W t2) h d', + t1=2, + t2=2, + H=queries[-i].shape[2]) + + return final_message + + +class QTAttB(nn.Module): + + def __init__(self, + nhead, + dim, + scale, + topks=[32, 32, 32, 32], + use_dropout=False, + attention_dropout=0.1, + lepe=False): + super().__init__() + self.use_dropout = use_dropout + self.topks = topks + self.nhead = nhead + self.dim = dim + self.lepe = lepe + if lepe: # locally enhanced position encoding + self.get_vs = nn.ModuleList([ + nn.Conv2d( + dim * nhead, + dim * nhead, + kernel_size=3, + stride=1, + padding=1, + groups=dim * nhead) for _ in range(scale) + ]) + self.register_parameter('weight', nn.Parameter(torch.randn(scale))) + + def process_coarse_level(self, query, key, value, topk): + bs, c, h, w = key.shape + + cur_dim = key.shape[1] // self.nhead + key = rearrange(key, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + value = rearrange(value, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + query = rearrange(query, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) + QK = torch.einsum('nlhd,nshd->nlsh', query, key) + softmax_temp = 1.0 / cur_dim**0.5 # sqrt(D) + + A = torch.softmax(softmax_temp * QK, dim=-2) + topk_score, topk_idx = torch.topk(A, dim=-2, k=topk, largest=True) + + message = torch.einsum( + 'nlsh,nshd->nlhd', A, + value) # .reshape(bs, h, w, self.nhead, cur_dim) + + return A, message, topk_score, topk_idx + + def process_fine_level(self, + query, + key, + value, + 
topk_score, + topk_pos, + topk_prev, + topk, + final=False): + bs, c, h, w = key.shape + + cur_dim = key.shape[1] // self.nhead + key = rearrange(key, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + value = rearrange(value, + 'b c h w -> b (h w) c').view(bs, -1, self.nhead, + cur_dim) # [N, S, H, D] + + query = query.view(bs, c, h // 2, 2, w // 2, 2) + query = rearrange(query, 'b c h t1 w t2-> b (h w) (t1 t2) c ').view( + bs, -1, 4, self.nhead, cur_dim) + + # convert 2D coordiantes to 1D index + topk_pos = topk_pos * 2 + idx_gather = [] + for x in [0, 1]: + for y in [0, 1]: + idx = (topk_pos[0] + + x) * w + topk_pos[1] + y # convert to index + idx_gather.append(idx) + idx = torch.stack(idx_gather, dim=3) # [N, L, K, 4, H, D] + + # score computation + # query: [b, N, 4, H, D] + # key: [b, 4N, H, D] + # idx: [b, N, K, 4, H] + # QK: [b, N, 4, 4K, H] + QK = score_computation_op(query, key.contiguous(), + idx.view(bs, -1, topk_prev * 4, self.nhead)) + softmax_temp = 1.0 / cur_dim**0.5 # sqrt(D) + A = torch.softmax( + softmax_temp * QK, dim=-2) # [N, L//scale**i, K, 4, H] + A = A.reshape(bs, -1, 4, topk_prev * 4, self.nhead) + idx = idx.view(bs, -1, 1, topk_prev * 4, + self.nhead).repeat(1, 1, 4, 1, 1) # [N, L,4, K*4, H] + + topk_score, topk_idx = torch.topk(A, dim=-2, k=topk, largest=True) + message = value_aggregation_op(A, value.contiguous(), idx) + topk_idx = torch.gather(idx, index=topk_idx, dim=-2) + topk_idx = rearrange( + topk_idx, + 'b (h w) (t1 t2) k nh -> b (h t1 w t2) k nh', + h=h // 2, + t1=2) # reshape back + topk_score = rearrange( + topk_score, + 'b (h w) (t1 t2) k nh -> b (h t1 w t2) k nh', + h=h // 2, + t1=2) # reshape back + + return A, message, topk_score, topk_idx + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): + """Multi-head quadtree attention + Args: + queries: Query pyramid [N, C, H, W] + keys: Key pyramid [N, C, H, W] + values: Value pyramid [N, C, H, W] + Returns: + message: (N, C, H, W) + """ + + bs = queries[0].shape[0] + + messages = [] + topk = self.topks[0] + for i, (query, key, value) in enumerate( + zip(reversed(queries), reversed(keys), reversed(values))): + bs, c, h, w = key.shape + if i == 0: # Full attention for the coarest level + A, message, topk_score, topk_idx = self.process_coarse_level( + query, key, value, topk) + else: + topk_prev = topk + topk = self.topks[i] + final = True if i == len(queries) - 1 else False + A, message, topk_score, topk_idx = self.process_fine_level( + query, key, value, topk_score, topk_pos, topk_prev, topk, + final) + + messages.append(message) + topk_pos = torch.stack([ # noqa + topk_idx // w, topk_idx % w + ]) # convert to coordinate + + # Merge messages of different layers + final_message = 0 + + weight = torch.softmax(self.weight, dim=0) + for i, m in enumerate(messages): + if self.lepe: + H, W = values[-(i + 1)].shape[-2:] + lepe = self.get_vs[i](values[-(i + 1)]) + + if i == 0: + if self.lepe: + lepe = rearrange( + lepe, 'b (hd d) H W -> b (H W) hd d', hd=self.nhead) + final_message = (m + lepe) * weight[i] + else: + final_message = m * weight[i] + else: + if self.lepe: + lepe = rearrange( + lepe, + 'b (hd d) (H t1) (W t2) -> b (H W) (t1 t2) hd d', + hd=self.nhead, + t1=2, + t2=2) + final_message = final_message.unsqueeze( + 2) + (m + lepe) * weight[i] + else: + final_message = final_message.unsqueeze(2) + m * weight[i] + + final_message = rearrange( + final_message, + 'b (H W) (t1 t2) h d -> b (H t1 W t2) h d', + t1=2, + t2=2, + H=queries[-i].shape[2]) + return 
final_message diff --git a/modelscope/ops/quadtree_attention/src/__init__.py b/modelscope/ops/quadtree_attention/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/ops/quadtree_attention/src/score_computation.cpp b/modelscope/ops/quadtree_attention/src/score_computation.cpp new file mode 100644 index 00000000..0df4f273 --- /dev/null +++ b/modelscope/ops/quadtree_attention/src/score_computation.cpp @@ -0,0 +1,40 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +#include "score_computation.h" +#include +#include +#include +#include +#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +// == Forward +std::vector score_cuda_forward(torch::Tensor input1, //parameter: K*group_num, C + torch::Tensor input2, //tensor : B, N, C + torch::Tensor index) //tensor: B, N, K +{ + CHECK_INPUT(input1); + CHECK_INPUT(input2); + CHECK_INPUT(index); + return ScoreData_ongpu(input1, input2, index); + +} + +std::vector score_cuda_backward(torch::Tensor grad_output1, //B,N,C,group_num + torch::Tensor input1, //scene : N, H, W, C1 + torch::Tensor input2, // scene coords: N, H, W, 3 + torch::Tensor index) //tensor: B, N, K +{ + CHECK_INPUT(grad_output1); + CHECK_INPUT(input1); + CHECK_INPUT(input2); + CHECK_INPUT(index); + return ScoreData_backward_ongpu(grad_output1, input1, input2, index); +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("score_forward", &score_cuda_forward, "score forward (CUDA)"); + m.def("score_backward", &score_cuda_backward, "score forward (CUDA)"); +} diff --git a/modelscope/ops/quadtree_attention/src/score_computation.h b/modelscope/ops/quadtree_attention/src/score_computation.h new file mode 100644 index 00000000..9f3c19a6 --- /dev/null +++ b/modelscope/ops/quadtree_attention/src/score_computation.h @@ -0,0 +1,24 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +#ifndef _Score_CUDA +#define _Score_CUDA +#include +#include + +std::vector score_cuda_forward(torch::Tensor input1, //query t: N, H, W, C1 + torch::Tensor input2, //scene : N, H, W, C1 + torch::Tensor index); //scene : N, H, W, C1 + + + +std::vector ScoreData_ongpu(at::Tensor input1, //query t: N, H, W, C1 + at::Tensor input2, //scene : N, H, W, C1 + at::Tensor index); //scene : N, H, W, C1 + + +std::vector ScoreData_backward_ongpu(torch::Tensor grad_output1, //B,N,C,group_num + torch::Tensor input1, //scene : N, H, W, C1 + torch::Tensor input2, // scene coords: N, H, W, 3 + torch::Tensor index); //tensor: B, N, K + +#endif diff --git a/modelscope/ops/quadtree_attention/src/score_computation_kernal.cu b/modelscope/ops/quadtree_attention/src/score_computation_kernal.cu new file mode 100644 index 00000000..7511d827 --- /dev/null +++ b/modelscope/ops/quadtree_attention/src/score_computation_kernal.cu @@ -0,0 +1,186 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. 
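The quadtree attention modules above run full attention only at the coarsest level of the query/key/value pyramids; the top-k keys per query are carried to the next, finer level for refinement, while (in QTAttA) they are masked out of the coarse message. A toy CPU-only sketch of that coarse step with invented shapes (the CUDA extensions below implement the score computation and value aggregation used at the finer levels):

import torch

B, L, S, H, D, K = 1, 16, 16, 4, 8, 4   # queries, keys, heads, head dim, top-k

q = torch.randn(B, L, H, D)
k = torch.randn(B, S, H, D)
v = torch.randn(B, S, H, D)

# Full attention over all keys at the coarsest level.
QK = torch.einsum('nlhd,nshd->nlsh', q, k)
A = torch.softmax(QK / D ** 0.5, dim=-2)                  # [B, L, S, H]

# Top-k keys per query are kept for refinement at the next pyramid level ...
topk_score, topk_idx = torch.topk(A, k=K, dim=-2, largest=True)

# ... and masked out of the coarse message, which aggregates the remaining keys.
mask = torch.ones_like(A).scatter(
    dim=-2, index=topk_idx, src=torch.zeros_like(topk_idx).float())
message = torch.einsum('nlsh,nshd->nlhd', A * mask, v)    # [B, L, H, D]
print(message.shape, topk_idx.shape)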
+ +#include +#include +#include +#include +#include +#include "score_computation.h" +#include + +#define ROUND_OFF 50000 + +#define CUDA_NUM_THREADS 1024 +#define WARPS_PER_BLOCK 1 +#define THREADS_PER_WARP 32 +#define MAX_H 8 + +#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + +#define GET_BLOCKS(n, t) (n+t-1) / t + + +template +__global__ void ScoreData( + torch::PackedTensorAccessor32 query, // B, N1, 4, H, dim + torch::PackedTensorAccessor32 key, //B, N2, H, dim + torch::PackedTensorAccessor32 index, //B, N1, K*4, H + torch::PackedTensorAccessor32 output //B, N1, 4, K*4, H + ){ + extern __shared__ char patch_data_char[]; + + scalar_t *feat1_data = (scalar_t *)patch_data_char; + + + int b = blockIdx.x; + int n1 = blockIdx.y; + int f = blockIdx.z; + + int ch_off = threadIdx.x; + + int D=query.size(4); + int HD=query.size(3)*D; + int K=index.size(2); + for(int ch = ch_off; ch < HD; ch += (WARPS_PER_BLOCK*THREADS_PER_WARP)) { // CHANNELS + feat1_data[ch] = query[b][n1][f][ch/D][ch%D]; + } + __syncthreads(); + + __shared__ scalar_t score[THREADS_PER_WARP*MAX_H]; + for(int k = ch_off; k < K; k += (WARPS_PER_BLOCK*THREADS_PER_WARP)) { // CHANNELS + + for(int h=0;h ScoreData_ongpu(torch::Tensor query, // B, N1, 4, H, dim + torch::Tensor key, // B, N2, H, dim + torch::Tensor index) // B, N1, K, 4, H +{ + + const auto B = query.size(0); + const auto N1 = query.size(1); + const auto H = query.size(3); + const auto D = query.size(4); + const auto K = index.size(-2); + + + auto output = torch::zeros({B, N1, 4, K, H},torch::device(torch::kCUDA)); + + int shared_memory_per_block = H*D; + + dim3 totalBlocks(B, N1, 4); + dim3 threadsPerBlock(THREADS_PER_WARP); + AT_DISPATCH_FLOATING_TYPES(query.type(), "ScoreData_ongpu", ([&] { + ScoreData<<>>( + query.packed_accessor32(), + key.packed_accessor32(), + index.packed_accessor32(), + output.packed_accessor32()); + })); + return {output}; + +} + +template +__global__ void ScoreDataBackward( + torch::PackedTensorAccessor32 grad, //B, N1, 4, K*4, H + torch::PackedTensorAccessor32 query, //B, N1, 4, H, dim + torch::PackedTensorAccessor32 key, // B, N2, H, dim + torch::PackedTensorAccessor32 index,// B, N1, K*4, H + torch::PackedTensorAccessor32 query_grad, //B, N1, 4, H, D + torch::PackedTensorAccessor32 key_grad //B, N2, H, D + ){ + int b = blockIdx.x; + int n1 = blockIdx.y; + int f = blockIdx.z; + + extern __shared__ char patch_data_char[]; + + + int ch_off = threadIdx.x; + + int D=query.size(4); + int H=query.size(3); + int HD=H*D; + int K=index.size(2); + + scalar_t *query_data = (scalar_t *)patch_data_char; + + scalar_t *grad_data = (scalar_t *) (HD*sizeof(scalar_t)+patch_data_char); + + + for(int ch = ch_off; ch ScoreData_backward_ongpu(torch::Tensor grad_output1, //B, N1, 4, K*4, H + torch::Tensor query, //B, N1, 4, H, dim + torch::Tensor key, //B, N2, H, dim + torch::Tensor index) //B, N1, K*4, H + +{ + + const auto B = grad_output1.size(0); + const auto N1 = grad_output1.size(1); + const auto N2 = key.size(1); + const auto K = grad_output1.size(3); + const auto H = key.size(2); + const auto D = key.size(3); + + + auto query_grad = torch::zeros({B, N1, 4, H, D},torch::device(torch::kCUDA)); + + auto key_grad = torch::zeros({B, N2, H, D},torch::device(torch::kCUDA)); + + + int shared_memory_per_block = H*D+K*H; + + dim3 totalBlocks(B, N1, 4); + dim3 threadsPerBlock(THREADS_PER_WARP); + + + AT_DISPATCH_FLOATING_TYPES(key.type(), "ScoreDatabackward_ongpu", ([&] { + 
ScoreDataBackward<<>>( + grad_output1.packed_accessor32(), + query.packed_accessor32(), + key.packed_accessor32(), + index.packed_accessor32(), + query_grad.packed_accessor32(), + key_grad.packed_accessor32() + ); + })); + + return {query_grad, key_grad}; + +} diff --git a/modelscope/ops/quadtree_attention/src/utils.h b/modelscope/ops/quadtree_attention/src/utils.h new file mode 100644 index 00000000..5840bc7f --- /dev/null +++ b/modelscope/ops/quadtree_attention/src/utils.h @@ -0,0 +1,28 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +#include +#include +#include + +class Formatter { +public: + Formatter() {} + ~Formatter() {} + + template Formatter &operator<<(const Type &value) { + stream_ << value; + return *this; + } + + std::string str() const { return stream_.str(); } + operator std::string() const { return stream_.str(); } + + enum ConvertToString { to_str }; + + std::string operator>>(ConvertToString) { return stream_.str(); } + +private: + std::stringstream stream_; + Formatter(const Formatter &); + Formatter &operator=(Formatter &); +}; diff --git a/modelscope/ops/quadtree_attention/src/value_aggregation.cpp b/modelscope/ops/quadtree_attention/src/value_aggregation.cpp new file mode 100644 index 00000000..cf06093c --- /dev/null +++ b/modelscope/ops/quadtree_attention/src/value_aggregation.cpp @@ -0,0 +1,67 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +#include +#include +#include "value_aggregation.h" +//extern THCState *state; +#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +void value_aggregation_cuda_forward( + at::Tensor score, // B, N, K, H + at::Tensor value, // B, M, H, D + at::Tensor index, // B, N, K, H + at::Tensor output)// B, N, H, D +{ + CHECK_INPUT(score); + CHECK_INPUT(value); + CHECK_INPUT(index); + auto score_size = score.sizes(); + auto value_size = value.sizes(); + int B = score_size[0]; + int N = score_size[1]; + int K = score_size[2]; + int H = score_size[3]; + int M = value_size[1]; + int D = value_size[3]; + + + value_aggregation_forward_kernel(score.data(), value.data(), + index.data(), output.data(), B, N, K, H, M, D, + at::cuda::getCurrentCUDAStream()); +} + +void value_aggregation_cuda_backward( + at::Tensor grad_output, // B, N, H, D + at::Tensor score, // B, N, K, H + at::Tensor value, // B, M, H, D + at::Tensor index, // B, N, K, H + at::Tensor grad_score, // B, N, K, H + at::Tensor grad_value // B, M, H, D + ) +{ + CHECK_INPUT(score); + CHECK_INPUT(value); + CHECK_INPUT(index); + CHECK_INPUT(grad_output); + + auto score_size = score.sizes(); + auto value_size = value.sizes(); + int B = score_size[0]; + int N = score_size[1]; + int K = score_size[2]; + int H = score_size[3]; + int M = value_size[1]; + int D = value_size[3]; + + + value_aggregation_backward_kernel(grad_output.data(), score.data(), + value.data(), index.data(), grad_score.data(), grad_value.data(), + B, N, K, H, M, D, at::cuda::getCurrentCUDAStream()); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("value_aggregation_forward", &value_aggregation_cuda_forward, "value forward (CUDA)"); + m.def("value_aggregation_backward", &value_aggregation_cuda_backward, "value backward (CUDA)"); +} diff --git a/modelscope/ops/quadtree_attention/src/value_aggregation.h b/modelscope/ops/quadtree_attention/src/value_aggregation.h new file mode 100644 index 00000000..cafc2462 --- /dev/null +++ 
b/modelscope/ops/quadtree_attention/src/value_aggregation.h @@ -0,0 +1,19 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +#ifndef _VALUE_AGGREGATION_ +#define _VALUE_AGGREGATION_ +#include +#include + +void value_aggregation_forward_kernel(float* score, // B, N, K, H + float* value, // B, M, H, D + long* index, // B, N, K, H + float* output, // B, N, H, D + int B, int N, int K, int H, int M, int D, cudaStream_t stream + ); + +void value_aggregation_cuda_forward(at::Tensor score, at::Tensor value, at::Tensor index, at::Tensor output); + +void value_aggregation_backward_kernel(float* grad_output, float* score, float* value,long* index, float* grad_score, float* grad_value, int B, int N, int K, int H, int M, int D, cudaStream_t stream); + +#endif // _VALUE_AGGREGATION_ diff --git a/modelscope/ops/quadtree_attention/src/value_aggregation_kernel.cu b/modelscope/ops/quadtree_attention/src/value_aggregation_kernel.cu new file mode 100644 index 00000000..f34b60f4 --- /dev/null +++ b/modelscope/ops/quadtree_attention/src/value_aggregation_kernel.cu @@ -0,0 +1,88 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +#include +#include +#include +#include +#include +#include "value_aggregation.h" +#include "THC/THCAtomics.cuh" +#include +#include "utils.h" + +#define ROUND_OFF 50000 + +#define CUDA_NUM_THREADS 1024 +#define WARPS_PER_BLOCK 1 +#define THREADS_PER_WARP 32 + +#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + +#define GET_BLOCKS(n, t) (n+t-1) / t + +__global__ void ValueAggregationForwardFunc(float* score, float* value, long* index, float* output, int B, int N, int K, int H, int M, int D) { + ///* + long LENGTH = B*N*H*D; + CUDA_KERNEL_LOOP(cur_idx, LENGTH){ + long d_idx = cur_idx % D; + long h_idx = (cur_idx - d_idx) / D % H; + long n_idx = (cur_idx - d_idx - h_idx * D) / D / H % N; + long b_idx = (cur_idx - d_idx - h_idx * D - n_idx * H * D) / D / H / N; + if (cur_idx < LENGTH) { + long score_start_idx = b_idx * N * K * H + n_idx * K * H + h_idx; + long value_start_idx = b_idx * M * H * D + h_idx * D + d_idx; + + float out_val = 0; + for(int k_idx = 0; k_idx < K; k_idx++){ + int score_idx = score_start_idx + k_idx * H; + int value_idx = value_start_idx + index[score_idx] * H * D; + out_val += score[score_idx] * value[value_idx]; + } + output[cur_idx] = out_val; + } + } +} + + +void value_aggregation_forward_kernel(float* score, float* value, long* index, float* ouput, int B, int N, int K, int H, int M, int D, cudaStream_t stream){ + ValueAggregationForwardFunc + <<>>(score, value, index, ouput, B, N, K, H, M, D); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error(Formatter() + << "CUDA kernel failed : " << std::to_string(err)); +} + +__global__ void ValueAggregationBackwardFunc(float* grad_output, float* score, float* value, long* index, float* grad_score, + float* grad_value, int B, int N, int K, int H, int M, int D) { + long LENGTH = B*N*K*H; + CUDA_KERNEL_LOOP(cur_idx, LENGTH){ + long h_idx = cur_idx % H; + long k_idx = (cur_idx - h_idx) / H % K; + long n_idx = (cur_idx - h_idx - k_idx * H) / H / K % N; + long b_idx = (cur_idx - h_idx - k_idx * H - n_idx * H * K) / H / K / N; + + if (cur_idx < LENGTH) { + long output_start_idx = b_idx * N * H * D + n_idx * H * D + h_idx * D; + long value_start_idx = b_idx * M * H * D + h_idx * D; + for (int d_idx = 0; d_idx < D; d_idx ++){ + long output_idx = output_start_idx + d_idx; + long value_idx = value_start_idx + 
index[cur_idx] * H * D + d_idx; + auto grad_output_val = grad_output[output_idx]; + grad_score[cur_idx] += grad_output_val * value[value_idx]; + gpuAtomicAdd(&grad_value[value_idx], grad_output_val * score[cur_idx]); + } + } + } +} + +void value_aggregation_backward_kernel(float* grad_output, float* score, float* value, long* index, float* grad_score, float* grad_value, int B, int N, int K, int H, int M, int D, cudaStream_t stream){ + ValueAggregationBackwardFunc + <<>>(grad_output, score, value, index, grad_score, grad_value, B, N, K, H, M, D); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error(Formatter() + << "CUDA kernel failed : " << std::to_string(err)); +} diff --git a/modelscope/outputs/__init__.py b/modelscope/outputs/__init__.py index 47e66714..dbfdac0d 100644 --- a/modelscope/outputs/__init__.py +++ b/modelscope/outputs/__init__.py @@ -1,2 +1,3 @@ -from .nlp.model_outputs import * # noqa +from .cv_outputs import * # noqa +from .nlp_outputs import * # noqa from .outputs import TASK_OUTPUTS, ModelOutputBase, OutputKeys diff --git a/modelscope/outputs/nlp/model_outputs.py b/modelscope/outputs/nlp_outputs.py similarity index 98% rename from modelscope/outputs/nlp/model_outputs.py rename to modelscope/outputs/nlp_outputs.py index 464ba7ef..e285d40f 100644 --- a/modelscope/outputs/nlp/model_outputs.py +++ b/modelscope/outputs/nlp_outputs.py @@ -157,6 +157,9 @@ class FaqQuestionAnsweringOutput(ModelOutputBase): """ scores: Tensor = None + labels: Tensor = None + loss: Tensor = None + logits: Tensor = None @dataclass @@ -402,3 +405,13 @@ class AttentionTokenClassificationModelOutput(TokenClassificationModelOutput): """ attentions: Tensor = None hidden_states: Tensor = None + + +@dataclass +class DialogueUserSatisfactionEstimationModelOutput(ModelOutputBase): + """The output class for user satisfaction estimation. + + Args: + logits (`Tensor`): The logits output of the model. 
+ """ + logits: Tensor = None diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index 371b89f5..d8b95ab4 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -21,14 +21,17 @@ class OutputKeys(object): MASKS = 'masks' DEPTHS = 'depths' DEPTHS_COLOR = 'depths_color' + LAYOUT = 'layout' TEXT = 'text' POLYGONS = 'polygons' OUTPUT = 'output' OUTPUT_IMG = 'output_img' OUTPUT_VIDEO = 'output_video' OUTPUT_PCM = 'output_pcm' + OUTPUT_PCM_LIST = 'output_pcm_list' OUTPUT_WAV = 'output_wav' IMG_EMBEDDING = 'img_embedding' + SPK_EMBEDDING = 'spk_embedding' SPO_LIST = 'spo_list' TEXT_EMBEDDING = 'text_embedding' TRANSLATION = 'translation' @@ -50,6 +53,9 @@ class OutputKeys(object): SCENE_NUM = 'scene_num' SCENE_META_LIST = 'scene_meta_list' SHOT_META_LIST = 'shot_meta_list' + MATCHES = 'matches' + PCD12 = 'pcd12' + PCD12_ALIGN = 'pcd12_align' TASK_OUTPUTS = { @@ -70,6 +76,16 @@ TASK_OUTPUTS = { # "text": "电子元器件提供BOM配单" # } Tasks.ocr_recognition: [OutputKeys.TEXT], + Tasks.sudoku: [OutputKeys.TEXT], + Tasks.text2sql: [OutputKeys.TEXT], + + # document vl embedding for single sample + # { + # "img_embedding": np.array with shape [M, D], + # "text_embedding": np.array with shape [N, D] + # } + Tasks.document_vl_embedding: + [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING], # face 2d keypoint result for single sample # { @@ -130,6 +146,13 @@ TASK_OUTPUTS = { Tasks.card_detection: [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # facial expression recognition result for single sample + # { + # "scores": [0.9] + # "boxes": [x1, y1, x2, y2] + # } + Tasks.face_liveness: [OutputKeys.SCORES, OutputKeys.BOXES], + # facial expression recognition result for single sample # { # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02], @@ -171,6 +194,13 @@ TASK_OUTPUTS = { # } Tasks.face_recognition: [OutputKeys.IMG_EMBEDDING], + # face recognition ood result for single sample + # { + # "img_embedding": np.array with shape [1, D], + # "ood_score ": [0.95] + # } + Tasks.face_recognition_ood: [OutputKeys.IMG_EMBEDDING, OutputKeys.SCORES], + # human detection result for single sample # { # "scores": [0.9, 0.1, 0.05, 0.05] @@ -210,6 +240,8 @@ TASK_OUTPUTS = { # } Tasks.image_object_detection: [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], + Tasks.domain_specific_object_detection: + [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], # video object detection result for single sample # { @@ -281,6 +313,11 @@ TASK_OUTPUTS = { Tasks.image_portrait_stylization: [OutputKeys.OUTPUT_IMG], Tasks.image_body_reshaping: [OutputKeys.OUTPUT_IMG], + # video editing task result for a single video + # {"output_video": "path_to_rendered_video"} + Tasks.video_frame_interpolation: [OutputKeys.OUTPUT_VIDEO], + Tasks.video_super_resolution: [OutputKeys.OUTPUT_VIDEO], + # live category recognition result for single video # { # "scores": [0.885272, 0.014790631, 0.014558001] @@ -330,8 +367,9 @@ TASK_OUTPUTS = { # "output_video": "path_to_rendered_video" , this is optional # and is only avaialbe when the "render" option is enabled. 
# } - Tasks.body_3d_keypoints: - [OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], + Tasks.body_3d_keypoints: [ + OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO + ], # 2D hand keypoints result for single sample # { @@ -357,8 +395,22 @@ TASK_OUTPUTS = { # ], # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"] # } - Tasks.video_single_object_tracking: - [OutputKeys.BOXES, OutputKeys.TIMESTAMPS], + Tasks.video_single_object_tracking: [ + OutputKeys.BOXES, OutputKeys.TIMESTAMPS + ], + + # video multi object tracking result for single video + # { + # "boxes": [ + # [frame_num, obj_id, x1, y1, x2, y2], + # [frame_num, obj_id, x1, y1, x2, y2], + # [frame_num, obj_id, x1, y1, x2, y2], + # ], + # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"] + # } + Tasks.video_multi_object_tracking: [ + OutputKeys.BOXES, OutputKeys.TIMESTAMPS + ], # live category recognition result for single video # { @@ -386,6 +438,10 @@ TASK_OUTPUTS = { # } Tasks.video_embedding: [OutputKeys.VIDEO_EMBEDDING], + # video stabilization task result for a single video + # {"output_video": "path_to_rendered_video"} + Tasks.video_stabilization: [OutputKeys.OUTPUT_VIDEO], + # virtual_try_on result for a single sample # { # "output_img": np.ndarray with shape [height, width, 3] @@ -467,9 +523,11 @@ TASK_OUTPUTS = { # { # "masks": [np.array # 3D array with shape [frame_num, height, width]] # "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"] + # "output_video": "path_to_rendered_video" , this is optional + # and is only avaialbe when the "render" option is enabled. # } Tasks.referring_video_object_segmentation: [ - OutputKeys.MASKS, OutputKeys.TIMESTAMPS + OutputKeys.MASKS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO ], # video human matting result for a single video @@ -673,6 +731,18 @@ TASK_OUTPUTS = { # { "text": "每一天都要快乐喔"} Tasks.auto_speech_recognition: [OutputKeys.TEXT], + # itn result for single sample + # {"text": "123"} + Tasks.inverse_text_processing: [OutputKeys.TEXT], + + # speaker verification for single compare task + # {'score': 84.2332} + Tasks.speaker_verification: [OutputKeys.SCORES], + + # punctuation result for single sample + # { "text": "你好,明天!"} + Tasks.punctuation: [OutputKeys.TEXT], + # audio processed for single file in PCM format # { # "output_pcm": pcm encoded audio bytes @@ -680,6 +750,7 @@ TASK_OUTPUTS = { Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM], Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM], Tasks.acoustic_noise_suppression: [OutputKeys.OUTPUT_PCM], + Tasks.speech_separation: [OutputKeys.OUTPUT_PCM_LIST], # text_to_speech result for a single sample # { @@ -707,6 +778,12 @@ TASK_OUTPUTS = { # "caption": "this is an image caption text." # } Tasks.image_captioning: [OutputKeys.CAPTION], + + # video caption result for single sample + # { + # "caption": "this is an video caption text." + # } + Tasks.video_captioning: [OutputKeys.CAPTION], Tasks.ocr_recognition: [OutputKeys.TEXT], # visual grounding result for single sample @@ -765,6 +842,10 @@ TASK_OUTPUTS = { # {"text": "this is a text answser. "} Tasks.visual_question_answering: [OutputKeys.TEXT], + # VideoQA result for a sample + # {"text": "this is a text answser. 
"} + Tasks.video_question_answering: [OutputKeys.TEXT], + # auto_speech_recognition result for a single sample # { # "text": "每天都要快乐喔" diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 50818dff..eab76cb3 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -3,7 +3,6 @@ import numpy as np from PIL import Image -from modelscope.models.base.base_head import Input from modelscope.utils.constant import Tasks @@ -38,10 +37,10 @@ INPUT_TYPE = { def check_input_type(input_type, input): expected_type = INPUT_TYPE[input_type] - if expected_type == 'cv2.VideoCapture': + if input_type == InputType.VIDEO: # special type checking using class name, to avoid introduction of opencv dependency into fundamental framework. - assert type(input).__name__ == 'VideoCapture',\ - f'invalid input type for {input_type}, expected cv2.VideoCapture but got {type(input)}\n {input}' + assert type(input).__name__ == 'VideoCapture' or isinstance(input, expected_type),\ + f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}' else: assert isinstance(input, expected_type), \ f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}' @@ -53,7 +52,7 @@ TASK_INPUTS = { # if task input is a dict, value is a dict of InputType, where key # equals the one needed in pipeline input dict # if task input is a list, value is a set of input format, in which - # each elements corresponds to one input format as described above. + # each element corresponds to one input format as described above. # ============ vision tasks =================== Tasks.ocr_detection: InputType.IMAGE, @@ -77,10 +76,14 @@ TASK_INPUTS = { InputType.IMAGE, Tasks.image_object_detection: InputType.IMAGE, + Tasks.domain_specific_object_detection: + InputType.IMAGE, Tasks.image_segmentation: InputType.IMAGE, Tasks.portrait_matting: InputType.IMAGE, + Tasks.image_fewshot_detection: + InputType.IMAGE, # image editing task result for a single image Tasks.skin_retouching: @@ -128,6 +131,8 @@ TASK_INPUTS = { Tasks.hand_2d_keypoints: InputType.IMAGE, Tasks.video_single_object_tracking: (InputType.VIDEO, InputType.BOX), + Tasks.video_multi_object_tracking: + InputType.VIDEO, Tasks.video_category: InputType.VIDEO, Tasks.product_retrieval_embedding: @@ -198,6 +203,12 @@ TASK_INPUTS = { 'src': InputType.LIST, 'ref': InputType.LIST, }, + Tasks.sudoku: + InputType.TEXT, + Tasks.text2sql: { + 'text': InputType.TEXT, + 'database': InputType.TEXT + }, # ============ audio tasks =================== Tasks.auto_speech_recognition: @@ -211,17 +222,24 @@ TASK_INPUTS = { 'nearend_mic': InputType.AUDIO, 'farend_speech': InputType.AUDIO }, + Tasks.speech_separation: + InputType.AUDIO, Tasks.acoustic_noise_suppression: InputType.AUDIO, Tasks.text_to_speech: InputType.TEXT, Tasks.keyword_spotting: InputType.AUDIO, + Tasks.inverse_text_processing: + InputType.TEXT, # ============ multi-modal tasks =================== Tasks.image_captioning: [InputType.IMAGE, { 'image': InputType.IMAGE, }], + Tasks.video_captioning: [InputType.VIDEO, { + 'video': InputType.VIDEO, + }], Tasks.visual_grounding: { 'image': InputType.IMAGE, 'text': InputType.TEXT @@ -245,6 +263,10 @@ TASK_INPUTS = { 'image': InputType.IMAGE, 'text': InputType.TEXT }, + Tasks.video_question_answering: { + 'video': InputType.VIDEO, + 'text': InputType.TEXT + }, Tasks.visual_entailment: { 'image': InputType.IMAGE, 'text': InputType.TEXT, diff --git a/modelscope/pipelines/audio/__init__.py 
b/modelscope/pipelines/audio/__init__.py index b46ca87e..3ad32b3d 100644 --- a/modelscope/pipelines/audio/__init__.py +++ b/modelscope/pipelines/audio/__init__.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from .kws_kwsbp_pipeline import KeyWordSpottingKwsbpPipeline from .linear_aec_pipeline import LinearAECPipeline from .text_to_speech_pipeline import TextToSpeechSambertHifiganPipeline - + from .inverse_text_processing_pipeline import InverseTextProcessingPipeline else: _import_structure = { 'ans_pipeline': ['ANSPipeline'], @@ -19,6 +19,7 @@ else: 'kws_kwsbp_pipeline': ['KeyWordSpottingKwsbpPipeline'], 'linear_aec_pipeline': ['LinearAECPipeline'], 'text_to_speech_pipeline': ['TextToSpeechSambertHifiganPipeline'], + 'itn_inference_pipeline': ['InverseTextProcessingPipeline'] } import sys diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py index ae04f726..5a339a08 100644 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_inference_pipeline.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os from typing import Any, Dict, List, Sequence, Tuple, Union import yaml @@ -10,6 +11,7 @@ from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import WavToScp from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav, + generate_scp_from_url, load_bytes_from_url) from modelscope.utils.constant import Frameworks, Tasks from modelscope.utils.logger import get_logger @@ -34,6 +36,40 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): super().__init__(model=model, preprocessor=preprocessor, **kwargs) self.model_cfg = self.model.forward() + self.output_dir = None + if 'output_dir' in kwargs: + self.output_dir = kwargs['output_dir'] + self.cmd = self.get_cmd(kwargs) + if self.cmd['code_base'] == 'funasr': + from funasr.bin import asr_inference_launch + self.funasr_infer_modelscope = asr_inference_launch.inference_launch( + mode=self.cmd['mode'], + batch_size=self.cmd['batch_size'], + maxlenratio=self.cmd['maxlenratio'], + minlenratio=self.cmd['minlenratio'], + beam_size=self.cmd['beam_size'], + ngpu=self.cmd['ngpu'], + num_workers=self.cmd['num_workers'], + ctc_weight=self.cmd['ctc_weight'], + lm_weight=self.cmd['lm_weight'], + penalty=self.cmd['penalty'], + log_level=self.cmd['log_level'], + cmvn_file=self.cmd['cmvn_file'], + asr_train_config=self.cmd['asr_train_config'], + asr_model_file=self.cmd['asr_model_file'], + lm_file=self.cmd['lm_file'], + lm_train_config=self.cmd['lm_train_config'], + frontend_conf=self.cmd['frontend_conf'], + token_num_relax=self.cmd['token_num_relax'], + decoding_ind=self.cmd['decoding_ind'], + decoding_mode=self.cmd['decoding_mode'], + vad_model_file=self.cmd['vad_model_name'], + vad_infer_config=self.cmd['vad_model_config'], + vad_cmvn_file=self.cmd['vad_mvn_file'], + punc_model_file=self.cmd['punc_model_name'], + punc_infer_config=self.cmd['punc_model_config'], + output_dir=self.output_dir) + def __call__(self, audio_in: Union[str, bytes], audio_fs: int = None, @@ -41,11 +77,30 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): audio_format: str = None) -> Dict[str, Any]: from funasr.utils import asr_utils + # code base + code_base = self.cmd['code_base'] self.recog_type = recog_type self.audio_format = audio_format self.audio_fs = audio_fs - - if isinstance(audio_in, str): + checking_audio_fs = None + if code_base == 'funasr': + if isinstance(audio_in, 
str): + # for funasr code, generate wav.scp from url or local path + self.audio_in, self.raw_inputs = generate_scp_from_url( + audio_in) + elif isinstance(audio_in, bytes): + self.audio_in = audio_in + self.raw_inputs = None + else: + import numpy + import torch + if isinstance(audio_in, torch.Tensor): + self.audio_in = None + self.raw_inputs = audio_in + elif isinstance(audio_in, numpy.ndarray): + self.audio_in = None + self.raw_inputs = audio_in + elif isinstance(audio_in, str): # load pcm data from url if audio_in is url str self.audio_in, checking_audio_fs = load_bytes_from_url(audio_in) elif isinstance(audio_in, bytes): @@ -64,22 +119,154 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): recog_type=recog_type, audio_format=audio_format) - if hasattr(asr_utils, 'sample_rate_checking'): + if hasattr(asr_utils, + 'sample_rate_checking') and self.audio_in is not None: checking_audio_fs = asr_utils.sample_rate_checking( self.audio_in, self.audio_format) if checking_audio_fs is not None: self.audio_fs = checking_audio_fs - if self.preprocessor is None: - self.preprocessor = WavToScp() - output = self.preprocessor.forward(self.model_cfg, self.recog_type, self.audio_format, self.audio_in, - self.audio_fs) + self.audio_fs, self.cmd) output = self.forward(output) rst = self.postprocess(output) return rst + def get_cmd(self, extra_args) -> Dict[str, Any]: + if self.preprocessor is None: + self.preprocessor = WavToScp() + + outputs = self.preprocessor.config_checking(self.model_cfg) + + # generate asr inference command + cmd = { + 'model_type': outputs['model_type'], + 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available + 'log_level': 'ERROR', + 'asr_model_file': outputs['am_model_path'], + 'idx_text': '', + 'sampled_ids': 'seq2seq/sampled_ids', + 'sampled_lengths': 'seq2seq/sampled_lengths', + 'lang': 'zh-cn', + 'code_base': outputs['code_base'], + 'mode': outputs['mode'], + 'fs': { + 'model_fs': 16000 + } + } + + if self.framework == Frameworks.torch: + frontend_conf = None + token_num_relax = None + decoding_ind = None + decoding_mode = None + if os.path.exists(outputs['asr_model_config']): + config_file = open( + outputs['asr_model_config'], encoding='utf-8') + root = yaml.full_load(config_file) + config_file.close() + if 'frontend_conf' in root: + frontend_conf = root['frontend_conf'] + if 'token_num_relax' in root: + token_num_relax = root['token_num_relax'] + if 'decoding_ind' in root: + decoding_ind = root['decoding_ind'] + if 'decoding_mode' in root: + decoding_mode = root['decoding_mode'] + + cmd['beam_size'] = root['beam_size'] + cmd['penalty'] = root['penalty'] + cmd['maxlenratio'] = root['maxlenratio'] + cmd['minlenratio'] = root['minlenratio'] + cmd['ctc_weight'] = root['ctc_weight'] + cmd['lm_weight'] = root['lm_weight'] + else: + # for vad task, no asr_model_config + cmd['beam_size'] = None + cmd['penalty'] = None + cmd['maxlenratio'] = None + cmd['minlenratio'] = None + cmd['ctc_weight'] = None + cmd['lm_weight'] = None + cmd['asr_train_config'] = outputs['am_model_config'] + cmd['lm_file'] = outputs['lm_model_path'] + cmd['lm_train_config'] = outputs['lm_model_config'] + cmd['batch_size'] = outputs['model_config']['batch_size'] + cmd['frontend_conf'] = frontend_conf + if frontend_conf is not None and 'fs' in frontend_conf: + cmd['fs']['model_fs'] = frontend_conf['fs'] + cmd['token_num_relax'] = token_num_relax + cmd['decoding_ind'] = decoding_ind + cmd['decoding_mode'] = decoding_mode + cmd['num_workers'] = 0 + if outputs.__contains__('mvn_file'): + 
cmd['cmvn_file'] = outputs['mvn_file'] + else: + cmd['cmvn_file'] = None + if outputs.__contains__('vad_model_name'): + cmd['vad_model_name'] = outputs['vad_model_name'] + else: + cmd['vad_model_name'] = None + if outputs.__contains__('vad_model_config'): + cmd['vad_model_config'] = outputs['vad_model_config'] + else: + cmd['vad_model_config'] = None + if outputs.__contains__('vad_mvn_file'): + cmd['vad_mvn_file'] = outputs['vad_mvn_file'] + else: + cmd['vad_mvn_file'] = None + if outputs.__contains__('punc_model_name'): + cmd['punc_model_name'] = outputs['punc_model_name'] + else: + cmd['punc_model_name'] = None + if outputs.__contains__('punc_model_config'): + cmd['punc_model_config'] = outputs['punc_model_config'] + else: + cmd['punc_model_config'] = None + if 'batch_size' in extra_args: + cmd['batch_size'] = extra_args['batch_size'] + if 'mode' in extra_args: + cmd['mode'] = extra_args['mode'] + if 'ngpu' in extra_args: + cmd['ngpu'] = extra_args['ngpu'] + if 'beam_size' in extra_args: + cmd['beam_size'] = extra_args['beam_size'] + if 'decoding_ind' in extra_args: + cmd['decoding_ind'] = extra_args['decoding_ind'] + if 'decoding_mode' in extra_args: + cmd['decoding_mode'] = extra_args['decoding_mode'] + if 'vad_model_file' in extra_args: + cmd['vad_model_name'] = extra_args['vad_model_file'] + if 'vad_infer_config' in extra_args: + cmd['vad_model_config'] = extra_args['vad_infer_config'] + if 'vad_cmvn_file' in extra_args: + cmd['vad_mvn_file'] = extra_args['vad_cmvn_file'] + if 'punc_model_file' in extra_args: + cmd['punc_model_name'] = extra_args['punc_model_file'] + if 'punc_infer_config' in extra_args: + cmd['punc_model_config'] = extra_args['punc_infer_config'] + + elif self.framework == Frameworks.tf: + cmd['fs']['model_fs'] = outputs['model_config']['fs'] + cmd['hop_length'] = outputs['model_config']['hop_length'] + cmd['feature_dims'] = outputs['model_config']['feature_dims'] + cmd['predictions_file'] = 'text' + cmd['mvn_file'] = outputs['am_mvn_file'] + cmd['vocab_file'] = outputs['vocab_file'] + cmd['lang'] = outputs['model_lang'] + if 'idx_text' in outputs: + cmd['idx_text'] = outputs['idx_text'] + if 'sampled_ids' in outputs['model_config']: + cmd['sampled_ids'] = outputs['model_config']['sampled_ids'] + if 'sampled_lengths' in outputs['model_config']: + cmd['sampled_lengths'] = outputs['model_config'][ + 'sampled_lengths'] + else: + raise ValueError('model type is mismatching') + + return cmd + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """Decoding """ @@ -87,91 +274,22 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): logger.info(f"Decoding with {inputs['audio_format']} files ...") data_cmd: Sequence[Tuple[str, str, str]] - if inputs['audio_format'] == 'wav' or inputs['audio_format'] == 'pcm': - data_cmd = ['speech', 'sound'] - elif inputs['audio_format'] == 'kaldi_ark': - data_cmd = ['speech', 'kaldi_ark'] - elif inputs['audio_format'] == 'tfrecord': - data_cmd = ['speech', 'tfrecord'] + if isinstance(self.audio_in, bytes): + data_cmd = [self.audio_in, 'speech', 'bytes'] + elif isinstance(self.audio_in, str): + data_cmd = [self.audio_in, 'speech', 'sound'] + elif self.raw_inputs is not None: + data_cmd = None - if inputs.__contains__('mvn_file'): + if inputs.__contains__('mvn_file') and inputs['audio_format'] != 'scp': data_cmd.append(inputs['mvn_file']) # generate asr inference command - cmd = { - 'model_type': inputs['model_type'], - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'log_level': 'ERROR', - 'audio_in': 
inputs['audio_lists'], - 'name_and_type': data_cmd, - 'asr_model_file': inputs['am_model_path'], - 'idx_text': '', - 'sampled_ids': 'seq2seq/sampled_ids', - 'sampled_lengths': 'seq2seq/sampled_lengths', - 'lang': 'zh-cn', - 'code_base': inputs['code_base'], - 'mode': inputs['mode'], - 'fs': { - 'audio_fs': inputs['audio_fs'], - 'model_fs': 16000 - } - } + self.cmd['name_and_type'] = data_cmd + self.cmd['fs']['audio_fs'] = inputs['audio_fs'] + self.cmd['raw_inputs'] = self.raw_inputs - if self.framework == Frameworks.torch: - config_file = open(inputs['asr_model_config'], encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - frontend_conf = None - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - token_num_relax = None - if 'token_num_relax' in root: - token_num_relax = root['token_num_relax'] - decoding_ind = None - if 'decoding_ind' in root: - decoding_ind = root['decoding_ind'] - decoding_mode = None - if 'decoding_mode' in root: - decoding_mode = root['decoding_mode'] - - cmd['beam_size'] = root['beam_size'] - cmd['penalty'] = root['penalty'] - cmd['maxlenratio'] = root['maxlenratio'] - cmd['minlenratio'] = root['minlenratio'] - cmd['ctc_weight'] = root['ctc_weight'] - cmd['lm_weight'] = root['lm_weight'] - cmd['asr_train_config'] = inputs['am_model_config'] - cmd['lm_file'] = inputs['lm_model_path'] - cmd['lm_train_config'] = inputs['lm_model_config'] - cmd['batch_size'] = inputs['model_config']['batch_size'] - cmd['frontend_conf'] = frontend_conf - if frontend_conf is not None and 'fs' in frontend_conf: - cmd['fs']['model_fs'] = frontend_conf['fs'] - cmd['token_num_relax'] = token_num_relax - cmd['decoding_ind'] = decoding_ind - cmd['decoding_mode'] = decoding_mode - cmd['num_workers'] = 0 - - elif self.framework == Frameworks.tf: - cmd['fs']['model_fs'] = inputs['model_config']['fs'] - cmd['hop_length'] = inputs['model_config']['hop_length'] - cmd['feature_dims'] = inputs['model_config']['feature_dims'] - cmd['predictions_file'] = 'text' - cmd['mvn_file'] = inputs['am_mvn_file'] - cmd['vocab_file'] = inputs['vocab_file'] - cmd['lang'] = inputs['model_lang'] - if 'idx_text' in inputs: - cmd['idx_text'] = inputs['idx_text'] - if 'sampled_ids' in inputs['model_config']: - cmd['sampled_ids'] = inputs['model_config']['sampled_ids'] - if 'sampled_lengths' in inputs['model_config']: - cmd['sampled_lengths'] = inputs['model_config'][ - 'sampled_lengths'] - - else: - raise ValueError('model type is mismatching') - - inputs['asr_result'] = self.run_inference(cmd) + inputs['asr_result'] = self.run_inference(self.cmd) return inputs @@ -247,34 +365,10 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): def run_inference(self, cmd): asr_result = [] if self.framework == Frameworks.torch and cmd['code_base'] == 'funasr': - from funasr.bin import asr_inference_launch - - if hasattr(asr_inference_launch, 'set_parameters'): - asr_inference_launch.set_parameters(sample_rate=cmd['fs']) - asr_inference_launch.set_parameters(language=cmd['lang']) - - asr_result = asr_inference_launch.inference_launch( - mode=cmd['mode'], - batch_size=cmd['batch_size'], - maxlenratio=cmd['maxlenratio'], - minlenratio=cmd['minlenratio'], - beam_size=cmd['beam_size'], - ngpu=cmd['ngpu'], - num_workers=cmd['num_workers'], - ctc_weight=cmd['ctc_weight'], - lm_weight=cmd['lm_weight'], - penalty=cmd['penalty'], - log_level=cmd['log_level'], + asr_result = self.funasr_infer_modelscope( data_path_and_name_and_type=cmd['name_and_type'], - audio_lists=cmd['audio_in'], - 
asr_train_config=cmd['asr_train_config'], - asr_model_file=cmd['asr_model_file'], - lm_file=cmd['lm_file'], - lm_train_config=cmd['lm_train_config'], - frontend_conf=cmd['frontend_conf'], - token_num_relax=cmd['token_num_relax'], - decoding_ind=cmd['decoding_ind'], - decoding_mode=cmd['decoding_mode']) + raw_inputs=cmd['raw_inputs']) + elif self.framework == Frameworks.torch: from easyasr import asr_inference_paraformer_espnet diff --git a/modelscope/pipelines/audio/inverse_text_processing_pipeline.py b/modelscope/pipelines/audio/inverse_text_processing_pipeline.py new file mode 100644 index 00000000..f5282691 --- /dev/null +++ b/modelscope/pipelines/audio/inverse_text_processing_pipeline.py @@ -0,0 +1,123 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +from typing import Any, Dict, List, Sequence, Tuple, Union + +import yaml + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Frameworks, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['InverseTextProcessingPipeline'] + + +@PIPELINES.register_module( + Tasks.inverse_text_processing, module_name=Pipelines.itn_inference) +class InverseTextProcessingPipeline(Pipeline): + """Inverse Text Processing Inference Pipeline + use `model` to create an Inverse Text Processing pipeline. + + Args: + model (Model or str): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_itn = pipeline( + >>> task=Tasks.inverse_text_processing, model='damo/speech_inverse_text_processing_fun-text-processing-itn-id') + >>> sentence = 'sembilan ribu sembilan ratus sembilan puluh sembilan' + >>> print(pipeline_itn(sentence)) + + To view other examples please check tests/pipelines/test_inverse_text_processing.py.
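+
+    Note (illustrative only, not a verified run): the Indonesian sentence above
+    spells out the number 9999, so the call is expected to return a dict of the
+    form {'text': '9999'}, i.e. the 'text' key filled in by __call__ below.
+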
+ """ + + def __init__(self, model: Union[Model, str] = None, **kwargs): + """use `model` to create an asr pipeline for prediction + """ + super().__init__(model=model, **kwargs) + self.model_cfg = self.model.forward() + + def __call__(self, text_in: str = None) -> Dict[str, Any]: + + if len(text_in) == 0: + raise ValueError('The input of ITN should not be null.') + else: + self.text_in = text_in + output = {} + itn_result = self.forward(self.text_in) + output['text'] = itn_result + + return output + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """Postprocessing + """ + return inputs + + def forward(self, text_in: str = None) -> str: + """Decoding + """ + logger.info('Inverse Text Normalization: {0} ...'.format(text_in)) + lang = self.model_cfg['model_config']['lang'] + model_dir = self.model_cfg['model_workspace'] + itn_model_path = self.model_cfg['itn_model_path'] + + # make directory recursively + cache_dir = os.path.join(model_dir, lang, '.cache') + if not os.path.isdir(cache_dir): + os.makedirs(cache_dir, mode=0o777, exist_ok=True) + + name = '_{0}_itn.far'.format(lang) + far_file = os.path.join(cache_dir, name) + + # copy file into cache_dir + shutil.copy(itn_model_path, far_file) + + # generate itn inference command + cmd = { + 'ngpu': 0, + 'log_level': 'ERROR', + 'text_in': text_in, + 'itn_model_file': far_file, + 'cache_dir': cache_dir, + 'overwrite_cache': False, + 'enable_standalone_number': True, + 'enable_0_to_9': True, + 'lang': lang, + 'verbose': False, + } + + itn_result = self.run_inference(cmd) + + return itn_result + + def run_inference(self, cmd): + itn_result = '' + if self.framework == Frameworks.torch: + from fun_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + if cmd['lang'] == 'ja': + itn_normalizer = InverseNormalizer( + lang=cmd['lang'], + cache_dir=cmd['cache_dir'], + overwrite_cache=cmd['overwrite_cache'], + enable_standalone_number=cmd['enable_standalone_number'], + enable_0_to_9=cmd['enable_0_to_9']) + else: + itn_normalizer = InverseNormalizer( + lang=cmd['lang'], + cache_dir=cmd['cache_dir'], + overwrite_cache=cmd['overwrite_cache']) + itn_result = itn_normalizer.inverse_normalize( + cmd['text_in'], verbose=cmd['verbose']) + + else: + raise ValueError('model type is mismatching') + + return itn_result diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py index db6fc65d..67ea3ab3 100644 --- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py +++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py @@ -56,7 +56,8 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): # load pcm data from wav data if audio_in is wave format audio_in, audio_fs = extract_pcm_from_wav(audio_in) - output = self.preprocessor.forward(self.model.forward(), audio_in) + # model.forward return model dir and config file when testing with kwsbp + output = self.preprocessor.forward(self.model.forward(None), audio_in) output = self.forward(output) rst = self.postprocess(output) return rst diff --git a/modelscope/pipelines/audio/punctuation_processing_pipeline.py b/modelscope/pipelines/audio/punctuation_processing_pipeline.py new file mode 100644 index 00000000..072f9e85 --- /dev/null +++ b/modelscope/pipelines/audio/punctuation_processing_pipeline.py @@ -0,0 +1,138 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +from typing import Any, Dict, List, Sequence, Tuple, Union + +import yaml + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.audio.audio_utils import generate_text_from_url +from modelscope.utils.constant import Frameworks, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['PunctuationProcessingPipeline'] + + +@PIPELINES.register_module( + Tasks.punctuation, module_name=Pipelines.punc_inference) +class PunctuationProcessingPipeline(Pipeline): + """Punctuation Processing Inference Pipeline + use `model` to create a Punctuation Processing pipeline. + + Args: + model (PunctuationProcessingPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_punc = pipeline( + >>> task=Tasks.punctuation, model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch') + >>> text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt' + >>> print(pipeline_punc(text_in)) + + """ + + def __init__(self, model: Union[Model, str] = None, **kwargs): + """use `model` to create an asr pipeline for prediction + """ + super().__init__(model=model, **kwargs) + self.model_cfg = self.model.forward() + self.cmd = self.get_cmd() + self.output_dir = None + if 'output_dir' in kwargs: + self.output_dir = kwargs['output_dir'] + from funasr.bin import punc_inference_launch + self.funasr_infer_modelscope = punc_inference_launch.inference_launch( + mode=self.cmd['mode'], + ngpu=self.cmd['ngpu'], + log_level=self.cmd['log_level'], + dtype=self.cmd['dtype'], + seed=self.cmd['seed'], + output_dir=self.output_dir, + batch_size=self.cmd['batch_size'], + num_workers=self.cmd['num_workers'], + key_file=self.cmd['key_file'], + train_config=self.cmd['train_config'], + model_file=self.cmd['model_file']) + + def __call__(self, text_in: str = None) -> Dict[str, Any]: + if len(text_in) == 0: + raise ValueError('The input of punctuation should not be null.') + else: + self.text_in = text_in + + output = self.forward(self.text_in) + result = self.postprocess(output) + return result + + def postprocess(self, inputs: list) -> Dict[str, Any]: + """Postprocessing + """ + rst = {} + for i in range(len(inputs)): + if i == 0: + text = inputs[0]['value'] + if len(text) > 0: + rst[OutputKeys.TEXT] = text + else: + rst[inputs[i]['key']] = inputs[i]['value'] + return rst + + def get_cmd(self) -> Dict[str, Any]: + # generate inference command + lang = self.model_cfg['model_config']['lang'] + punc_model_path = self.model_cfg['punc_model_path'] + punc_model_config = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['punc_config']) + mode = self.model_cfg['model_config']['mode'] + cmd = { + 'mode': mode, + 'output_dir': None, + 'batch_size': 1, + 'num_workers': 1, + 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available + 'log_level': 'ERROR', + 'dtype': 'float32', + 'seed': 0, + 'key_file': None, + 'model_file': punc_model_path, + 'train_config': punc_model_config, + 'lang': lang + } + + return cmd + + def forward(self, text_in: str = None) -> list: + """Decoding + """ + logger.info('Punctuation Processing: {0} 
...'.format(text_in)) + # generate text_in + text_file, raw_inputs = generate_text_from_url(text_in) + if raw_inputs is None: + data_cmd = [(text_file, 'text', 'text')] + elif text_file is None and raw_inputs is not None: + data_cmd = None + + self.cmd['name_and_type'] = data_cmd + self.cmd['raw_inputs'] = raw_inputs + punc_result = self.run_inference(self.cmd) + + return punc_result + + def run_inference(self, cmd): + punc_result = '' + if self.framework == Frameworks.torch: + punc_result = self.funasr_infer_modelscope( + data_path_and_name_and_type=cmd['name_and_type'], + raw_inputs=cmd['raw_inputs']) + else: + raise ValueError('model type is mismatching') + + return punc_result diff --git a/modelscope/pipelines/audio/separation_pipeline.py b/modelscope/pipelines/audio/separation_pipeline.py new file mode 100644 index 00000000..884f7f03 --- /dev/null +++ b/modelscope/pipelines/audio/separation_pipeline.py @@ -0,0 +1,71 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +from typing import Any, Dict + +import numpy +import soundfile as sf +import torch + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.models.base import Input +from modelscope.outputs import OutputKeys +from modelscope.pipelines import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speech_separation, module_name=Pipelines.speech_separation) +class SeparationPipeline(Pipeline): + + def __init__(self, model, **kwargs): + """create a speech separation pipeline for prediction + + Args: + model: model id on modelscope hub. + """ + logger.info('loading model...') + super().__init__(model=model, **kwargs) + self.model.load_check_point(device=self.device) + self.model.eval() + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + if isinstance(inputs, str): + file_bytes = File.read(inputs) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if fs != 8000: + raise ValueError( + 'modelscope error: The audio sample rate should be 8000') + elif isinstance(inputs, bytes): + data = torch.from_numpy( + numpy.frombuffer(inputs, dtype=numpy.float32)) + return dict(data=data) + + def postprocess(self, inputs: Dict[str, Any], + **post_params) -> Dict[str, Any]: + return inputs + + def forward( + self, inputs: Dict[str, Any], **forward_params + ) -> Dict[str, Any]: # mix, targets, stage, noise=None): + """Forward computations from the mixture to the separated signals.""" + logger.info('Start forward...') + # Unpack lists and put tensors in the right device + mix = inputs['data'].to(self.device) + mix = torch.unsqueeze(mix, dim=1).transpose(0, 1) + est_source = self.model(mix) + result = [] + for ns in range(self.model.num_spks): + signal = est_source[0, :, ns] + signal = signal / signal.abs().max() * 0.5 + signal = signal.unsqueeze(0).cpu() + # convert tensor to pcm + output = (signal.numpy() * 32768).astype(numpy.int16).tobytes() + result.append(output) + logger.info('Finish forward.') + return {OutputKeys.OUTPUT_PCM_LIST: result} diff --git a/modelscope/pipelines/audio/speaker_verification_pipeline.py b/modelscope/pipelines/audio/speaker_verification_pipeline.py new file mode 100644 index 00000000..ed63dbcd --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_pipeline.py @@ -0,0 +1,162 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
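+#
+# Usage note (inferred from forward()/postprocess() below, hedged): passing a
+# tuple of two audio inputs (urls/paths or bytes) runs a 1:1 comparison and the
+# similarity score is returned under OutputKeys.SCORES, while passing a single
+# audio input returns the speaker embedding under OutputKeys.SPK_EMBEDDING.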
+import os +import shutil +from typing import Any, Dict, List, Sequence, Tuple, Union + +import yaml + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.audio.audio_utils import (generate_scp_for_sv, + generate_sv_scp_from_url) +from modelscope.utils.constant import Frameworks, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['SpeakerVerificationPipeline'] + + +@PIPELINES.register_module( + Tasks.speaker_verification, module_name=Pipelines.sv_inference) +class SpeakerVerificationPipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_punc = pipeline( + >>> task=Tasks.speaker_verification, model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch') + >>> audio_in=('','') + >>> print(pipeline_punc(audio_in)) + + """ + + def __init__(self, model: Union[Model, str] = None, **kwargs): + """use `model` to create an asr pipeline for prediction + """ + super().__init__(model=model, **kwargs) + self.model_cfg = self.model.forward() + self.cmd = self.get_cmd() + + from funasr.bin import sv_inference_launch + self.funasr_infer_modelscope = sv_inference_launch.inference_launch( + mode=self.cmd['mode'], + ngpu=self.cmd['ngpu'], + log_level=self.cmd['log_level'], + dtype=self.cmd['dtype'], + seed=self.cmd['seed'], + sv_train_config=self.cmd['sv_train_config'], + sv_model_file=self.cmd['sv_model_file'], + output_dir=self.cmd['output_dir'], + batch_size=self.cmd['batch_size'], + num_workers=self.cmd['num_workers'], + key_file=self.cmd['key_file'], + model_tag=self.cmd['model_tag']) + + def __call__(self, + audio_in: Union[tuple, str, Any] = None) -> Dict[str, Any]: + if len(audio_in) == 0: + raise ValueError('The input of ITN should not be null.') + else: + self.audio_in = audio_in + + output = self.forward(self.audio_in) + result = self.postprocess(output) + return result + + def postprocess(self, inputs: list) -> Dict[str, Any]: + """Postprocessing + """ + rst = {} + for i in range(len(inputs)): + if i == 0: + if isinstance(self.audio_in, tuple): + score = inputs[0]['value'] + rst[OutputKeys.SCORES] = score + else: + embedding = inputs[0]['value'] + rst[OutputKeys.SPK_EMBEDDING] = embedding + else: + rst[inputs[i]['key']] = inputs[i]['value'] + return rst + + def get_cmd(self) -> Dict[str, Any]: + # generate asr inference command + mode = self.model_cfg['model_config']['mode'] + sv_model_path = self.model_cfg['sv_model_path'] + sv_model_config = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['sv_model_config']) + cmd = { + 'mode': mode, + 'output_dir': None, + 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available + 'batch_size': 1, + 'num_workers': 1, + 'log_level': 'ERROR', + 'dtype': 'float32', + 'seed': 0, + 'key_file': None, + 'sv_model_file': sv_model_path, + 'sv_train_config': sv_model_config, + 'model_tag': None + } + return cmd + + def forward(self, audio_in: Union[tuple, str, Any] = None) -> list: + """Decoding + """ + logger.info( + 'Speaker Verification 
Processing: {0} ...'.format(audio_in)) + + data_cmd, raw_inputs = None, None + if isinstance(audio_in, tuple): + # generate audio_scp + if isinstance(audio_in[0], str): + audio_scp_1, audio_scp_2 = generate_sv_scp_from_url(audio_in) + data_cmd = [(audio_scp_1, 'speech', 'sound'), + (audio_scp_2, 'ref_speech', 'sound')] + elif isinstance(audio_in[0], bytes): + data_cmd = [(audio_in[0], 'speech', 'bytes'), + (audio_in[1], 'ref_speech', 'bytes')] + else: + raise TypeError('Unsupported data type.') + else: + if isinstance(audio_in, str): + audio_scp = generate_scp_for_sv(audio_in) + data_cmd = [(audio_scp, 'speech', 'sound')] + elif isinstance(audio_in[0], bytes): + data_cmd = [(audio_in, 'speech', 'bytes')] + else: + import torch + import numpy as np + if isinstance(audio_in, torch.Tensor): + raw_inputs = audio_in + elif isinstance(audio_in, np.ndarray): + raw_inputs = audio_in + else: + raise TypeError('Unsupported data type.') + + self.cmd['name_and_type'] = data_cmd + self.cmd['raw_inputs'] = raw_inputs + punc_result = self.run_inference(self.cmd) + + return punc_result + + def run_inference(self, cmd): + if self.framework == Frameworks.torch: + sv_result = self.funasr_infer_modelscope( + data_path_and_name_and_type=cmd['name_and_type'], + raw_inputs=cmd['raw_inputs']) + else: + raise ValueError('model type is mismatching') + + return sv_result diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index c49fe7dc..be16fd5d 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -39,6 +39,8 @@ logger = get_logger() class Pipeline(ABC): + """Pipeline base. + """ def initiate_single_model(self, model): if isinstance(model, str): @@ -94,6 +96,7 @@ class Pipeline(ABC): if config_file is not None: self.cfg = Config.from_file(config_file) + model_dir = os.path.dirname(config_file) elif not self.has_multiple_models: if isinstance(self.model, str): model_dir = self.model @@ -101,8 +104,7 @@ class Pipeline(ABC): model_dir = self.model.model_dir self.cfg = read_config(model_dir) - if preprocessor is None and not self.has_multiple_models \ - and hasattr(self.cfg, 'preprocessor'): + if preprocessor is None and not self.has_multiple_models: self.preprocessor = Preprocessor.from_pretrained(model_dir) else: self.preprocessor = preprocessor @@ -273,6 +275,7 @@ class Pipeline(ABC): **forward_params) else: batched_out = self.forward(batched_input, **forward_params) + for batch_idx in range(real_batch_size): out = {} for k, element in batched_out.items(): @@ -383,11 +386,10 @@ class DistributedPipeline(Pipeline): 2. Set the multiprocessing method to spawn 3. Open a multiprocessing pool of the world_size to instantiate model pieces. 4. Set the master port and ip - 5. Call _instantiate_one to instantiate one model piece - This method should be implemented by the derived class. - 6. After the forward method is called, do preprocess in main process - and call _forward_one to collect results, and do - post process in main process. + 5. Call _instantiate_one to instantiate one model piece, + This method should be implemented by the derived class. + 6. After the forward method is called, do preprocess in main process and + call _forward_one to collect results, and do post process in main process. NOTE: _instantiate_one and _forward_one are class methods, any derived class should implement them and store the model handler in the class field. 
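    A minimal sketch of a derived class (illustrative only; the signatures are
    indicative and `load_model_shard` is a hypothetical helper, not a framework
    API):

    >>> class MyDistributedPipeline(DistributedPipeline):
    >>>     @classmethod
    >>>     def _instantiate_one(cls, rank, model_dir, **kwargs):
    >>>         # store the model piece in a class field, as noted above
    >>>         cls.model = load_model_shard(model_dir, rank)
    >>>
    >>>     @classmethod
    >>>     def _forward_one(cls, inputs):
    >>>         return cls.model(**inputs)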
@@ -416,8 +418,7 @@ class DistributedPipeline(Pipeline): self.device = create_device(self.device_name) self.has_multiple_models = False self.framework = self.cfg.framework - if torch.multiprocessing.get_start_method(allow_none=True) is None: - torch.multiprocessing.set_start_method('spawn') + torch.multiprocessing.set_start_method('spawn', force=True) ranks = list(range(self.world_size)) self.model_pool = Pool(self.world_size) @@ -500,7 +501,10 @@ def collate_fn(data, device): """ from torch.utils.data.dataloader import default_collate - from modelscope.preprocessors.nlp import InputFeatures + + def get_class_name(obj): + return obj.__class__.__name__ + if isinstance(data, dict) or isinstance(data, Mapping): # add compatibility for img_metas for mmlab models return type(data)({ @@ -523,11 +527,11 @@ def collate_fn(data, device): return data.to(device) elif isinstance(data, (bytes, str, int, float, bool, type(None))): return data - elif isinstance(data, InputFeatures): + elif get_class_name(data) == 'InputFeatures': + # modelscope.preprocessors.nlp.InputFeatures + return data + elif get_class_name(data) == 'DataContainer': + # mmcv.parallel.DataContainer return data else: - from mmcv.parallel import DataContainer - if isinstance(data, DataContainer): - return data - else: - raise ValueError(f'Unsupported data type {type(data)}') + raise ValueError(f'Unsupported data type {type(data)}') diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index fb1c53da..951d201c 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -22,6 +22,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/nlp_corom_sentence-embedding_english-base'), Tasks.text_ranking: (Pipelines.text_ranking, 'damo/nlp_corom_passage-ranking_english-base'), + Tasks.text_ranking: (Pipelines.mgeo_ranking, + 'damo/mgeo_address_ranking_chinese_base'), Tasks.word_segmentation: (Pipelines.word_segmentation, 'damo/nlp_structbert_word-segmentation_chinese-base'), @@ -57,6 +59,13 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_vit_object-detection_coco'), Tasks.image_denoising: (Pipelines.image_denoise, 'damo/cv_nafnet_image-denoise_sidd'), + Tasks.image_deblurring: (Pipelines.image_deblur, + 'damo/cv_nafnet_image-deblur_gopro'), + Tasks.video_stabilization: (Pipelines.video_stabilization, + 'damo/cv_dut-raft_video-stabilization_base'), + Tasks.video_super_resolution: + (Pipelines.video_super_resolution, + 'damo/cv_realbasicvsr_video-super-resolution_videolq'), Tasks.text_classification: (Pipelines.sentiment_classification, 'damo/nlp_structbert_sentiment-classification_chinese-base'), @@ -77,6 +86,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/nlp_bart_text-error-correction_chinese'), Tasks.image_captioning: (Pipelines.image_captioning, 'damo/ofa_image-caption_coco_large_en'), + Tasks.video_captioning: + (Pipelines.video_captioning, + 'damo/multi-modal_hitea_video-captioning_base_en'), Tasks.image_portrait_stylization: (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), @@ -85,6 +97,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.table_recognition: (Pipelines.table_recognition, 'damo/cv_dla34_table-structure-recognition_cycle-centernet'), + Tasks.document_vl_embedding: + (Pipelines.document_vl_embedding, + 'damo/multi-modal_convnext-roberta-base_vldoc-embedding'), Tasks.license_plate_detection: (Pipelines.license_plate_detection, 'damo/cv_resnet18_license-plate-detection_damo'), @@ -111,6 +126,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.visual_question_answering: 
(Pipelines.visual_question_answering, 'damo/mplug_visual-question-answering_coco_large_en'), + Tasks.video_question_answering: + (Pipelines.video_question_answering, + 'damo/multi-modal_hitea_video-question-answering_base_en'), Tasks.video_embedding: (Pipelines.cmdssl_video_embedding, 'damo/cv_r2p1d_video_embedding'), Tasks.text_to_image_synthesis: @@ -128,6 +146,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.face_detection: (Pipelines.mog_face_detection, 'damo/cv_resnet101_face-detection_cvpr22papermogface'), + Tasks.face_liveness: (Pipelines.face_liveness_ir, + 'damo/cv_manual_face-liveness_flir'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), Tasks.facial_expression_recognition: @@ -149,14 +169,23 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_csrnet_image-color-enhance-models'), Tasks.virtual_try_on: (Pipelines.virtual_try_on, 'damo/cv_daflow_virtual-try-on_base'), - Tasks.image_colorization: (Pipelines.image_colorization, - 'damo/cv_unet_image-colorization'), + Tasks.image_colorization: (Pipelines.ddcolor_image_colorization, + 'damo/cv_ddcolor_image-colorization'), Tasks.image_segmentation: (Pipelines.image_instance_segmentation, 'damo/cv_swin-b_image-instance-segmentation_coco'), Tasks.image_depth_estimation: (Pipelines.image_depth_estimation, 'damo/cv_newcrfs_image-depth-estimation_indoor'), + Tasks.indoor_layout_estimation: + (Pipelines.indoor_layout_estimation, + 'damo/cv_panovit_indoor-layout-estimation'), + Tasks.video_depth_estimation: + (Pipelines.video_depth_estimation, + 'damo/cv_dro-resnet18_video-depth-estimation_indoor'), + Tasks.panorama_depth_estimation: + (Pipelines.panorama_depth_estimation, + 'damo/cv_unifuse_panorama-depth-estimation'), Tasks.image_style_transfer: (Pipelines.image_style_transfer, 'damo/cv_aams_style-transfer_damo'), Tasks.face_image_generation: (Pipelines.face_image_generation, @@ -207,6 +236,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_video-inpainting'), Tasks.video_human_matting: (Pipelines.video_human_matting, 'damo/cv_effnetv2_video-human-matting'), + Tasks.video_frame_interpolation: + (Pipelines.video_frame_interpolation, + 'damo/cv_raft_video-frame-interpolation'), Tasks.human_wholebody_keypoint: (Pipelines.human_wholebody_keypoint, 'damo/cv_hrnetw48_human-wholebody-keypoint_image'), @@ -218,19 +250,35 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'), Tasks.product_segmentation: (Pipelines.product_segmentation, 'damo/cv_F3Net_product-segmentation'), - Tasks.referring_video_object_segmentation: - (Pipelines.referring_video_object_segmentation, - 'damo/cv_swin-t_referring_video-object-segmentation'), + Tasks.referring_video_object_segmentation: ( + Pipelines.referring_video_object_segmentation, + 'damo/cv_swin-t_referring_video-object-segmentation'), Tasks.video_summarization: (Pipelines.video_summarization, 'damo/cv_googlenet_pgl-video-summarization'), Tasks.image_skychange: (Pipelines.image_skychange, 'damo/cv_hrnetocr_skychange'), - Tasks.translation_evaluation: - (Pipelines.translation_evaluation, - 'damo/nlp_unite_mup_translation_evaluation_multilingual_large'), - Tasks.video_object_segmentation: - (Pipelines.video_object_segmentation, - 'damo/cv_rdevos_video-object-segmentation'), + Tasks.translation_evaluation: ( + Pipelines.translation_evaluation, + 'damo/nlp_unite_mup_translation_evaluation_multilingual_large'), + Tasks.video_object_segmentation: ( + Pipelines.video_object_segmentation, + 'damo/cv_rdevos_video-object-segmentation'), + 
Tasks.video_multi_object_tracking: ( + Pipelines.video_multi_object_tracking, + 'damo/cv_yolov5_video-multi-object-tracking_fairmot'), + Tasks.image_multi_view_depth_estimation: ( + Pipelines.image_multi_view_depth_estimation, + 'damo/cv_casmvs_multi-view-depth-estimation_general'), + Tasks.image_fewshot_detection: ( + Pipelines.image_fewshot_detection, + 'damo/cv_resnet101_detection_fewshot-defrcn'), + Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, + 'damo/cv_flow-based-body-reshaping_damo'), + Tasks.image_face_fusion: (Pipelines.image_face_fusion, + 'damo/cv_unet-image-face-fusion_damo'), + Tasks.image_matching: ( + Pipelines.image_matching, + 'damo/cv_quadtree_attention_image-matching_outdoor'), } @@ -281,6 +329,7 @@ def pipeline(task: str = None, framework: str = None, device: str = 'gpu', model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + plugins: List[str] = None, **kwargs) -> Pipeline: """ Factory method to build an obj:`Pipeline`. @@ -314,6 +363,8 @@ def pipeline(task: str = None, if task is None and pipeline_name is None: raise ValueError('task or pipeline_name is required') + try_import_plugins(plugins) + model = normalize_model_input(model, model_revision) pipeline_props = {'type': pipeline_name} if pipeline_name is None: @@ -327,6 +378,7 @@ def pipeline(task: str = None, model, str) else read_config( model[0], revision=model_revision) check_config(cfg) + try_import_plugins(cfg.safe_get('plugins')) pipeline_props = cfg.pipeline elif model is not None: # get pipeline info from Model object @@ -335,6 +387,7 @@ def pipeline(task: str = None, # model is instantiated by user, we should parse config again cfg = read_config(first_model.model_dir) check_config(cfg) + try_import_plugins(cfg.safe_get('plugins')) first_model.pipeline = cfg.pipeline pipeline_props = first_model.pipeline else: @@ -392,3 +445,10 @@ def get_default_pipeline_info(task): else: pipeline_name, default_model = DEFAULT_MODEL_FOR_PIPELINE[task] return pipeline_name, default_model + + +def try_import_plugins(plugins: List[str]) -> None: + """ Try to import plugins """ + if plugins is not None: + from modelscope.utils.plugins import import_plugins + import_plugins(plugins) diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index cff2138d..c9666398 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: from .face_detection_pipeline import FaceDetectionPipeline from .face_image_generation_pipeline import FaceImageGenerationPipeline from .face_recognition_pipeline import FaceRecognitionPipeline + from .face_recognition_ood_pipeline import FaceRecognitionOodPipeline from .arc_face_recognition_pipeline import ArcFaceRecognitionPipeline from .mask_face_recognition_pipeline import MaskFaceRecognitionPipeline from .general_recognition_pipeline import GeneralRecognitionPipeline @@ -27,9 +28,11 @@ if TYPE_CHECKING: from .image_colorization_pipeline import ImageColorizationPipeline from .image_classification_pipeline import ImageClassificationPipeline from .image_denoise_pipeline import ImageDenoisePipeline + from .image_deblur_pipeline import ImageDeblurPipeline from .image_instance_segmentation_pipeline import ImageInstanceSegmentationPipeline from .image_matting_pipeline import ImageMattingPipeline from .image_panoptic_segmentation_pipeline import ImagePanopticSegmentationPipeline + from .image_semantic_segmentation_pipeline import ImagePanopticSegmentationEasyCVPipeline from 
.image_portrait_enhancement_pipeline import ImagePortraitEnhancementPipeline from .image_reid_person_pipeline import ImageReidPersonPipeline from .image_semantic_segmentation_pipeline import ImageSemanticSegmentationPipeline @@ -67,8 +70,21 @@ if TYPE_CHECKING: from .hand_static_pipeline import HandStaticPipeline from .referring_video_object_segmentation_pipeline import ReferringVideoObjectSegmentationPipeline from .language_guided_video_summarization_pipeline import LanguageGuidedVideoSummarizationPipeline + from .vision_middleware_pipeline import VisionMiddlewarePipeline + from .video_frame_interpolation_pipeline import VideoFrameInterpolationPipeline from .image_skychange_pipeline import ImageSkychangePipeline + from .vop_retrieval_pipeline import VopRetrievalPipeline from .video_object_segmentation_pipeline import VideoObjectSegmentationPipeline + from .image_matching_pipeline import ImageMatchingPipeline + from .video_stabilization_pipeline import VideoStabilizationPipeline + from .video_super_resolution_pipeline import VideoSuperResolutionPipeline + from .pointcloud_sceneflow_estimation_pipeline import PointCloudSceneFlowEstimationPipeline + from .face_liveness_ir_pipeline import FaceLivenessIrPipeline + from .maskdino_instance_segmentation_pipeline import MaskDINOInstanceSegmentationPipeline + from .image_mvs_depth_estimation_pipeline import ImageMultiViewDepthEstimationPipeline + from .panorama_depth_estimation_pipeline import PanoramaDepthEstimationPipeline + from .ddcolor_image_colorization_pipeline import DDColorImageColorizationPipeline + from .image_defrcn_fewshot_pipeline import ImageDefrcnDetectionPipeline else: _import_structure = { @@ -86,6 +102,7 @@ else: 'face_detection_pipeline': ['FaceDetectionPipeline'], 'face_image_generation_pipeline': ['FaceImageGenerationPipeline'], 'face_recognition_pipeline': ['FaceRecognitionPipeline'], + 'face_recognition_ood_pipeline': ['FaceRecognitionOodPipeline'], 'arc_face_recognition_pipeline': ['ArcFaceRecognitionPipeline'], 'mask_face_recognition_pipeline': ['MaskFaceRecognitionPipeline'], 'general_recognition_pipeline': ['GeneralRecognitionPipeline'], @@ -93,13 +110,16 @@ else: ['GeneralImageClassificationPipeline', 'ImageClassificationPipeline'], 'image_cartoon_pipeline': ['ImageCartoonPipeline'], 'image_denoise_pipeline': ['ImageDenoisePipeline'], + 'image_deblur_pipeline': ['ImageDeblurPipeline'], 'image_color_enhance_pipeline': ['ImageColorEnhancePipeline'], 'image_colorization_pipeline': ['ImageColorizationPipeline'], 'image_instance_segmentation_pipeline': ['ImageInstanceSegmentationPipeline'], 'image_matting_pipeline': ['ImageMattingPipeline'], - 'image_panoptic_segmentation_pipeline': - ['ImagePanopticSegmentationPipeline'], + 'image_panoptic_segmentation_pipeline': [ + 'ImagePanopticSegmentationPipeline', + 'ImagePanopticSegmentationEasyCVPipeline' + ], 'image_portrait_enhancement_pipeline': ['ImagePortraitEnhancementPipeline'], 'image_reid_person_pipeline': ['ImageReidPersonPipeline'], @@ -141,9 +161,8 @@ else: 'retina_face_detection_pipeline': ['RetinaFaceDetectionPipeline'], 'facial_expression_recognition_pipeline': ['FacialExpressionRecognitionPipeline'], - 'facial_landmark_confidence_pipeline': [ - 'FacialLandmarkConfidencePipeline' - ], + 'facial_landmark_confidence_pipeline': + ['FacialLandmarkConfidencePipeline'], 'face_processing_base_pipeline': ['FaceProcessingBasePipeline'], 'face_attribute_recognition_pipeline': [ 'FaceAttributeRecognitionPipeline' @@ -156,10 +175,32 @@ else: 
'language_guided_video_summarization_pipeline': [ 'LanguageGuidedVideoSummarizationPipeline' ], + 'vision_middleware_pipeline': ['VisionMiddlewarePipeline'], + 'video_frame_interpolation_pipeline': [ + 'VideoFrameInterpolationPipeline' + ], 'image_skychange_pipeline': ['ImageSkychangePipeline'], + 'vop_retrieval_pipeline': ['VopRetrievalPipeline'], 'video_object_segmentation_pipeline': [ 'VideoObjectSegmentationPipeline' ], + 'image_matching_pipeline': ['ImageMatchingPipeline'], + 'video_stabilization_pipeline': ['VideoStabilizationPipeline'], + 'video_super_resolution_pipeline': ['VideoSuperResolutionPipeline'], + 'pointcloud_sceneflow_estimation_pipeline': [ + 'PointCloudSceneFlowEstimationPipeline' + ], + 'face_liveness_ir_pipeline': ['FaceLivenessIrPipeline'], + 'maskdino_instance_segmentation_pipeline': [ + 'MaskDINOInstanceSegmentationPipeline' + ], + 'image_mvs_depth_estimation_pipeline': [ + 'ImageMultiViewDepthEstimationPipeline' + ], + 'ddcolor_image_colorization_pipeline': [ + 'DDColorImageColorizationPipeline' + ], + 'image_defrcn_fewshot_pipeline': ['ImageDefrcnDetectionPipeline'], } import sys diff --git a/modelscope/pipelines/cv/arc_face_recognition_pipeline.py b/modelscope/pipelines/cv/arc_face_recognition_pipeline.py index 241dd39f..72ebffc8 100644 --- a/modelscope/pipelines/cv/arc_face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/arc_face_recognition_pipeline.py @@ -47,7 +47,7 @@ class ArcFaceRecognitionPipeline(FaceProcessingBasePipeline): logger.info('face recognition model loaded!') def preprocess(self, input: Input) -> Dict[str, Any]: - result = super(ArcFaceRecognitionPipeline, self).preprocess(input) + result = super().preprocess(input) align_img = result['img'] face_img = align_img[:, :, ::-1] # to rgb face_img = np.transpose(face_img, axes=(2, 0, 1)) diff --git a/modelscope/pipelines/cv/ddcolor_image_colorization_pipeline.py b/modelscope/pipelines/cv/ddcolor_image_colorization_pipeline.py new file mode 100644 index 00000000..43e505fd --- /dev/null +++ b/modelscope/pipelines/cv/ddcolor_image_colorization_pipeline.py @@ -0,0 +1,168 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from torchvision import transforms + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_colorization import DDColorForImageColorization +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_colorization, module_name=Pipelines.ddcolor_image_colorization) +class DDColorImageColorizationPipeline(Pipeline): + """ DDColor Image Colorization Pipeline. 
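+    In brief (summarizing preprocess/forward below): the input image is converted
+    to Lab color space, its L channel is resized and fed to the model, and the
+    predicted ab channels are resized back to the original resolution and
+    recombined with the original L channel to produce the colorized output image.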
+ + Example: + + ```python + >>> from modelscope.pipelines import pipeline + + >>> colorizer = pipeline('image-colorization', 'damo/cv_ddcolor_image-colorization') + >>> colorizer("data/test/images/audrey_hepburn.jpg") + {'output_img': array([[[198, 199, 193], + [198, 199, 193], + [197, 199, 195], + ..., + [197, 213, 206], + [197, 213, 206], + [197, 213, 207]], + + [[198, 199, 193], + [198, 199, 193], + [197, 199, 195], + ..., + [196, 212, 205], + [196, 212, 205], + [196, 212, 206]], + + [[198, 199, 193], + [198, 199, 193], + [197, 199, 195], + ..., + [193, 209, 202], + [193, 209, 202], + [193, 209, 203]], + + ..., + + [[ 56, 72, 103], + [ 56, 72, 103], + [ 56, 72, 102], + ..., + [233, 231, 232], + [233, 231, 232], + [233, 231, 232]], + + [[ 51, 62, 91], + [ 52, 63, 92], + [ 52, 64, 92], + ..., + [233, 232, 231], + [233, 232, 231], + [232, 232, 229]], + + [[ 60, 72, 101], + [ 59, 71, 100], + [ 57, 70, 99], + ..., + [233, 232, 231], + [233, 232, 231], + [232, 232, 229]]], dtype=uint8)} + >>> # + ``` + """ + + def __init__(self, model: Union[DDColorForImageColorization, str], + **kwargs): + """ + use `model` to create an image colorization pipeline for prediction + + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + self.model.eval() + self.input_size = 512 + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + + # self.model = DDColorForImageColorization( + # model_dir=model, + # encoder_name='convnext-l', + # input_size=[self.input_size, self.input_size], + # ).to(self.device) + + # model_path = f'{model}/{ModelFile.TORCH_MODEL_FILE}' + # logger.info(f'loading model from {model_path}') + # self.model.load_state_dict( + # torch.load(model_path, map_location=torch.device('cpu'))['params'], + # strict=True) + + logger.info('load model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + """preprocess the input image, extract L-channel and convert it back to RGB + + Args: + inputs: an input image from file or url + + Returns: + Dict[str, Any]: the pre-processed image + """ + img = LoadImage.convert_to_ndarray(input) + self.height, self.width = img.shape[:2] + + img = (img / 255.0).astype(np.float32) + self.orig_l = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)[:, :, :1] + + img = cv2.resize(img, (self.input_size, self.input_size)) + img_l = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)[:, :, :1] + img_gray_lab = np.concatenate( + (img_l, np.zeros_like(img_l), np.zeros_like(img_l)), axis=-1) + img_gray_rgb = cv2.cvtColor(img_gray_lab, cv2.COLOR_LAB2RGB) + tensor_gray_rgb = torch.from_numpy(img_gray_rgb.transpose( + (2, 0, 1))).float() + tensor_gray_rgb = tensor_gray_rgb.unsqueeze(0).to(self.device) + + result = {'img': tensor_gray_rgb} + return result + + @torch.no_grad() + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + """call model to output the predictions and concatenate it with the original L-channel + + Args: + inputs: input image tensor + + Returns: + Dict[str, Any]: the result image + """ + + output_ab = self.model(input).cpu() + + output_ab_resize = F.interpolate( + output_ab, size=(self.height, self.width)) + output_ab_resize = output_ab_resize[0].float().numpy().transpose( + 1, 2, 0) + out_lab = np.concatenate((self.orig_l, output_ab_resize), axis=-1) + out_bgr = cv2.cvtColor(out_lab, cv2.COLOR_LAB2BGR) + output_img = (out_bgr * 255.0).round().astype(np.uint8) + + return {OutputKeys.OUTPUT_IMG: output_img} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, 
Any]: + return inputs diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py index 0b680ad4..2a95ebb4 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -16,7 +16,8 @@ from .base import EasyCVPipeline Tasks.image_object_detection, module_name=Pipelines.image_object_detection_auto) @PIPELINES.register_module( - Tasks.image_object_detection, module_name=Pipelines.hand_detection) + Tasks.domain_specific_object_detection, + module_name=Pipelines.hand_detection) class EasyCVDetectionPipeline(EasyCVPipeline): """Pipeline for easycv detection task.""" diff --git a/modelscope/pipelines/cv/face_attribute_recognition_pipeline.py b/modelscope/pipelines/cv/face_attribute_recognition_pipeline.py index ddf3bc5d..f7645aa5 100644 --- a/modelscope/pipelines/cv/face_attribute_recognition_pipeline.py +++ b/modelscope/pipelines/cv/face_attribute_recognition_pipeline.py @@ -18,6 +18,7 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger +from . import FaceProcessingBasePipeline logger = get_logger() @@ -25,7 +26,7 @@ logger = get_logger() @PIPELINES.register_module( Tasks.face_attribute_recognition, module_name=Pipelines.face_attribute_recognition) -class FaceAttributeRecognitionPipeline(Pipeline): +class FaceAttributeRecognitionPipeline(FaceProcessingBasePipeline): def __init__(self, model: str, **kwargs): """ @@ -44,82 +45,15 @@ class FaceAttributeRecognitionPipeline(Pipeline): self.device = device logger.info('load model done') - # face detect pipeline - det_model_id = 'damo/cv_resnet50_face-detection_retinaface' male_list = ['Male', 'Female'] age_list = [ '0-2', '3-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+' ] self.map_list = [male_list, age_list] - self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id) - - def _choose_face(self, - det_result, - min_face=10, - top_face=1, - center_face=False): - ''' - choose face with maximum area - Args: - det_result: output of face detection pipeline - min_face: minimum size of valid face w/h - top_face: take faces with top max areas - center_face: choose the most centerd face from multi faces, only valid if top_face > 1 - ''' - bboxes = np.array(det_result[OutputKeys.BOXES]) - landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) - if bboxes.shape[0] == 0: - logger.info('Warning: No face detected!') - return None - # face idx with enough size - face_idx = [] - for i in range(bboxes.shape[0]): - box = bboxes[i] - if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: - face_idx += [i] - if len(face_idx) == 0: - logger.info( - f'Warning: Face size not enough, less than {min_face}x{min_face}!' 
- ) - return None - bboxes = bboxes[face_idx] - landmarks = landmarks[face_idx] - # find max faces - boxes = np.array(bboxes) - area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - sort_idx = np.argsort(area)[-top_face:] - # find center face - if top_face > 1 and center_face and bboxes.shape[0] > 1: - img_center = [img.shape[1] // 2, img.shape[0] // 2] - min_dist = float('inf') - sel_idx = -1 - for _idx in sort_idx: - box = boxes[_idx] - dist = np.square( - np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square( - np.abs((box[1] + box[3]) / 2 - img_center[1])) - if dist < min_dist: - min_dist = dist - sel_idx = _idx - sort_idx = [sel_idx] - main_idx = sort_idx[-1] - return bboxes[main_idx], landmarks[main_idx] def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_ndarray(input) - img = img[:, :, ::-1] - det_result = self.face_detection(img.copy()) - rtn = self._choose_face(det_result) - face_img = None - if rtn is not None: - _, face_lmks = rtn - face_lmks = face_lmks.reshape(5, 2) - face_img, _ = align_face(img, (112, 112), face_lmks) - face_img = face_img.astype(np.float32) - result = {} - result['img'] = face_img + result = super().preprocess(input) return result def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/pipelines/cv/face_liveness_ir_pipeline.py b/modelscope/pipelines/cv/face_liveness_ir_pipeline.py new file mode 100644 index 00000000..a54d4577 --- /dev/null +++ b/modelscope/pipelines/cv/face_liveness_ir_pipeline.py @@ -0,0 +1,84 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import onnxruntime +import PIL +import torch +import torch.nn.functional as F + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_recognition.align_face import align_face +from modelscope.models.cv.facial_landmark_confidence import \ + FacialLandmarkConfidence +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from . import FaceProcessingBasePipeline + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.face_liveness, module_name=Pipelines.face_liveness_ir) +class FaceLivenessIrPipeline(FaceProcessingBasePipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face lievness ir pipeline for prediction + Args: + model: model id on modelscope hub. 
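For orientation, a minimal usage sketch for the IR face liveness pipeline added above; the model id and image path below are placeholders, not values taken from this change:

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<face-liveness-ir-model-id>' and 'ir_face.jpg' are placeholders.
liveness = pipeline(Tasks.face_liveness, model='<face-liveness-ir-model-id>')
result = liveness('ir_face.jpg')
print(result[OutputKeys.SCORES], result[OutputKeys.BOXES])  # liveness score and detected face box
```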
+ """ + super().__init__(model=model, **kwargs) + onnx_path = osp.join(model, ModelFile.ONNX_MODEL_FILE) + logger.info(f'loading model from {onnx_path}') + self.sess, self.input_node_name, self.out_node_name = self.load_onnx_model( + onnx_path) + logger.info('load model done') + + def load_onnx_model(self, onnx_path): + sess = onnxruntime.InferenceSession(onnx_path) + out_node_name = [] + input_node_name = [] + for node in sess.get_outputs(): + out_node_name.append(node.name) + + for node in sess.get_inputs(): + input_node_name.append(node.name) + + return sess, input_node_name, out_node_name + + def preprocess(self, input: Input) -> Dict[str, Any]: + + result = super().preprocess(input) + orig_img = LoadImage.convert_to_ndarray(input) + orig_img = orig_img[:, :, ::-1] + img = super(FaceLivenessIrPipeline, + self).align_face_padding(orig_img, result['bbox'], 16) + if img.shape[0] != 112: + img = img[8:120, 8:120, :] + img = (img - 127.5) * 0.0078125 + input_tensor = img.astype('float32').transpose( + (2, 0, 1))[np.newaxis, :] + result['input_tensor'] = input_tensor + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + input_feed = {} + input_feed[ + self.input_node_name[0]] = input['input_tensor'].cpu().numpy() + result = self.sess.run(self.out_node_name, input_feed=input_feed) + out = F.softmax(torch.FloatTensor(result), dim=-1)[0][0] + assert result is not None + scores = [1 - out[1].tolist()] + boxes = input['bbox'].cpu().numpy()[np.newaxis, :].tolist() + return {OutputKeys.SCORES: scores, OutputKeys.BOXES: boxes} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/face_processing_base_pipeline.py b/modelscope/pipelines/cv/face_processing_base_pipeline.py index 2a732171..bb6f0397 100644 --- a/modelscope/pipelines/cv/face_processing_base_pipeline.py +++ b/modelscope/pipelines/cv/face_processing_base_pipeline.py @@ -40,7 +40,8 @@ class FaceProcessingBasePipeline(Pipeline): det_result, min_face=10, top_face=1, - center_face=False): + center_face=False, + img_shape=None): ''' choose face with maximum area Args: @@ -74,8 +75,8 @@ class FaceProcessingBasePipeline(Pipeline): area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) sort_idx = np.argsort(area)[-top_face:] # find center face - if top_face > 1 and center_face and bboxes.shape[0] > 1: - img_center = [img.shape[1] // 2, img.shape[0] // 2] + if top_face > 1 and center_face and bboxes.shape[0] > 1 and img_shape: + img_center = [img_shape[1] // 2, img_shape[0] // 2] min_dist = float('inf') sel_idx = -1 for _idx in sort_idx: @@ -94,7 +95,7 @@ class FaceProcessingBasePipeline(Pipeline): img = LoadImage.convert_to_ndarray(input) img = img[:, :, ::-1] det_result = self.face_detection(img.copy()) - rtn = self._choose_face(det_result) + rtn = self._choose_face(det_result, img_shape=img.shape) if rtn is not None: scores, bboxes, face_lmks = rtn face_lmks = face_lmks.reshape(5, 2) @@ -107,6 +108,85 @@ class FaceProcessingBasePipeline(Pipeline): result['lmks'] = face_lmks return result + def align_face_padding(self, img, rect, padding_size=16, pad_pixel=127): + rect = np.reshape(rect, (-1, 4)) + if img is None: + return None + if img.ndim == 2: + w, h = img.shape + ret = np.empty((w, h, 3), dtype=np.uint8) + ret[:, :, 0] = ret[:, :, 1] = ret[:, :, 2] = img + img = ret + img = img[:, :, 0:3] + img = img[..., ::-1] + nrof = np.zeros((5, ), dtype=np.int32) + + bounding_boxes = rect + nrof_faces = bounding_boxes.shape[0] + if nrof_faces > 0: + 
det = bounding_boxes[:, 0:4] + img_size = np.asarray(img.shape)[0:2] + bindex = 0 + if nrof_faces > 1: + img_center = img_size / 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + bindex = np.argmax(0 - offset_dist_squared * 2.0) + _bbox = bounding_boxes[bindex, 0:4] + nrof[0] += 1 + else: + nrof[1] += 1 + if _bbox is None: + nrof[2] += 1 + return None + _bbox = [int(_bbox[0]), int(_bbox[1]), int(_bbox[2]), int(_bbox[3])] + x1 = _bbox[0] - int( + (_bbox[2] - _bbox[0] + 1) * padding_size * 1.0 / 112) + x2 = _bbox[2] + int( + (_bbox[2] - _bbox[0] + 1) * padding_size * 1.0 / 112) + y1 = _bbox[1] - int( + (_bbox[3] - _bbox[1] + 1) * padding_size * 1.0 / 112) + y2 = _bbox[3] + int( + (_bbox[3] - _bbox[1] + 1) * padding_size * 1.0 / 112) + _bbox[0] = max(0, x1) + _bbox[1] = max(0, y1) + _bbox[2] = min(img.shape[1] - 1, x2) + _bbox[3] = min(img.shape[0] - 1, y2) + padding_h = _bbox[3] - _bbox[1] + 1 + padding_w = _bbox[2] - _bbox[0] + 1 + if padding_w > padding_h: + offset = int((padding_w - padding_h) / 2) + _bbox[1] = _bbox[1] - offset + _bbox[3] = _bbox[1] + padding_w - 1 + _bbox[1] = max(0, _bbox[1]) + _bbox[3] = min(img.shape[0] - 1, _bbox[3]) + dst_size = padding_w + else: + offset = int((padding_h - padding_w) / 2) + _bbox[0] = _bbox[0] - offset + _bbox[2] = _bbox[0] + padding_h - 1 + _bbox[0] = max(0, _bbox[0]) + _bbox[2] = min(img.shape[1] - 1, _bbox[2]) + dst_size = padding_h + + dst = np.full((dst_size, dst_size, 3), pad_pixel, dtype=np.uint8) + dst_x_offset = int((dst_size - (_bbox[2] - _bbox[0] + 1)) / 2) + dst_y_offset = int((dst_size - (_bbox[3] - _bbox[1] + 1)) / 2) + + y_start = dst_y_offset + y_end = dst_y_offset + _bbox[3] + 1 - _bbox[1] + x_start = dst_x_offset + x_end = dst_x_offset + _bbox[2] + 1 - _bbox[0] + dst[y_start:y_end, x_start:x_end, :] = img[_bbox[1]:_bbox[3] + 1, + _bbox[0]:_bbox[2] + 1, :] + + dst = cv2.resize(dst, (128, 128), interpolation=cv2.INTER_LINEAR) + + return dst + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: return { OutputKeys.OUTPUT_IMG: input['img'].cpu().numpy(), diff --git a/modelscope/pipelines/cv/face_recognition_ood_pipeline.py b/modelscope/pipelines/cv/face_recognition_ood_pipeline.py new file mode 100644 index 00000000..f66288d1 --- /dev/null +++ b/modelscope/pipelines/cv/face_recognition_ood_pipeline.py @@ -0,0 +1,73 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.models.cv.face_recognition.align_face import align_face +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from . import FaceProcessingBasePipeline + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.face_recognition_ood, module_name=Pipelines.face_recognition_ood) +class FaceRecognitionOodPipeline(FaceProcessingBasePipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a face recognition ood pipeline for prediction + Args: + model: model id on modelscope hub. 
+ Example: + + ```python + >>> from modelscope.pipelines import pipeline + >>> fr_ood = pipeline('face-recognition-ood', 'damo/cv_ir_face-recognition-ood_rts') + >>> fr_ood("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/face_recognition_1.png") + {'img_embedding': array([[ 0.02276129, -0.00761525, ...,0.05735306]], + dtype=float32), 'scores': [[0.7656678557395935]]} + ``` + """ + + # face recognition model + super().__init__(model=model, **kwargs) + face_model = self.model + face_model = face_model.to(self.device) + face_model.eval() + self.face_model = face_model + logger.info('face recognition model loaded!') + + def preprocess(self, input: Input) -> Dict[str, Any]: + result = super().preprocess(input) + align_img = result['img'] + face_img = align_img[:, :, ::-1] # to rgb + face_img = np.transpose(face_img, axes=(2, 0, 1)) + face_img = (face_img / 255. - 0.5) / 0.5 + face_img = face_img.astype(np.float32) + result['img'] = face_img + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + assert input['img'] is not None + img = input['img'].unsqueeze(0) + output = self.face_model(img) + emb = output[0].detach().cpu().numpy() + emb /= np.sqrt(np.sum(emb**2, -1, keepdims=True)) # l2 norm + scores = output[1].exp().detach().cpu().numpy().tolist() + return {OutputKeys.IMG_EMBEDDING: emb, OutputKeys.SCORES: scores} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py index 873e4a1f..4af5a04f 100644 --- a/modelscope/pipelines/cv/face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/face_recognition_pipeline.py @@ -17,13 +17,14 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger +from . 
import FaceProcessingBasePipeline logger = get_logger() @PIPELINES.register_module( Tasks.face_recognition, module_name=Pipelines.face_recognition) -class FaceRecognitionPipeline(Pipeline): +class FaceRecognitionPipeline(FaceProcessingBasePipeline): def __init__(self, model: str, **kwargs): """ @@ -46,78 +47,14 @@ class FaceRecognitionPipeline(Pipeline): face_model.eval() self.face_model = face_model logger.info('face recognition model loaded!') - # face detect pipeline - det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' - self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id) - - def _choose_face(self, - det_result, - min_face=10, - top_face=1, - center_face=False): - ''' - choose face with maximum area - Args: - det_result: output of face detection pipeline - min_face: minimum size of valid face w/h - top_face: take faces with top max areas - center_face: choose the most centerd face from multi faces, only valid if top_face > 1 - ''' - bboxes = np.array(det_result[OutputKeys.BOXES]) - landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) - # scores = np.array(det_result[OutputKeys.SCORES]) - if bboxes.shape[0] == 0: - logger.info('No face detected!') - return None - # face idx with enough size - face_idx = [] - for i in range(bboxes.shape[0]): - box = bboxes[i] - if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: - face_idx += [i] - if len(face_idx) == 0: - logger.info( - f'Face size not enough, less than {min_face}x{min_face}!') - return None - bboxes = bboxes[face_idx] - landmarks = landmarks[face_idx] - # find max faces - boxes = np.array(bboxes) - area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - sort_idx = np.argsort(area)[-top_face:] - # find center face - if top_face > 1 and center_face and bboxes.shape[0] > 1: - img_center = [img.shape[1] // 2, img.shape[0] // 2] - min_dist = float('inf') - sel_idx = -1 - for _idx in sort_idx: - box = boxes[_idx] - dist = np.square( - np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square( - np.abs((box[1] + box[3]) / 2 - img_center[1])) - if dist < min_dist: - min_dist = dist - sel_idx = _idx - sort_idx = [sel_idx] - main_idx = sort_idx[-1] - return bboxes[main_idx], landmarks[main_idx] def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_ndarray(input) - img = img[:, :, ::-1] - det_result = self.face_detection(img.copy()) - rtn = self._choose_face(det_result) - face_img = None - if rtn is not None: - _, face_lmks = rtn - face_lmks = face_lmks.reshape(5, 2) - align_img, _ = align_face(img, (112, 112), face_lmks) - face_img = align_img[:, :, ::-1] # to rgb - face_img = np.transpose(face_img, axes=(2, 0, 1)) - face_img = (face_img / 255. - 0.5) / 0.5 - face_img = face_img.astype(np.float32) - result = {} + result = super().preprocess(input) + align_img = result['img'] + face_img = align_img[:, :, ::-1] # to rgb + face_img = np.transpose(face_img, axes=(2, 0, 1)) + face_img = (face_img / 255. 
- 0.5) / 0.5 + face_img = face_img.astype(np.float32) result['img'] = face_img return result diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index 3c85ae62..d7e617f8 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -18,6 +18,7 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger +from . import FaceProcessingBasePipeline logger = get_logger() @@ -25,7 +26,7 @@ logger = get_logger() @PIPELINES.register_module( Tasks.facial_expression_recognition, module_name=Pipelines.facial_expression_recognition) -class FacialExpressionRecognitionPipeline(Pipeline): +class FacialExpressionRecognitionPipeline(FaceProcessingBasePipeline): def __init__(self, model: str, **kwargs): """ @@ -43,79 +44,13 @@ class FacialExpressionRecognitionPipeline(Pipeline): self.device = device logger.info('load model done') - # face detect pipeline - det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' self.map_list = [ 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' ] - self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id) - - def _choose_face(self, - det_result, - min_face=10, - top_face=1, - center_face=False): - ''' - choose face with maximum area - Args: - det_result: output of face detection pipeline - min_face: minimum size of valid face w/h - top_face: take faces with top max areas - center_face: choose the most centerd face from multi faces, only valid if top_face > 1 - ''' - bboxes = np.array(det_result[OutputKeys.BOXES]) - landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) - if bboxes.shape[0] == 0: - logger.info('Warning: No face detected!') - return None - # face idx with enough size - face_idx = [] - for i in range(bboxes.shape[0]): - box = bboxes[i] - if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: - face_idx += [i] - if len(face_idx) == 0: - logger.info( - f'Warning: Face size not enough, less than {min_face}x{min_face}!' 
- ) - return None - bboxes = bboxes[face_idx] - landmarks = landmarks[face_idx] - # find max faces - boxes = np.array(bboxes) - area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - sort_idx = np.argsort(area)[-top_face:] - # find center face - if top_face > 1 and center_face and bboxes.shape[0] > 1: - img_center = [img.shape[1] // 2, img.shape[0] // 2] - min_dist = float('inf') - sel_idx = -1 - for _idx in sort_idx: - box = boxes[_idx] - dist = np.square( - np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square( - np.abs((box[1] + box[3]) / 2 - img_center[1])) - if dist < min_dist: - min_dist = dist - sel_idx = _idx - sort_idx = [sel_idx] - main_idx = sort_idx[-1] - return bboxes[main_idx], landmarks[main_idx] def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_ndarray(input) - img = img[:, :, ::-1] - det_result = self.face_detection(img.copy()) - rtn = self._choose_face(det_result) - face_img = None - if rtn is not None: - _, face_lmks = rtn - face_lmks = face_lmks.reshape(5, 2) - face_img, _ = align_face(img, (112, 112), face_lmks) - face_img = face_img.astype(np.float32) - result = {} - result['img'] = face_img + result = super(FacialExpressionRecognitionPipeline, + self).preprocess(input) return result def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py b/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py index 26e8e733..cab8310e 100644 --- a/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py +++ b/modelscope/pipelines/cv/facial_landmark_confidence_pipeline.py @@ -44,8 +44,7 @@ class FacialLandmarkConfidencePipeline(FaceProcessingBasePipeline): def preprocess(self, input: Input) -> Dict[str, Any]: - result = super(FacialLandmarkConfidencePipeline, - self).preprocess(input) + result = super().preprocess(input) img = LoadImage.convert_to_ndarray(input) img = img[:, :, ::-1] result['orig_img'] = img.astype(np.float32) diff --git a/modelscope/pipelines/cv/image_classification_pipeline.py b/modelscope/pipelines/cv/image_classification_pipeline.py index b9d7376b..de269f04 100644 --- a/modelscope/pipelines/cv/image_classification_pipeline.py +++ b/modelscope/pipelines/cv/image_classification_pipeline.py @@ -1,19 +1,17 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
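Several of the face pipelines above (face recognition, masked face recognition, facial expression recognition, face attribute recognition) now delegate detection and 112x112 alignment to `FaceProcessingBasePipeline.preprocess` instead of carrying private copies of `_choose_face`. A minimal sketch of the pattern a subclass follows after this change, mirroring the replacement code in this diff; the subclass name is illustrative and the import assumes the package export listed earlier in this patch:

```python
import numpy as np

from modelscope.pipelines.cv import FaceProcessingBasePipeline


class MyFacePipeline(FaceProcessingBasePipeline):  # illustrative subclass

    def preprocess(self, input):
        result = super().preprocess(input)  # face detection + alignment to a 112x112 crop
        face_img = result['img'][:, :, ::-1]  # BGR -> RGB
        face_img = np.transpose(face_img, axes=(2, 0, 1))  # HWC -> CHW
        face_img = ((face_img / 255. - 0.5) / 0.5).astype(np.float32)  # scale to [-1, 1]
        result['img'] = face_img
        return result
```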
from typing import Any, Dict, Optional, Union -import cv2 import numpy as np -import PIL import torch -from modelscope.metainfo import Pipelines -from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.metainfo import Pipelines, Preprocessors from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Model, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import OfaPreprocessor, Preprocessor, load_image -from modelscope.utils.constant import Tasks -from modelscope.utils.device import get_device +from modelscope.pipelines.util import batch_process +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.image import LoadImage +from modelscope.utils.constant import Fields, Tasks from modelscope.utils.logger import get_logger logger = get_logger() @@ -21,24 +19,6 @@ logger = get_logger() @PIPELINES.register_module( Tasks.image_classification, module_name=Pipelines.image_classification) -class ImageClassificationPipeline(Pipeline): - - def __init__(self, - model: Union[Model, str], - preprocessor: Optional[Preprocessor] = None, - **kwargs): - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - assert isinstance(model, str) or isinstance(model, Model), \ - 'model must be a single str or OfaForAllTasks' - self.model.eval() - self.model.to(get_device()) - if preprocessor is None and isinstance(self.model, OfaForAllTasks): - self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs - - @PIPELINES.register_module( Tasks.image_classification, module_name=Pipelines.general_image_classification) @@ -48,78 +28,105 @@ class ImageClassificationPipeline(Pipeline): @PIPELINES.register_module( Tasks.image_classification, module_name=Pipelines.nextvit_small_daily_image_classification) +@PIPELINES.register_module( + Tasks.image_classification, + module_name=Pipelines.convnext_base_image_classification_garbage) +@PIPELINES.register_module( + Tasks.image_classification, + module_name=Pipelines.common_image_classification) +@PIPELINES.register_module( + Tasks.image_classification, + module_name=Pipelines.bnext_small_image_classification) class GeneralImageClassificationPipeline(Pipeline): - def __init__(self, model: str, **kwargs): - """ - use `model` and `preprocessor` to create a image classification pipeline for prediction + def __init__(self, + model: str, + preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): + """Use `model` and `preprocessor` to create an image classification pipeline for prediction Args: - model: model id on modelscope hub. + model: A str format model id or model local dir to build the model instance from. + preprocessor: A preprocessor instance to preprocess the data, if None, + the pipeline will try to build the preprocessor according to the configuration.json file. + kwargs: The args needed by the `Pipeline` class. """ - super().__init__(model=model, **kwargs) - + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + self.target_gpus = None + if preprocessor is None: + assert hasattr(self.model, 'model_dir'), 'Model used in ImageClassificationPipeline should has ' \ + 'a `model_dir` attribute to build a preprocessor.' 
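The constructor added here builds the preprocessor from the model's own configuration and, for `OfaForAllTasks` models, falls back to the OFA task preprocessor, so a caller only supplies the task and a model id. A minimal usage sketch; the model id and image path are placeholders, not values from this change:

```python
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# '<image-classification-model-id>' and 'test.jpg' are placeholders.
classifier = pipeline(Tasks.image_classification, model='<image-classification-model-id>')
result = classifier('test.jpg')
print(result[OutputKeys.SCORES], result[OutputKeys.LABELS])  # top-5 scores and class names
```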
+ if self.model.__class__.__name__ == 'OfaForAllTasks': + self.preprocessor = Preprocessor.from_pretrained( + model_name_or_path=self.model.model_dir, + type=Preprocessors.ofa_tasks_preprocessor, + field=Fields.multi_modal, + **kwargs) + else: + if next(self.model.parameters()).is_cuda: + self.target_gpus = [next(self.model.parameters()).device] + assert hasattr(self.model, 'model_dir'), 'Model used in GeneralImageClassificationPipeline' \ + ' should has a `model_dir` attribute to build a preprocessor.' + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, **kwargs) + if self.preprocessor.__class__.__name__ == 'ImageClassificationBypassPreprocessor': + from modelscope.preprocessors import ImageClassificationMmcvPreprocessor + self.preprocessor = ImageClassificationMmcvPreprocessor( + self.model.model_dir, **kwargs) logger.info('load model done') - def preprocess(self, input: Input) -> Dict[str, Any]: - from mmcls.datasets.pipelines import Compose - from mmcv.parallel import collate, scatter - from modelscope.models.cv.image_classification.utils import preprocess_transform - if isinstance(input, str): - img = np.array(load_image(input)) - elif isinstance(input, PIL.Image.Image): - img = np.array(input.convert('RGB')) - elif isinstance(input, np.ndarray): - if len(input.shape) == 2: - img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - img = input[:, :, ::-1] # in rgb order + def _batch(self, data): + if self.model.__class__.__name__ == 'OfaForAllTasks': + return batch_process(self.model, data) else: - raise TypeError(f'input should be either str, PIL.Image,' - f' np.array, but got {type(input)}') + return super()._batch(data) - cfg = self.model.cfg - - if self.model.config_type == 'mmcv_config': - if cfg.data.test.pipeline[0]['type'] == 'LoadImageFromFile': - cfg.data.test.pipeline.pop(0) - data = dict(img=img) - test_pipeline = Compose(cfg.data.test.pipeline) + def preprocess(self, input: Input, **preprocess_params) -> Dict[str, Any]: + if self.model.__class__.__name__ == 'OfaForAllTasks': + return super().preprocess(input, **preprocess_params) else: - if cfg.preprocessor.val[0]['type'] == 'LoadImageFromFile': - cfg.preprocessor.val.pop(0) - data = dict(img=img) - data_pipeline = preprocess_transform(cfg.preprocessor.val) - test_pipeline = Compose(data_pipeline) + img = LoadImage.convert_to_ndarray(input) + img = img[:, :, ::-1] # Convert to BGR + data = super().preprocess(img, **preprocess_params) + from mmcv.parallel import collate, scatter + data = collate([data], samples_per_gpu=1) + if self.target_gpus is not None: + # scatter to specified GPU + data = scatter(data, self.target_gpus)[0] + return data - data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(self.model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [next(self.model.parameters()).device])[0] - - return data - - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - - with torch.no_grad(): + def forward(self, input: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + if self.model.__class__.__name__ != 'OfaForAllTasks': input['return_loss'] = False - scores = self.model(input) + return self.model(input) - return {'scores': scores} + def postprocess(self, inputs: Dict[str, Any], + **post_params) -> Dict[str, Any]: - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if self.model.__class__.__name__ != 'OfaForAllTasks': + scores = inputs - scores = inputs['scores'] + pred_scores = np.sort(scores, axis=1)[0][::-1][:5] + 
pred_labels = np.argsort(scores, axis=1)[0][::-1][:5] - pred_scores = np.sort(scores, axis=1)[0][::-1][:5] - pred_labels = np.argsort(scores, axis=1)[0][::-1][:5] + result = { + 'pred_score': [score for score in pred_scores], + 'pred_class': + [self.model.CLASSES[label] for label in pred_labels] + } - result = {'pred_score': [score for score in pred_scores]} - result['pred_class'] = [ - self.model.CLASSES[lable] for lable in pred_labels - ] - - outputs = { - OutputKeys.SCORES: result['pred_score'], - OutputKeys.LABELS: result['pred_class'] - } - return outputs + outputs = { + OutputKeys.SCORES: result['pred_score'], + OutputKeys.LABELS: result['pred_class'] + } + return outputs + else: + return inputs diff --git a/modelscope/pipelines/cv/image_deblur_pipeline.py b/modelscope/pipelines/cv/image_deblur_pipeline.py new file mode 100644 index 00000000..165e54a5 --- /dev/null +++ b/modelscope/pipelines/cv/image_deblur_pipeline.py @@ -0,0 +1,120 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch +from torchvision import transforms + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_deblur import NAFNetForImageDeblur +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import ImageDeblurPreprocessor, LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['ImageDeblurPipeline'] + + +@PIPELINES.register_module( + Tasks.image_deblurring, module_name=Pipelines.image_deblur) +class ImageDeblurPipeline(Pipeline): + """ + + Example: + + ```python + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from modelscope.outputs import OutputKeys + >>> import cv2 + >>> + >>> img = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/blurry.jpg' + >>> image_deblur_pipeline = pipeline(Tasks.image_deblurring, 'damo/cv_nafnet_image-deblur_gopro') + >>> result = image_deblur_pipeline(img)[OutputKeys.OUTPUT_IMG] + >>> cv2.imwrite('result.png', result) + ``` + """ + + def __init__(self, + model: Union[NAFNetForImageDeblur, str], + preprocessor: Optional[ImageDeblurPreprocessor] = None, + **kwargs): + """ + use `model` and `preprocessor` to create a cv image deblur pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + self.config = self.model.config + + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + logger.info('load image denoise model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_img(input) + test_transforms = transforms.Compose([transforms.ToTensor()]) + img = test_transforms(img) + result = {'img': img.unsqueeze(0).to(self._device)} + return result + + def crop_process(self, input): + output = torch.zeros_like(input) # [1, C, H, W] + # determine crop_h and crop_w + ih, iw = input.shape[-2:] + crop_rows, crop_cols = max(ih // 512, 1), max(iw // 512, 1) + overlap = 16 + + step_h, step_w = ih // crop_rows, iw // crop_cols + for y in range(crop_rows): + for x in range(crop_cols): + crop_y = step_h * y + crop_x = step_w * x + + crop_h = step_h if y < crop_rows - 1 else ih - crop_y + crop_w = step_w if x < crop_cols - 1 else iw - crop_x + + crop_frames = input[:, :, + max(0, crop_y - overlap + ):min(crop_y + crop_h + overlap, ih), + max(0, crop_x - overlap + ):min(crop_x + crop_w + + overlap, iw)].contiguous() + h_start = overlap if max(0, crop_y - overlap) > 0 else 0 + w_start = overlap if max(0, crop_x - overlap) > 0 else 0 + h_end = h_start + crop_h if min(crop_y + crop_h + + overlap, ih) < ih else ih + w_end = w_start + crop_w if min(crop_x + crop_w + + overlap, iw) < iw else iw + + output[:, :, crop_y:crop_y + crop_h, + crop_x:crop_x + crop_w] = self.model._inference_forward( + crop_frames)['outputs'][:, :, h_start:h_end, + w_start:w_end] + return output + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + def set_phase(model, is_train): + if is_train: + model.train() + else: + model.eval() + + is_train = False + set_phase(self.model, is_train) + with torch.no_grad(): + output = self.crop_process(input['img']) # output Tensor + + return {'output_tensor': output} + + def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]: + output_img = (input['output_tensor'].squeeze(0) * 255).cpu().permute( + 1, 2, 0).numpy().astype('uint8') + return {OutputKeys.OUTPUT_IMG: output_img[:, :, ::-1]} diff --git a/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py b/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py new file mode 100644 index 00000000..ccd6eb8e --- /dev/null +++ b/modelscope/pipelines/cv/image_defrcn_fewshot_pipeline.py @@ -0,0 +1,104 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks + + +@PIPELINES.register_module( + Tasks.image_fewshot_detection, + module_name=Pipelines.image_fewshot_detection) +class ImageDefrcnDetectionPipeline(Pipeline): + """ Image DeFRCN few-shot detection Pipeline. Given a image, + pipeline will return the detection results on the image. 
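The `crop_process` helper above runs the deblur network tile by tile: the frame is split into a grid of roughly 512-pixel crops, each crop is read with a 16-pixel overlap on interior edges, and only the central region is written back, which keeps memory bounded on large inputs. A small sketch of the tiling arithmetic, using an assumed 1080x1920 input:

```python
# Tiling arithmetic mirroring crop_process; the input size is an assumed example.
ih, iw, overlap = 1080, 1920, 16
crop_rows, crop_cols = max(ih // 512, 1), max(iw // 512, 1)  # -> 2 x 3 grid
step_h, step_w = ih // crop_rows, iw // crop_cols  # -> 540 x 640 tiles
for y in range(crop_rows):
    for x in range(crop_cols):
        crop_y, crop_x = step_h * y, step_w * x
        crop_h = step_h if y < crop_rows - 1 else ih - crop_y  # last row absorbs the remainder
        crop_w = step_w if x < crop_cols - 1 else iw - crop_x  # last column absorbs the remainder
        # the model sees the tile expanded by `overlap` pixels on interior edges,
        # but only the central crop_h x crop_w block is copied into the output tensor
        print(crop_y, crop_x, crop_h, crop_w)
```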
+ Example: + + ```python + >>> from modelscope.pipelines import pipeline + >>> detector = pipeline('image-fewshot-detection', 'damo/cv_resnet101_detection_fewshot-defrcn') + >>> detector('/Path/Image') + { + 'scores': [0.8307567834854126, 0.1606406420469284], + 'labels': ['person', 'dog'], + 'boxes': [ + [27.391937255859375, 0.0, 353.0, 500.0], + [64.22428131103516, 229.2884521484375, 213.90573120117188, 370.0657958984375] + ] + } + >>> # + ``` + """ + + def __init__(self, model: str, **kwargs): + """ + model: model id on modelscope hub. + """ + super().__init__(model=model, auto_collate=False, **kwargs) + + model_path = os.path.join(self.model.model_dir, + ModelFile.TORCH_MODEL_FILE) + self.model.model = self._load_pretrained( + self.model.model, model_path, self.model.model_cfg.MODEL.DEVICE) + + def _load_pretrained(self, net, load_path, device='cuda', strict=True): + + load_net = torch.load(load_path, map_location=device) + if 'scheduler' in load_net: + del load_net['scheduler'] + if 'optimizer' in load_net: + del load_net['optimizer'] + if 'iteration' in load_net: + del load_net['iteration'] + net.load_state_dict(load_net['model'], strict=strict) + + return net + + def preprocess(self, input: Input) -> Dict[str, Any]: + + img = LoadImage.convert_to_ndarray(input) + img = img.astype(np.float) + + image = img[..., ::-1].copy() # rgb to bgr + tim = torch.Tensor(image).permute(2, 0, 1) + + result = {'image': tim} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + + outputs = self.model.inference(input) + result = {'data': outputs} + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + if inputs['data'] is None: + outputs = { + OutputKeys.SCORES: [], + OutputKeys.LABELS: [], + OutputKeys.BOXES: [] + } + return outputs + + objects = inputs['data']['instances'].get_fields() + labels, bboxes = [], [] + for label, box in zip(objects['pred_classes'], objects['pred_boxes']): + labels.append(self.model.config.model.classes[label]) + bboxes.append(box.tolist()) + + scores = objects['scores'].tolist() + + outputs = { + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + OutputKeys.BOXES: bboxes + } + return outputs diff --git a/modelscope/pipelines/cv/image_face_fusion_pipeline.py b/modelscope/pipelines/cv/image_face_fusion_pipeline.py new file mode 100644 index 00000000..3ba253e1 --- /dev/null +++ b/modelscope/pipelines/cv/image_face_fusion_pipeline.py @@ -0,0 +1,68 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +import numpy as np + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_face_fusion, module_name=Pipelines.image_face_fusion) +class ImageFaceFusionPipeline(Pipeline): + """ Image face fusion pipeline + Example: + + python + >>> from modelscope.pipelines import pipeline + >>> image_face_fusion = pipeline(Tasks.image_face_fusion, + model='damo/cv_unet-image-face-fusion_damo') + >>> image_face_fusion({ + 'template': 'facefusion_template.jpg', # template path (str) + 'image': 'facefusion_user.jpg', # user path (str) + }) + { + "output_img": [H * W * 3] 0~255, we can use cv2.imwrite to save output_img as an image. 
+ } + >>> # + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create an image-face-fusion pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + logger.info('image face fusion model init done') + + def preprocess(self, + template: Input, + user: Input = None) -> Dict[str, Any]: + if type(template) is dict: # for demo service + user = template['user'] + template = template['template'] + + template_img = LoadImage.convert_to_ndarray(template) + user_img = LoadImage.convert_to_ndarray(user) + + result = {'template': template_img, 'user': user_img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + template_img = input['template'] + user_img = input['user'] + output = self.model.inference(template_img, user_img) + result = {'outputs': output} + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + output_img = inputs['outputs'] + return {OutputKeys.OUTPUT_IMG: output_img} diff --git a/modelscope/pipelines/cv/image_matching_pipeline.py b/modelscope/pipelines/cv/image_matching_pipeline.py new file mode 100644 index 00000000..d16590d4 --- /dev/null +++ b/modelscope/pipelines/cv/image_matching_pipeline.py @@ -0,0 +1,175 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, List, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_matching, module_name=Pipelines.image_matching) +class ImageMatchingPipeline(Pipeline): + """ Image Matching Pipeline. + + Example: + + ```python + from modelscope.outputs import OutputKeys + from modelscope.pipelines import pipeline + from modelscope.utils.constant import Tasks + + + task = 'image-matching' + model_id = 'damo/cv_quadtree_attention_image-matching_outdoor' + + input_location = [ + ['data/test/images/image_matching1.jpg', + 'data/test/images/image_matching2.jpg'] + ] + estimator = pipeline(Tasks.image_matching, model=model_id) + result = estimator(input_location) + kpts0, kpts1, conf = result[0][OutputKeys.MATCHES] + print(f'Found {len(kpts0)} matches') + ``` + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create an image matching pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + # check if cuda is available + if not torch.cuda.is_available(): + raise RuntimeError( + 'Cuda is not available. Image matching model only supports cuda.' 
+ ) + + logger.info('image matching model, pipeline init') + + def resize_image(self, img, max_image_size): + h, w = img.shape[:2] + scale = 1 + if max(h, w) > max_image_size: + scale = max_image_size / max(h, w) + new_w, new_h = int(w * scale), int(h * scale) + img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA) + return img, scale + + def compute_paded_size(self, size, div): + return int(np.ceil(size / div) * div) + + def pad_image(self, img, h=None, w=None, div=32): + cur_h, cur_w = img.shape[:2] + if h is None and w is None: + h, w = cur_h, cur_w + h_pad, w_pad = self.compute_paded_size(h, + div), self.compute_paded_size( + w, div) + img = cv2.copyMakeBorder( + img, + 0, + h_pad - cur_h, + 0, + w_pad - cur_w, + cv2.BORDER_CONSTANT, + value=0) + return img + + def load_image(self, img_name): + img = LoadImage.convert_to_ndarray(img_name).astype(np.float32) + img = img / 255. + # convert rgb to gray + if len(img.shape) == 3: + img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + return img + + def preprocess(self, input: Input, max_image_size=1024): + assert len(input) == 2, 'input should be a list of two images' + + img1 = self.load_image(input[0]) + img1, scale1 = self.resize_image(img1, max_image_size) + scaled_h1, scaled_w1 = img1.shape[:2] + + img2 = self.load_image(input[1]) + img2, scale2 = self.resize_image(img2, max_image_size) + scaled_h2, scaled_w2 = img2.shape[:2] + + h_max, w_max = max(scaled_h1, scaled_h2), max(scaled_w1, scaled_w2) + img1 = self.pad_image(img1, h_max, w_max) + img2 = self.pad_image(img2, h_max, w_max) + + img1 = torch.from_numpy(img1)[None][None].cuda().float() + img2 = torch.from_numpy(img2)[None][None].cuda().float() + return { + 'image0': + img1, + 'image1': + img2, + 'preprocess_info': + [scale1, scale2, scaled_h1, scaled_w1, scaled_h2, scaled_w2] + } + + def postprocess_match(self, kpt1, kpt2, conf, scale1, scale2, scaled_h1, + scaled_w1, scaled_h2, scaled_w2): + # filter out points outside the image + valid_match = (kpt1[:, 0] < scaled_w1) & (kpt1[:, 1] < scaled_h1) & ( + kpt2[:, 0] < scaled_w2) & ( + kpt2[:, 1] < scaled_h2) + kpt1, kpt2 = kpt1[valid_match], kpt2[valid_match] + kpt1 = kpt1 / scale1 + kpt2 = kpt2 / scale2 + conf = conf[valid_match] + + return kpt1, kpt2, conf + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + matches = results[OutputKeys.MATCHES] + + kpts0 = matches['kpts0'].cpu().numpy() + kpts1 = matches['kpts1'].cpu().numpy() + conf = matches['conf'].cpu().numpy() + preprocess_info = [v.cpu().numpy() for v in inputs['preprocess_info']] + kpts0, kpts1, conf = self.postprocess_match(kpts0, kpts1, conf, + *preprocess_info) + + outputs = { + OutputKeys.MATCHES: [kpts0, kpts1, conf], + } + + return outputs + + def __call__(self, input, **kwargs): + """ + Match two images and return the matched keypoints and confidence. + + Args: + input (`List[List[str]]`): A list of two image paths. + + Return: + A list of result. 
+ The list contain the following values: + + - kpts0 -- Matched keypoints in the first image + - kpts1 -- Matched keypoints in the second image + - conf -- Confidence of the match + """ + return super().__call__(input, **kwargs) diff --git a/modelscope/pipelines/cv/image_mvs_depth_estimation_pipeline.py b/modelscope/pipelines/cv/image_mvs_depth_estimation_pipeline.py new file mode 100644 index 00000000..fdd0edcf --- /dev/null +++ b/modelscope/pipelines/cv/image_mvs_depth_estimation_pipeline.py @@ -0,0 +1,80 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +from tempfile import TemporaryDirectory +from typing import Any, Dict, Union + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import depth_to_color +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_multi_view_depth_estimation, + module_name=Pipelines.image_multi_view_depth_estimation) +class ImageMultiViewDepthEstimationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image multi-view depth estimation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + self.tmp_dir = None + logger.info('pipeline init done') + + def check_input(self, input_dir): + assert os.path.exists( + input_dir), f'input dir:{input_dir} does not exsit' + sub_dirs = os.listdir(input_dir) + assert 'images' in sub_dirs, "must contain 'images' folder" + assert 'sparse' in sub_dirs, "must contain 'sparse' folder" + files = os.listdir(os.path.join(input_dir, 'sparse')) + assert 'cameras.bin' in files, "'sparse' folder must contain 'cameras.bin'" + assert 'images.bin' in files, "'sparse' folder must contain 'images.bin'" + assert 'points3D.bin' in files, "'sparse' folder must contain 'points3D.bin'" + + def preprocess(self, input: Input) -> Dict[str, Any]: + assert isinstance(input, str), 'input must be str' + self.check_input(input) + self.tmp_dir = TemporaryDirectory() + + casmvs_inp_dir = os.path.join(self.tmp_dir.name, 'casmvs_inp_dir') + casmvs_res_dir = os.path.join(self.tmp_dir.name, 'casmvs_res_dir') + os.makedirs(casmvs_inp_dir, exist_ok=True) + os.makedirs(casmvs_res_dir, exist_ok=True) + + input_dict = { + 'input_dir': input, + 'casmvs_inp_dir': casmvs_inp_dir, + 'casmvs_res_dir': casmvs_res_dir + } + + self.model.preprocess_make_pair(input_dict) + + return input_dict + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.forward(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + + pcd = self.model.postprocess(inputs) + + # clear tmp dir + if self.tmp_dir is not None: + self.tmp_dir.cleanup() + + outputs = { + OutputKeys.OUTPUT: pcd, + } + + return outputs diff --git a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py index b96e709c..fe941d9f 100644 --- a/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/image_panoptic_segmentation_pipeline.py @@ -10,6 +10,7 @@ from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from 
modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.cv.easycv_pipelines.base import EasyCVPipeline from modelscope.preprocessors import load_image from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -99,3 +100,36 @@ class ImagePanopticSegmentationPipeline(Pipeline): OutputKeys.SCORES: [0.999 for _ in range(len(labels_txt))] } return outputs + + +@PIPELINES.register_module( + Tasks.image_segmentation, + module_name=Pipelines.image_panoptic_segmentation_easycv) +class ImagePanopticSegmentationEasyCVPipeline(EasyCVPipeline): + """Pipeline built upon easycv for image segmentation.""" + + def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. + """ + super(ImagePanopticSegmentationEasyCVPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + easycv_results = outputs[0] + + results = { + OutputKeys.MASKS: + easycv_results[OutputKeys.MASKS], + OutputKeys.LABELS: + easycv_results[OutputKeys.LABELS], + OutputKeys.SCORES: + [0.999 for _ in range(len(easycv_results[OutputKeys.LABELS]))] + } + + return results diff --git a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py index 3eec6526..18883171 100644 --- a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py +++ b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py @@ -47,6 +47,8 @@ class ImagePortraitEnhancementPipeline(Pipeline): self.use_sr = True self.size = 512 + if 'hires' in model: + self.size = 1024 self.n_mlp = 8 self.channel_multiplier = 2 self.narrow = 1 diff --git a/modelscope/pipelines/cv/indoor_layout_estimation_pipeline.py b/modelscope/pipelines/cv/indoor_layout_estimation_pipeline.py new file mode 100644 index 00000000..673c8d3b --- /dev/null +++ b/modelscope/pipelines/cv/indoor_layout_estimation_pipeline.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +import cv2 +import numpy as np + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.indoor_layout_estimation.networks.misc.fourier import ( + fourier, fourier_gray) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.indoor_layout_estimation, + module_name=Pipelines.indoor_layout_estimation) +class IndoorLayoutEstimationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a indoor layout estimation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + logger.info('layout estimation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + image = LoadImage.convert_to_ndarray(input).astype(np.float32) + H, W = 512, 1024 + image = cv2.resize(image, (W, H)) + F = fourier(image) + F2 = fourier_gray(image) / 255. + + image = image / 255. 
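+ # stack the normalized image with its Fourier features (F) and gray-image Fourier features (F2) along the channel axis, then convert HWC -> NCHW with a leading batch dimension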
+ x = np.concatenate((image, F, F2), axis=2).astype(np.float32) + x = x.transpose(2, 0, 1)[None] + data = {'images': x} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + layout_image = self.model.postprocess(inputs) + outputs = { + OutputKeys.LAYOUT: layout_image, + } + return outputs diff --git a/modelscope/pipelines/cv/mask_face_recognition_pipeline.py b/modelscope/pipelines/cv/mask_face_recognition_pipeline.py index 2190b6d0..5a59c962 100644 --- a/modelscope/pipelines/cv/mask_face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/mask_face_recognition_pipeline.py @@ -19,13 +19,14 @@ from modelscope.pipelines.builder import PIPELINES from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.logger import get_logger +from . import FaceProcessingBasePipeline logger = get_logger() @PIPELINES.register_module( Tasks.face_recognition, module_name=Pipelines.mask_face_recognition) -class MaskFaceRecognitionPipeline(Pipeline): +class MaskFaceRecognitionPipeline(FaceProcessingBasePipeline): def __init__(self, model: str, **kwargs): """ @@ -44,10 +45,6 @@ class MaskFaceRecognitionPipeline(Pipeline): face_model.eval() self.face_model = face_model logger.info('face recognition model loaded!') - # face detect pipeline - det_model_id = 'damo/cv_resnet50_face-detection_retinaface' - self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id) def _prefix_revision(self, state_dict): new_state_dict = OrderedDict() @@ -58,72 +55,13 @@ class MaskFaceRecognitionPipeline(Pipeline): state = new_state_dict return state - def _choose_face(self, - det_result, - min_face=10, - top_face=1, - center_face=False): - ''' - choose face with maximum area - Args: - det_result: output of face detection pipeline - min_face: minimum size of valid face w/h - top_face: take faces with top max areas - center_face: choose the most centerd face from multi faces, only valid if top_face > 1 - ''' - bboxes = np.array(det_result[OutputKeys.BOXES]) - landmarks = np.array(det_result[OutputKeys.KEYPOINTS]) - if bboxes.shape[0] == 0: - logger.info('No face detected!') - return None - # face idx with enough size - face_idx = [] - for i in range(bboxes.shape[0]): - box = bboxes[i] - if (box[2] - box[0]) >= min_face and (box[3] - box[1]) >= min_face: - face_idx += [i] - if len(face_idx) == 0: - logger.info( - f'Face size not enough, less than {min_face}x{min_face}!') - return None - bboxes = bboxes[face_idx] - landmarks = landmarks[face_idx] - # find max faces - boxes = np.array(bboxes) - area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - sort_idx = np.argsort(area)[-top_face:] - # find center face - if top_face > 1 and center_face and bboxes.shape[0] > 1: - img_center = [img.shape[1] // 2, img.shape[0] // 2] - min_dist = float('inf') - sel_idx = -1 - for _idx in sort_idx: - box = boxes[_idx] - dist = np.square( - np.abs((box[0] + box[2]) / 2 - img_center[0])) + np.square( - np.abs((box[1] + box[3]) / 2 - img_center[1])) - if dist < min_dist: - min_dist = dist - sel_idx = _idx - sort_idx = [sel_idx] - main_idx = sort_idx[-1] - return bboxes[main_idx], landmarks[main_idx] - def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_ndarray(input) - img = img[:, :, ::-1] - det_result = self.face_detection(img.copy()) - rtn = self._choose_face(det_result) - face_img 
= None - if rtn is not None: - _, face_lmks = rtn - face_lmks = face_lmks.reshape(5, 2) - align_img, _ = align_face(img, (112, 112), face_lmks) - face_img = align_img[:, :, ::-1] # to rgb - face_img = np.transpose(face_img, axes=(2, 0, 1)) - face_img = (face_img / 255. - 0.5) / 0.5 - face_img = face_img.astype(np.float32) - result = {} + result = super().preprocess(input) + align_img = result['img'] + face_img = align_img[:, :, ::-1] # to rgb + face_img = np.transpose(face_img, axes=(2, 0, 1)) + face_img = (face_img / 255. - 0.5) / 0.5 + face_img = face_img.astype(np.float32) result['img'] = face_img return result diff --git a/modelscope/pipelines/cv/maskdino_instance_segmentation_pipeline.py b/modelscope/pipelines/cv/maskdino_instance_segmentation_pipeline.py new file mode 100644 index 00000000..8742a104 --- /dev/null +++ b/modelscope/pipelines/cv/maskdino_instance_segmentation_pipeline.py @@ -0,0 +1,80 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch +import torchvision.transforms as T + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_instance_segmentation import ( + MaskDINOSwinModel, get_maskdino_ins_seg_result) +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_segmentation, + module_name=Pipelines.maskdino_instance_segmentation) +class MaskDINOInstanceSegmentationPipeline(Pipeline): + + def __init__(self, + model: Union[MaskDINOSwinModel, str], + preprocessor: Optional = None, + **kwargs): + """use `model` and `preprocessor` to create a MaskDINO instance segmentation + pipeline for prediction + + Args: + model (MaskDINOSwinModel | str): a model instance + preprocessor (None): a preprocessor instance + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + + def get_preprocess_shape(self, oldh, oldw, short_edge_length, max_size): + h, w = oldh, oldw + size = short_edge_length * 1.0 + scale = size / min(h, w) + if h < w: + newh, neww = size, scale * w + else: + newh, neww = scale * h, size + if max(newh, neww) > max_size: + scale = max_size * 1.0 / max(newh, neww) + newh = newh * scale + neww = neww * scale + neww = int(neww + 0.5) + newh = int(newh + 0.5) + return (newh, neww) + + def preprocess(self, input: Input) -> Dict[str, Any]: + image = LoadImage.convert_to_img(input) + w, h = image.size[:2] + new_h, new_w = self.get_preprocess_shape(h, w, 800, 1333) + test_transforms = T.Compose([ + T.Resize((new_h, new_w)), + T.ToTensor(), + ]) + image = test_transforms(image) + dataset_dict = {} + dataset_dict['height'] = h + dataset_dict['width'] = w + dataset_dict['image'] = image + result = {'batched_inputs': [dataset_dict]} + return result + + def forward(self, input: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + output = self.model(input) + return output + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + result = get_maskdino_ins_seg_result( + inputs['eval_result'][0]['instances'], + class_names=self.model.model.classes) + return result diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index 292ec2c5..682b05c4 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py 
+++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -1,22 +1,25 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import math import os.path as osp from typing import Any, Dict import cv2 import numpy as np import tensorflow as tf +import torch from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.cv.ocr_utils.model_vlpt import VLPTModel from modelscope.preprocessors import LoadImage from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.device import device_placement from modelscope.utils.logger import get_logger from .ocr_utils import (SegLinkDetector, cal_width, combine_segments_python, decode_segments_links_python, nms_python, - rboxes_to_polygons) + polygons_from_bitmap, rboxes_to_polygons) if tf.__version__ >= '2.0': import tf_slim as slim @@ -53,132 +56,188 @@ class OCRDetectionPipeline(Pipeline): model: model id on modelscope hub. """ super().__init__(model=model, **kwargs) - tf.reset_default_graph() - model_path = osp.join( - osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), - 'checkpoint-80000') - self._graph = tf.get_default_graph() - config = tf.ConfigProto(allow_soft_placement=True) - config.gpu_options.allow_growth = True - self._session = tf.Session(config=config) + if 'vlpt' in self.model: + model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model from {model_path}') - with self._graph.as_default(): - with device_placement(self.framework, self.device_name): - self.input_images = tf.placeholder( - tf.float32, shape=[1, 1024, 1024, 3], name='input_images') - self.output = {} + self.thresh = 0.3 + self.image_short_side = 736 + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + self.infer_model = VLPTModel().to(self.device) + self.infer_model.eval() + checkpoint = torch.load(model_path, map_location=self.device) + if 'state_dict' in checkpoint: + self.infer_model.load_state_dict(checkpoint['state_dict']) + else: + self.infer_model.load_state_dict(checkpoint) + else: + tf.reset_default_graph() + model_path = osp.join( + osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), + 'checkpoint-80000') + self._graph = tf.get_default_graph() + config = tf.ConfigProto(allow_soft_placement=True) + config.gpu_options.allow_growth = True + self._session = tf.Session(config=config) - with tf.variable_scope('', reuse=tf.AUTO_REUSE): - global_step = tf.get_variable( - 'global_step', [], - initializer=tf.constant_initializer(0), - dtype=tf.int64, - trainable=False) - variable_averages = tf.train.ExponentialMovingAverage( - 0.997, global_step) + with self._graph.as_default(): + with device_placement(self.framework, self.device_name): + self.input_images = tf.placeholder( + tf.float32, + shape=[1, 1024, 1024, 3], + name='input_images') + self.output = {} - # detector - detector = SegLinkDetector() - all_maps = detector.build_model( - self.input_images, is_training=False) + with tf.variable_scope('', reuse=tf.AUTO_REUSE): + global_step = tf.get_variable( + 'global_step', [], + initializer=tf.constant_initializer(0), + dtype=tf.int64, + trainable=False) + variable_averages = tf.train.ExponentialMovingAverage( + 0.997, global_step) - # decode local predictions - all_nodes, all_links, all_reg = [], [], [] - for i, maps in enumerate(all_maps): - cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[ - 2] - reg_maps = tf.multiply(reg_maps, 
OFFSET_VARIANCE) + # detector + detector = SegLinkDetector() + all_maps = detector.build_model( + self.input_images, is_training=False) - cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2])) + # decode local predictions + all_nodes, all_links, all_reg = [], [], [] + for i, maps in enumerate(all_maps): + cls_maps, lnk_maps, reg_maps = maps[0], maps[ + 1], maps[2] + reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE) - lnk_prob_pos = tf.nn.softmax( - tf.reshape(lnk_maps, [-1, 4])[:, :2]) - lnk_prob_mut = tf.nn.softmax( - tf.reshape(lnk_maps, [-1, 4])[:, 2:]) - lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], - axis=1) + cls_prob = tf.nn.softmax( + tf.reshape(cls_maps, [-1, 2])) - all_nodes.append(cls_prob) - all_links.append(lnk_prob) - all_reg.append(reg_maps) + lnk_prob_pos = tf.nn.softmax( + tf.reshape(lnk_maps, [-1, 4])[:, :2]) + lnk_prob_mut = tf.nn.softmax( + tf.reshape(lnk_maps, [-1, 4])[:, 2:]) + lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut], + axis=1) - # decode segments and links - image_size = tf.shape(self.input_images)[1:3] - segments, group_indices, segment_counts, _ = decode_segments_links_python( - image_size, - all_nodes, - all_links, - all_reg, - anchor_sizes=list(detector.anchor_sizes)) + all_nodes.append(cls_prob) + all_links.append(lnk_prob) + all_reg.append(reg_maps) - # combine segments - combined_rboxes, combined_counts = combine_segments_python( - segments, group_indices, segment_counts) - self.output['combined_rboxes'] = combined_rboxes - self.output['combined_counts'] = combined_counts + # decode segments and links + image_size = tf.shape(self.input_images)[1:3] + segments, group_indices, segment_counts, _ = decode_segments_links_python( + image_size, + all_nodes, + all_links, + all_reg, + anchor_sizes=list(detector.anchor_sizes)) - with self._session.as_default() as sess: - logger.info(f'loading model from {model_path}') - # load model - model_loader = tf.train.Saver( - variable_averages.variables_to_restore()) - model_loader.restore(sess, model_path) + # combine segments + combined_rboxes, combined_counts = combine_segments_python( + segments, group_indices, segment_counts) + self.output['combined_rboxes'] = combined_rboxes + self.output['combined_counts'] = combined_counts + + with self._session.as_default() as sess: + logger.info(f'loading model from {model_path}') + # load model + model_loader = tf.train.Saver( + variable_averages.variables_to_restore()) + model_loader.restore(sess, model_path) def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_ndarray(input) + if 'vlpt' in self.model: + img = LoadImage.convert_to_ndarray(input)[:, :, ::-1] - h, w, c = img.shape - img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32) - img_pad[:h, :w, :] = img + height, width, _ = img.shape + if height < width: + new_height = self.image_short_side + new_width = int( + math.ceil(new_height / height * width / 32) * 32) + else: + new_width = self.image_short_side + new_height = int( + math.ceil(new_width / width * height / 32) * 32) + resized_img = cv2.resize(img, (new_width, new_height)) - resize_size = 1024 - img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size)) - img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR) - img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94], - dtype=np.float32) + resized_img = resized_img - np.array([123.68, 116.78, 103.94], + dtype=np.float32) + resized_img /= 255. 
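The vlpt branch of `preprocess` above scales the short side to `image_short_side` (736) and rounds the long side up to the next multiple of 32 before mean subtraction. A minimal sketch of that sizing rule; `short_side_resize_shape` is a hypothetical helper name, not part of this diff:

```python
import math

def short_side_resize_shape(height: int, width: int,
                            short_side: int = 736, divisor: int = 32):
    """Shorter edge becomes `short_side`; the longer edge is scaled
    proportionally and rounded up to a multiple of `divisor`, matching the
    stride requirement of the detector backbone."""
    if height < width:
        new_height = short_side
        new_width = int(math.ceil(new_height / height * width / divisor) * divisor)
    else:
        new_width = short_side
        new_height = int(math.ceil(new_width / width * height / divisor) * divisor)
    return new_height, new_width

# e.g. a 1080x1920 landscape frame is resized to 736x1312
assert short_side_resize_shape(1080, 1920) == (736, 1312)
```

Rounding up rather than down keeps every feature map dimension divisible by the network stride without cropping the image.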
+ resized_img = torch.from_numpy(resized_img).permute( + 2, 0, 1).float().unsqueeze(0) - with self._graph.as_default(): - resize_size = tf.stack([resize_size, resize_size]) - orig_size = tf.stack([max(h, w), max(h, w)]) - self.output['orig_size'] = orig_size - self.output['resize_size'] = resize_size + result = {'img': resized_img, 'org_shape': [height, width]} + return result + else: + img = LoadImage.convert_to_ndarray(input) - result = {'img': np.expand_dims(img_pad_resize, axis=0)} - return result + h, w, c = img.shape + img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32) + img_pad[:h, :w, :] = img + + resize_size = 1024 + img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size)) + img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR) + img_pad_resize = img_pad_resize - np.array( + [123.68, 116.78, 103.94], dtype=np.float32) + + with self._graph.as_default(): + resize_size = tf.stack([resize_size, resize_size]) + orig_size = tf.stack([max(h, w), max(h, w)]) + self.output['orig_size'] = orig_size + self.output['resize_size'] = resize_size + + result = {'img': np.expand_dims(img_pad_resize, axis=0)} + return result def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - with self._graph.as_default(): - with self._session.as_default(): - feed_dict = {self.input_images: input['img']} - sess_outputs = self._session.run( - self.output, feed_dict=feed_dict) - return sess_outputs + if 'vlpt' in self.model: + pred = self.infer_model(input['img']) + return {'results': pred, 'org_shape': input['org_shape']} + else: + with self._graph.as_default(): + with self._session.as_default(): + feed_dict = {self.input_images: input['img']} + sess_outputs = self._session.run( + self.output, feed_dict=feed_dict) + return sess_outputs def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - rboxes = inputs['combined_rboxes'][0] - count = inputs['combined_counts'][0] - if count == 0 or count < rboxes.shape[0]: - raise Exception('modelscope error: No text detected') - rboxes = rboxes[:count, :] + if 'vlpt' in self.model: + pred = inputs['results'][0] + height, width = inputs['org_shape'] + segmentation = pred > self.thresh - # convert rboxes to polygons and find its coordinates on the original image - orig_h, orig_w = inputs['orig_size'] - resize_h, resize_w = inputs['resize_size'] - polygons = rboxes_to_polygons(rboxes) - scale_y = float(orig_h) / float(resize_h) - scale_x = float(orig_w) / float(resize_w) + boxes, scores = polygons_from_bitmap(pred, segmentation, width, + height) + result = {OutputKeys.POLYGONS: np.array(boxes)} + return result + else: + rboxes = inputs['combined_rboxes'][0] + count = inputs['combined_counts'][0] + if count == 0 or count < rboxes.shape[0]: + raise Exception('modelscope error: No text detected') + rboxes = rboxes[:count, :] - # confine polygons inside image - polygons[:, ::2] = np.maximum( - 0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1)) - polygons[:, 1::2] = np.maximum( - 0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1)) - polygons = np.round(polygons).astype(np.int32) + # convert rboxes to polygons and find its coordinates on the original image + orig_h, orig_w = inputs['orig_size'] + resize_h, resize_w = inputs['resize_size'] + polygons = rboxes_to_polygons(rboxes) + scale_y = float(orig_h) / float(resize_h) + scale_x = float(orig_w) / float(resize_w) - # nms - dt_n9 = [o + [cal_width(o)] for o in polygons.tolist()] - dt_nms = nms_python(dt_n9) - dt_polygons = np.array([o[:8] for o in dt_nms]) + # confine polygons 
inside image + polygons[:, ::2] = np.maximum( + 0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1)) + polygons[:, 1::2] = np.maximum( + 0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1)) + polygons = np.round(polygons).astype(np.int32) - result = {OutputKeys.POLYGONS: dt_polygons} - return result + # nms + dt_n9 = [o + [cal_width(o)] for o in polygons.tolist()] + dt_nms = nms_python(dt_n9) + dt_polygons = np.array([o[:8] for o in dt_nms]) + + result = {OutputKeys.POLYGONS: dt_polygons} + return result diff --git a/modelscope/pipelines/cv/ocr_utils/__init__.py b/modelscope/pipelines/cv/ocr_utils/__init__.py index 312445a9..979ea82c 100644 --- a/modelscope/pipelines/cv/ocr_utils/__init__.py +++ b/modelscope/pipelines/cv/ocr_utils/__init__.py @@ -6,12 +6,15 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .model_resnet_mutex_v4_linewithchar import SegLinkDetector from .ops import decode_segments_links_python, combine_segments_python - from .utils import rboxes_to_polygons, cal_width, nms_python + from .utils import rboxes_to_polygons, cal_width, nms_python, polygons_from_bitmap else: _import_structure = { 'model_resnet_mutex_v4_linewithchar': ['SegLinkDetector'], 'ops': ['decode_segments_links_python', 'combine_segments_python'], - 'utils': ['rboxes_to_polygons', 'cal_width', 'nms_python'] + 'utils': [ + 'rboxes_to_polygons', 'cal_width', 'nms_python', + 'polygons_from_bitmap' + ] } import sys diff --git a/modelscope/pipelines/cv/ocr_utils/model_vlpt.py b/modelscope/pipelines/cv/ocr_utils/model_vlpt.py new file mode 100644 index 00000000..19ac9807 --- /dev/null +++ b/modelscope/pipelines/cv/ocr_utils/model_vlpt.py @@ -0,0 +1,431 @@ +# ------------------------------------------------------------------------------ +# Part of implementation is adopted from ViLT, +# made publicly available under the Apache License 2.0 at https://github.com/dandelin/ViLT. 
+# ------------------------------------------------------------------------------ + +import math +import os +import sys + +import torch +import torch.nn as nn + +BatchNorm2d = nn.BatchNorm2d + + +def constant_init(module, constant, bias=0): + nn.init.constant_(module.weight, constant) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None): + super(BasicBlock, self).__init__() + self.with_dcn = dcn is not None + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = dcn.get('fallback_on_stride', False) + self.with_modulated_dcn = dcn.get('modulated', False) + # self.conv2 = conv3x3(planes, planes) + if not self.with_dcn or fallback_on_stride: + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, padding=1, bias=False) + else: + deformable_groups = dcn.get('deformable_groups', 1) + if not self.with_modulated_dcn: + from assets.ops.dcn import DeformConv + conv_op = DeformConv + offset_channels = 18 + else: + from assets.ops.dcn import ModulatedDeformConv + conv_op = ModulatedDeformConv + offset_channels = 27 + self.conv2_offset = nn.Conv2d( + planes, + deformable_groups * offset_channels, + kernel_size=3, + padding=1) + self.conv2 = conv_op( + planes, + planes, + kernel_size=3, + padding=1, + deformable_groups=deformable_groups, + bias=False) + self.bn2 = BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # out = self.conv2(out) + if not self.with_dcn: + out = self.conv2(out) + elif self.with_modulated_dcn: + offset_mask = self.conv2_offset(out) + offset = offset_mask[:, :18, :, :] + mask = offset_mask[:, -9:, :, :].sigmoid() + out = self.conv2(out, offset, mask) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None): + super(Bottleneck, self).__init__() + self.with_dcn = dcn is not None + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm2d(planes) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = dcn.get('fallback_on_stride', False) + self.with_modulated_dcn = dcn.get('modulated', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + else: + deformable_groups = dcn.get('deformable_groups', 1) + if not self.with_modulated_dcn: + from assets.ops.dcn import DeformConv + conv_op = DeformConv + offset_channels = 18 + else: + from assets.ops.dcn import ModulatedDeformConv + conv_op = ModulatedDeformConv + offset_channels = 27 + self.conv2_offset = nn.Conv2d( + planes, + deformable_groups * offset_channels, + kernel_size=3, + padding=1) + self.conv2 = conv_op( + planes, + planes, + kernel_size=3, + 
padding=1, + stride=stride, + deformable_groups=deformable_groups, + bias=False) + self.bn2 = BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dcn = dcn + self.with_dcn = dcn is not None + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # out = self.conv2(out) + if not self.with_dcn: + out = self.conv2(out) + elif self.with_modulated_dcn: + offset_mask = self.conv2_offset(out) + offset = offset_mask[:, :18, :, :] + mask = offset_mask[:, -9:, :, :].sigmoid() + out = self.conv2(out, offset, mask) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, + block, + layers, + num_classes=1000, + dcn=None, + stage_with_dcn=(False, False, False, False)): + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + self.inplanes = 64 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer( + block, 128, layers[1], stride=2, dcn=dcn) + self.layer3 = self._make_layer( + block, 256, layers[2], stride=2, dcn=dcn) + self.layer4 = self._make_layer( + block, 512, layers[3], stride=2, dcn=dcn) + # self.avgpool = nn.AvgPool2d(7, stride=1) + # self.fc = nn.Linear(512 * block.expansion, num_classes) + + # self.smooth = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + if self.dcn is not None: + for m in self.modules(): + if isinstance(m, Bottleneck) or isinstance(m, BasicBlock): + if hasattr(m, 'conv2_offset'): + constant_init(m.conv2_offset, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dcn=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, dcn=dcn)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, dcn=dcn)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x2 = self.layer1(x) + x3 = self.layer2(x2) + x4 = self.layer3(x3) + x5 = self.layer4(x4) + + return x2, x3, x4, x5 + + +class SegDetector(nn.Module): + + def __init__(self, + in_channels=[64, 128, 256, 512], + inner_channels=256, + k=10, + bias=False, + adaptive=False, + smooth=False, + serial=False, + *args, + **kwargs): + ''' + bias: Whether conv layers have bias or not. + adaptive: Whether to use adaptive threshold training or not. 
+ smooth: If true, use bilinear instead of deconv. + serial: If true, thresh prediction will combine segmentation result as input. + ''' + super(SegDetector, self).__init__() + self.k = k + self.serial = serial + self.up5 = nn.Upsample(scale_factor=2, mode='nearest') + self.up4 = nn.Upsample(scale_factor=2, mode='nearest') + self.up3 = nn.Upsample(scale_factor=2, mode='nearest') + + self.in5 = nn.Conv2d(in_channels[-1], inner_channels, 1, bias=bias) + self.in4 = nn.Conv2d(in_channels[-2], inner_channels, 1, bias=bias) + self.in3 = nn.Conv2d(in_channels[-3], inner_channels, 1, bias=bias) + self.in2 = nn.Conv2d(in_channels[-4], inner_channels, 1, bias=bias) + + self.out5 = nn.Sequential( + nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=bias), + nn.Upsample(scale_factor=8, mode='nearest')) + self.out4 = nn.Sequential( + nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=bias), + nn.Upsample(scale_factor=4, mode='nearest')) + self.out3 = nn.Sequential( + nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=bias), + nn.Upsample(scale_factor=2, mode='nearest')) + self.out2 = nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=bias) + + self.binarize = nn.Sequential( + nn.Conv2d( + inner_channels, inner_channels // 4, 3, padding=1, bias=bias), + BatchNorm2d(inner_channels // 4), nn.ReLU(inplace=True), + nn.ConvTranspose2d(inner_channels // 4, inner_channels // 4, 2, 2), + BatchNorm2d(inner_channels // 4), nn.ReLU(inplace=True), + nn.ConvTranspose2d(inner_channels // 4, 1, 2, 2), nn.Sigmoid()) + self.binarize.apply(self.weights_init) + + self.adaptive = adaptive + if adaptive: + self.thresh = self._init_thresh( + inner_channels, serial=serial, smooth=smooth, bias=bias) + self.thresh.apply(self.weights_init) + + self.in5.apply(self.weights_init) + self.in4.apply(self.weights_init) + self.in3.apply(self.weights_init) + self.in2.apply(self.weights_init) + self.out5.apply(self.weights_init) + self.out4.apply(self.weights_init) + self.out3.apply(self.weights_init) + self.out2.apply(self.weights_init) + + def weights_init(self, m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + nn.init.kaiming_normal_(m.weight.data) + elif classname.find('BatchNorm') != -1: + m.weight.data.fill_(1.) 
+ m.bias.data.fill_(1e-4) + + def _init_thresh(self, + inner_channels, + serial=False, + smooth=False, + bias=False): + in_channels = inner_channels + if serial: + in_channels += 1 + self.thresh = nn.Sequential( + nn.Conv2d( + in_channels, inner_channels // 4, 3, padding=1, bias=bias), + BatchNorm2d(inner_channels // 4), nn.ReLU(inplace=True), + self._init_upsample( + inner_channels // 4, + inner_channels // 4, + smooth=smooth, + bias=bias), BatchNorm2d(inner_channels // 4), + nn.ReLU(inplace=True), + self._init_upsample( + inner_channels // 4, 1, smooth=smooth, bias=bias), + nn.Sigmoid()) + return self.thresh + + def _init_upsample(self, + in_channels, + out_channels, + smooth=False, + bias=False): + if smooth: + inter_out_channels = out_channels + if out_channels == 1: + inter_out_channels = in_channels + module_list = [ + nn.Upsample(scale_factor=2, mode='nearest'), + nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias) + ] + if out_channels == 1: + module_list.append( + nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=1, + bias=True)) + + return nn.Sequential(module_list) + else: + return nn.ConvTranspose2d(in_channels, out_channels, 2, 2) + + def forward(self, features, gt=None, masks=None, training=False): + c2, c3, c4, c5 = features + in5 = self.in5(c5) + in4 = self.in4(c4) + in3 = self.in3(c3) + in2 = self.in2(c2) + + out4 = self.up5(in5) + in4 # 1/16 + out3 = self.up4(out4) + in3 # 1/8 + out2 = self.up3(out3) + in2 # 1/4 + + p5 = self.out5(in5) + p4 = self.out4(out4) + p3 = self.out3(out3) + p2 = self.out2(out2) + + fuse = torch.cat((p5, p4, p3, p2), 1) + # this is the pred module, not binarization module; + # We do not correct the name due to the trained model. + binary = self.binarize(fuse) + return binary + + def step_function(self, x, y): + return torch.reciprocal(1 + torch.exp(-self.k * (x - y))) + + +class VLPTModel(nn.Module): + + def __init__(self, *args, **kwargs): + super(VLPTModel, self).__init__() + self.backbone = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + self.decoder = SegDetector( + in_channels=[256, 512, 1024, 2048], adaptive=True, k=50, **kwargs) + + def forward(self, x): + return self.decoder(self.backbone(x)) diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py index 1d0fb297..b024844d 100644 --- a/modelscope/pipelines/cv/ocr_utils/utils.py +++ b/modelscope/pipelines/cv/ocr_utils/utils.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import cv2 import numpy as np +import pyclipper +from shapely.geometry import Polygon def rboxes_to_polygons(rboxes): @@ -107,3 +109,102 @@ def point_line_dist(px, py, x1, y1, x2, y2): div = np.sqrt(dx * dx + dy * dy) + eps dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div return dist + + +# Part of the implementation is borrowed and modified from DB, +# publicly available at https://github.com/MhLiao/DB. 
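The helpers that follow decode the binarized probability map the DB way: each candidate polygon is expanded ("unclipped") before its minimum-area box is taken, with an offset distance of area * unclip_ratio / perimeter (ratio 2.0 in `polygons_from_bitmap`). A quick check of that arithmetic with shapely; this snippet is illustrative and not part of the diff:

```python
from shapely.geometry import Polygon

# DB-style unclip distance: d = area * ratio / perimeter. The expansion
# compensates for the shrunken segmentation target the detector is trained on.
box = [(0, 0), (100, 0), (100, 20), (0, 20)]   # a 100x20 px text region
poly = Polygon(box)
ratio = 2.0
d = poly.area * ratio / poly.length            # 2000 * 2 / 240, roughly 16.7 px
print(round(d, 1))
```

pyclipper then offsets the polygon outward by `d` with `JT_ROUND` joins, as `unclip` below does.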
+def polygons_from_bitmap(pred, _bitmap, dest_width, dest_height): + """ + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + """ + + assert _bitmap.size(0) == 1 + bitmap = _bitmap.cpu().numpy()[0] + pred = pred.cpu().detach().numpy()[0] + height, width = bitmap.shape + boxes = [] + scores = [] + + contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8), + cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) + + for contour in contours[:100]: + epsilon = 0.01 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + points = approx.reshape((-1, 2)) + if points.shape[0] < 4: + continue + + score = box_score_fast(pred, points.reshape(-1, 2)) + if 0.7 > score: + continue + + if points.shape[0] > 2: + box = unclip(points, unclip_ratio=2.0) + if len(box) > 1: + continue + else: + continue + box = box.reshape(-1, 2) + _, sside = get_mini_boxes(box.reshape((-1, 1, 2))) + if sside < 3 + 2: + continue + + if not isinstance(dest_width, int): + dest_width = dest_width.item() + dest_height = dest_height.item() + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.tolist()) + scores.append(score) + return boxes, scores + + +def box_score_fast(bitmap, _box): + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + +def unclip(box, unclip_ratio=1.5): + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + +def get_mini_boxes(contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [points[index_1], points[index_2], points[index_3], points[index_4]] + return box, min(bounding_box[1]) diff --git a/modelscope/pipelines/cv/panorama_depth_estimation_pipeline.py b/modelscope/pipelines/cv/panorama_depth_estimation_pipeline.py new file mode 100644 index 00000000..a1973285 --- /dev/null +++ b/modelscope/pipelines/cv/panorama_depth_estimation_pipeline.py @@ -0,0 +1,85 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
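Before the panorama depth pipeline below, a compact, self-contained illustration of the decode path these helpers implement: threshold the probability map, trace contours, simplify them to polygons, and keep only those whose mean probability clears the 0.7 cut-off. The snippet uses OpenCV 4.x directly and is a sketch, not a call into the new helpers:

```python
import cv2
import numpy as np

# Synthetic probability map with one bright text region.
pred = np.zeros((64, 96), dtype=np.float32)
pred[20:40, 10:80] = 0.95
bitmap = (pred > 0.3).astype(np.uint8)         # same 0.3 threshold as the pipeline

contours, _ = cv2.findContours(bitmap * 255, cv2.RETR_LIST,
                               cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
    approx = cv2.approxPolyDP(contour, 0.01 * cv2.arcLength(contour, True), True)
    points = approx.reshape(-1, 2)
    if points.shape[0] < 4:
        continue
    mask = np.zeros(pred.shape, dtype=np.uint8)
    cv2.fillPoly(mask, [points.astype(np.int32)], 1)
    score = cv2.mean(pred, mask)[0]            # mean probability inside the polygon
    if score >= 0.7:
        print(points.tolist(), round(score, 2))   # four corners, score ~0.95
```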
+from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import depth_to_color +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.panorama_depth_estimation, + module_name=Pipelines.panorama_depth_estimation) +class PanoramaDepthEstimationPipeline(Pipeline): + """ This pipeline will estimation the depth panoramic image from one rgb panoramic image. + The input panoramic image should be equirectanlar, in the size of 512x1024. + + Example: + '''python + >>> import cv2 + >>> from modelscope.outputs import OutputKeys + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + + >>> task = 'panorama-depth-estimation' + >>> model_id = 'damo/cv_unifuse_image-depth-estimation' + + >>> input_location = 'data/test/images/panorama_depth_estimation.jpg' + >>> estimator = pipeline(Tasks.panorama_depth_estimation, model=model_id) + >>> result = estimator(input_location) + >>> depth_vis = result[OutputKeys.DEPTHS_COLOR] + >>> cv2.imwrite('result.jpg', depth_vis) + ''' + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a panorama depth estimation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + logger.info('depth estimation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + H, W = 512, 1024 + img = cv2.resize(img, dsize=(W, H), interpolation=cv2.INTER_CUBIC) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + cube_img = self.model.e2c.run(img) + data = {} + rgb = self.model.to_tensor(img.copy()) + cube_rgb = self.model.to_tensor(cube_img.copy()) + rgb = self.model.normalize(rgb) + cube_rgb = self.model.normalize(cube_rgb) + data['rgb'] = rgb[None, ...] + data['cube_rgb'] = cube_rgb[None, ...] + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.forward(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + depths = results[OutputKeys.DEPTHS] + if isinstance(depths, torch.Tensor): + depths = depths.detach().cpu().squeeze().numpy() + depths_color = depth_to_color(depths) + outputs = { + OutputKeys.DEPTHS: depths, + OutputKeys.DEPTHS_COLOR: depths_color + } + return outputs diff --git a/modelscope/pipelines/cv/pointcloud_sceneflow_estimation_pipeline.py b/modelscope/pipelines/cv/pointcloud_sceneflow_estimation_pipeline.py new file mode 100644 index 00000000..05f86393 --- /dev/null +++ b/modelscope/pipelines/cv/pointcloud_sceneflow_estimation_pipeline.py @@ -0,0 +1,114 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
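The scene-flow pipeline that follows consumes a tuple of two `.npy` point-cloud files, each an N x 3 array of XYZ coordinates (its `preprocess` below loads them with `np.load` and checks the shape). A sketch of preparing such a pair; the model id is a placeholder, and the commented call assumes the usual `pipeline(...)` entry point:

```python
import numpy as np

# Two (N, 3) float32 XYZ clouds saved as .npy, matching what preprocess expects.
pcd1 = np.random.rand(8192, 3).astype(np.float32)
pcd2 = pcd1 + np.array([0.05, 0.0, 0.0], dtype=np.float32)   # second frame: shifted copy

np.save('pcd1.npy', pcd1)
np.save('pcd2.npy', pcd2)

# estimator = pipeline(Tasks.pointcloud_sceneflow_estimation, model='<model-id>')
# result = estimator(('pcd1.npy', 'pcd2.npy'))   # tuple of two file paths
```

Note that `preprocess` moves both tensors to CUDA, so a GPU is required at inference time.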
+from typing import Any, Dict, Union + +import numpy as np +import torch +from plyfile import PlyData, PlyElement + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import depth_to_color +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.pointcloud_sceneflow_estimation, + module_name=Pipelines.pointcloud_sceneflow_estimation) +class PointCloudSceneFlowEstimationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image depth estimation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + logger.info('pointcloud scenflow estimation model, pipeline init') + + def check_input_pcd(self, pcd): + assert pcd.ndim == 2, 'pcd ndim must equal to 2' + assert pcd.shape[1] == 3, 'pcd.shape[1] must equal to 3' + + def preprocess(self, input: Input) -> Dict[str, Any]: + assert isinstance(input, tuple), 'only support tuple input' + assert isinstance(input[0], str) and isinstance( + input[1], str), 'only support tuple input with str type' + + pcd1_file, pcd2_file = input + logger.info(f'input pcd file:{pcd1_file}, \n {pcd2_file}') + pcd1 = np.load(pcd1_file) + pcd2 = np.load(pcd2_file) + self.check_input_pcd(pcd1) + self.check_input_pcd(pcd2) + pcd1_torch = torch.from_numpy(pcd1).float().unsqueeze(0).cuda() + pcd2_torch = torch.from_numpy(pcd2).float().unsqueeze(0).cuda() + + data = { + 'pcd1': pcd1_torch, + 'pcd2': pcd2_torch, + 'pcd1_ori': pcd1, + 'pcd2_ori': pcd2 + } + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = {} + output = self.model.inference(input) + results['output'] = output + results['pcd1_ori'] = input['pcd1_ori'] + results['pcd2_ori'] = input['pcd2_ori'] + return results + + def save_ply_data(self, pcd1, pcd2): + vertexs = np.concatenate([pcd1, pcd2], axis=0) + color1 = np.array([[255, 0, 0]], dtype=np.uint8) + color2 = np.array([[0, 255, 0]], dtype=np.uint8) + color1 = np.tile(color1, (pcd1.shape[0], 1)) + color2 = np.tile(color2, (pcd2.shape[0], 1)) + vertex_colors = np.concatenate([color1, color2], axis=0) + + vertexs = np.array([tuple(v) for v in vertexs], + dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')]) + vertex_colors = np.array([tuple(v) for v in vertex_colors], + dtype=[('red', 'u1'), ('green', 'u1'), + ('blue', 'u1')]) + + vertex_all = np.empty( + len(vertexs), vertexs.dtype.descr + vertex_colors.dtype.descr) + for prop in vertexs.dtype.names: + vertex_all[prop] = vertexs[prop] + for prop in vertex_colors.dtype.names: + vertex_all[prop] = vertex_colors[prop] + + el = PlyElement.describe(vertex_all, 'vertex') + ply_data = PlyData([el]) + # .write(save_name) + return ply_data + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + flow = results[OutputKeys.OUTPUT] + + pcd1 = inputs['pcd1_ori'] + pcd2 = inputs['pcd2_ori'] + if isinstance(pcd1, torch.Tensor): + pcd1 = pcd1.cpu().numpy() + if isinstance(pcd2, torch.Tensor): + pcd2 = pcd2.cpu().numpy() + if isinstance(flow, torch.Tensor): + flow = flow.cpu().numpy() + + outputs = { + OutputKeys.OUTPUT: flow, + OutputKeys.PCD12: self.save_ply_data(pcd1, pcd2), + OutputKeys.PCD12_ALIGN: 
self.save_ply_data(pcd1 + flow, pcd2), + } + return outputs diff --git a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py index f0ec07bb..49897281 100644 --- a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py +++ b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py @@ -2,6 +2,7 @@ # originally Apache 2.0 License and publicly avaialbe at https://github.com/mttr2021/MTTR # Copyright (c) Alibaba, Inc. and its affiliates. +import tempfile from typing import Any, Dict import numpy as np @@ -33,6 +34,7 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): Args: model: model id on modelscope hub + render: whether to generate output video for demo service, default: False """ _device = kwargs.pop('device', 'gpu') if torch.cuda.is_available() and _device == 'gpu': @@ -123,7 +125,11 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): pred_masks_per_query.append(pred_masks) return pred_masks_per_query - def postprocess(self, inputs) -> Dict[str, Any]: + def postprocess(self, inputs, **kwargs) -> Dict[str, Any]: + output_clip_path = None + render = kwargs.get('render', False) + if render: + self.model.cfg.pipeline.save_masked_video = True if self.model.cfg.pipeline.save_masked_video: # RGB colors for instance masks: light_blue = (41, 171, 226) @@ -182,8 +188,9 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): masked_video.append(np.array(vid_frame)) # generate and save the output clip: - assert self.model.cfg.pipeline.output_path - output_clip_path = self.model.cfg.pipeline.output_path + output_clip_path = self.model.cfg.pipeline.get( + 'output_path', + tempfile.NamedTemporaryFile(suffix='.mp4').name) clip = ImageSequenceClip( sequence=masked_video, fps=self.meta['video_fps']) @@ -207,8 +214,9 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline): for frame_idx in range(self.video.shape[0]): output_timestamps.append(timestamp_format(seconds=frame_idx / fps)) result = { - OutputKeys.MASKS: masks, - OutputKeys.TIMESTAMPS: output_timestamps + OutputKeys.MASKS: None if render else masks, + OutputKeys.TIMESTAMPS: None if render else output_timestamps, + OutputKeys.OUTPUT_VIDEO: output_clip_path } return result diff --git a/modelscope/pipelines/cv/tinynas_detection_pipeline.py b/modelscope/pipelines/cv/tinynas_detection_pipeline.py index d35d4d36..c897af4d 100644 --- a/modelscope/pipelines/cv/tinynas_detection_pipeline.py +++ b/modelscope/pipelines/cv/tinynas_detection_pipeline.py @@ -1,16 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
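With the `render` switch added to the referring video object segmentation pipeline above, the result dict either carries the mask and timestamp lists or, when `render=True`, only the rendered clip path. A small consumer sketch; `handle_result` is illustrative and not part of the diff:

```python
from modelscope.outputs import OutputKeys

def handle_result(result: dict) -> dict:
    # render=True suppresses MASKS and TIMESTAMPS and fills OUTPUT_VIDEO with
    # the path of the generated clip, so branch on what is present.
    if result[OutputKeys.MASKS] is None:
        return {'video': result[OutputKeys.OUTPUT_VIDEO]}
    return {
        'masks': result[OutputKeys.MASKS],
        'timestamps': result[OutputKeys.TIMESTAMPS],
        'video': result.get(OutputKeys.OUTPUT_VIDEO),
    }
```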
-from typing import Any, Dict - -import cv2 -import numpy as np -import torch +from typing import Any, Dict, Optional, Union from modelscope.metainfo import Pipelines from modelscope.outputs import OutputKeys +from modelscope.outputs.cv_outputs import DetectionOutput from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import LoadImage +from modelscope.preprocessors import LoadImage, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.cv.image_utils import \ show_image_object_detection_auto_result @@ -19,40 +16,54 @@ from modelscope.utils.logger import get_logger logger = get_logger() +@PIPELINES.register_module( + Tasks.domain_specific_object_detection, + module_name=Pipelines.tinynas_detection) @PIPELINES.register_module( Tasks.image_object_detection, module_name=Pipelines.tinynas_detection) class TinynasDetectionPipeline(Pipeline): - def __init__(self, model: str, **kwargs): + def __init__(self, + model: str, + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """Object detection pipeline, currently only for the tinynas-detection model. + + Args: + model: A str format model id or model local dir to build the model instance from. + preprocessor: A preprocessor instance to preprocess the data, if None, + the pipeline will try to build the preprocessor according to the configuration.json file. + kwargs: The args needed by the `Pipeline` class. """ - model: model id on modelscope hub. - """ - super().__init__(model=model, auto_collate=False, **kwargs) - if torch.cuda.is_available(): - self.device = 'cuda' - else: - self.device = 'cpu' - self.model.to(self.device) - self.model.eval() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) def preprocess(self, input: Input) -> Dict[str, Any]: - img = LoadImage.convert_to_ndarray(input) - self.img = img - img = img.astype(np.float) - img = self.model.preprocess(img) - result = {'img': img.to(self.device)} - return result + return super().preprocess(img) - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + def forward( + self, input: Dict[str, + Any]) -> Union[Dict[str, Any], DetectionOutput]: + """The forward method of this pipeline. - outputs = self.model.inference(input['img']) - result = {'data': outputs} - return result + Args: + input: The input data output from the `preprocess` procedure. - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + Returns: + A model output, either in a dict format, or in a standard `DetectionOutput` dataclass. + If outputs a dict, these keys are needed: + class_ids (`Tensor`, *optional*): class id for each object. + boxes (`Tensor`, *optional*): Bounding box for each detected object + in [left, top, right, bottom] format. + scores (`Tensor`, *optional*): Detection score for each object. + """ + return self.model(input['img']) - bboxes, scores, labels = self.model.postprocess(inputs['data']) + def postprocess( + self, inputs: Union[Dict[str, Any], + DetectionOutput]) -> Dict[str, Any]: + bboxes, scores, labels = inputs['boxes'], inputs['scores'], inputs[ + 'class_ids'] if bboxes is None: outputs = { OutputKeys.SCORES: [], diff --git a/modelscope/pipelines/cv/video_depth_estimation_pipeline.py b/modelscope/pipelines/cv/video_depth_estimation_pipeline.py new file mode 100644 index 00000000..d862c40d --- /dev/null +++ b/modelscope/pipelines/cv/video_depth_estimation_pipeline.py @@ -0,0 +1,50 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
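The reworked detection pipeline above documents its `forward` output as either a dict or a `DetectionOutput` carrying `class_ids`, `boxes` in `[left, top, right, bottom]` order, and `scores`, and `postprocess` reads all three by key. A small consumer sketch built on that contract; the `summarize` helper is illustrative, not part of the diff:

```python
def summarize(det) -> list:
    """Flatten a forward result (dict or DetectionOutput) into plain records."""
    boxes, scores, labels = det['boxes'], det['scores'], det['class_ids']
    if boxes is None:                      # treated above as "no detections"
        return []
    return [
        {'box': [float(v) for v in box],   # [left, top, right, bottom]
         'score': float(score),
         'class_id': int(label)}
        for box, score, label in zip(boxes, scores, labels)
    ]
```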
+from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_depth_estimation, module_name=Pipelines.video_depth_estimation) +class VideoDepthEstimationPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a video depth estimation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + logger.info('depth estimation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + video_path = input + data = {'video_path': video_path} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + depths = results['depths'] + depths_color = results['depths_color'] + poses = results['poses'] + + outputs = { + OutputKeys.DEPTHS: depths, + OutputKeys.DEPTHS_COLOR: depths_color, + OutputKeys.POSES: poses + } + + return outputs diff --git a/modelscope/pipelines/cv/video_frame_interpolation_pipeline.py b/modelscope/pipelines/cv/video_frame_interpolation_pipeline.py new file mode 100644 index 00000000..d241b00a --- /dev/null +++ b/modelscope/pipelines/cv/video_frame_interpolation_pipeline.py @@ -0,0 +1,602 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import math +import os +import os.path as osp +import subprocess +import tempfile +from typing import Any, Dict, Optional, Union + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from torchvision.utils import make_grid + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_frame_interpolation.utils.scene_change_detection import \ + do_scene_detect +from modelscope.models.cv.video_frame_interpolation.VFINet_for_video_frame_interpolation import \ + VFINetForVideoFrameInterpolation +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.preprocessors.cv import VideoReader +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +VIDEO_EXTENSIONS = ('.mp4', '.mov') +logger = get_logger() + + +def img_trans(img_tensor): # in format of RGB + img_tensor = img_tensor / 255.0 + mean = torch.Tensor([0.429, 0.431, 0.397]).view(1, 3, 1, + 1).type_as(img_tensor) + img_tensor -= mean + return img_tensor + + +def add_mean(x): + mean = torch.Tensor([0.429, 0.431, 0.397]).view(1, 3, 1, 1).type_as(x) + return x + mean + + +def img_padding(img_tensor, height, width, pad_num=32): + ph = ((height - 1) // pad_num + 1) * pad_num + pw = ((width - 1) // pad_num + 1) * pad_num + padding = (0, pw - width, 0, ph - height) + img_tensor = F.pad(img_tensor, padding) + return img_tensor + + +def do_inference_lowers(flow_10, + flow_12, + flow_21, + flow_23, + img1, + img2, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=False): + # given frame1, frame2 and optical flow, predict frame_t + if 
start_end_flag: + read_count -= 1 + else: + read_count -= 2 + while inter_count <= read_count: + t = inter_count + 1 - read_count + t = round(t, 2) + if (t - 0) < delta / 2: + output = img1 + elif (1 - t) < delta / 2: + output = img2 + else: + output = inter_model(flow_10, flow_12, flow_21, flow_23, img1, + img2, t) + + output = 255 * add_mean(output) + outputs.append(output) + inter_count += delta + + return outputs, inter_count + + +def do_inference_highers(flow_10, + flow_12, + flow_21, + flow_23, + img1, + img2, + img1_up, + img2_up, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=False): + # given frame1, frame2 and optical flow, predict frame_t. For videos with a resolution of 2k and above + if start_end_flag: + read_count -= 1 + else: + read_count -= 2 + while inter_count <= read_count: + t = inter_count + 1 - read_count + t = round(t, 2) + if (t - 0) < delta / 2: + output = img1_up + elif (1 - t) < delta / 2: + output = img2_up + else: + output = inter_model(flow_10, flow_12, flow_21, flow_23, img1, + img2, img1_up, img2_up, t) + + output = 255 * add_mean(output) + outputs.append(output) + inter_count += delta + + return outputs, inter_count + + +def inference_lowers(flow_model, refine_model, inter_model, video_len, + read_count, inter_count, delta, scene_change_flag, + img_tensor_list, img_ori_list, inputs, outputs): + # given a video with a resolution less than 2k and output fps, execute the video frame interpolation function. + height, width = inputs[read_count].size(2), inputs[read_count].size(3) + # We use four consecutive frames to do frame interpolation. flow_10 represents + # optical flow from frame0 to frame1. The similar goes for flow_12, flow_21 and + # flow_23. + flow_10 = None + flow_12 = None + flow_21 = None + flow_23 = None + with torch.no_grad(): + while (read_count < video_len): + img = inputs[read_count] + img = img_padding(img, height, width) + img_ori_list.append(img) + img_tensor_list.append(img_trans(img)) + read_count += 1 + if len(img_tensor_list) == 2: + img0 = img_tensor_list[0] + img1 = img_tensor_list[1] + img0_ori = img_ori_list[0] + img1_ori = img_ori_list[1] + _, flow_01_up = flow_model( + img0_ori, img1_ori, iters=12, test_mode=True) + _, flow_10_up = flow_model( + img1_ori, img0_ori, iters=12, test_mode=True) + flow_01, flow_10 = refine_model(img0, img1, flow_01_up, + flow_10_up, 2) + scene_change_flag[0] = do_scene_detect( + flow_01[:, :, 0:height, 0:width], flow_10[:, :, 0:height, + 0:width], + img_ori_list[0][:, :, 0:height, 0:width], + img_ori_list[1][:, :, 0:height, 0:width]) + if scene_change_flag[0]: + outputs, inter_count = do_inference_lowers( + None, + None, + None, + None, + img0, + img1, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + else: + outputs, inter_count = do_inference_lowers( + None, + flow_01, + flow_10, + None, + img0, + img1, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + + if len(img_tensor_list) == 4: + if flow_12 is None or flow_21 is None: + img2 = img_tensor_list[2] + img2_ori = img_ori_list[2] + _, flow_12_up = flow_model( + img1_ori, img2_ori, iters=12, test_mode=True) + _, flow_21_up = flow_model( + img2_ori, img1_ori, iters=12, test_mode=True) + flow_12, flow_21 = refine_model(img1, img2, flow_12_up, + flow_21_up, 2) + scene_change_flag[1] = do_scene_detect( + flow_12[:, :, 0:height, + 0:width], flow_21[:, :, 0:height, 0:width], + img_ori_list[1][:, :, 0:height, 0:width], + img_ori_list[2][:, :, 
0:height, 0:width]) + + img3 = img_tensor_list[3] + img3_ori = img_ori_list[3] + _, flow_23_up = flow_model( + img2_ori, img3_ori, iters=12, test_mode=True) + _, flow_32_up = flow_model( + img3_ori, img2_ori, iters=12, test_mode=True) + flow_23, flow_32 = refine_model(img2, img3, flow_23_up, + flow_32_up, 2) + scene_change_flag[2] = do_scene_detect( + flow_23[:, :, 0:height, 0:width], flow_32[:, :, 0:height, + 0:width], + img_ori_list[2][:, :, 0:height, 0:width], + img_ori_list[3][:, :, 0:height, 0:width]) + + if scene_change_flag[1]: + outputs, inter_count = do_inference_lowers( + None, None, None, None, img1, img2, inter_model, + read_count, inter_count, delta, outputs) + elif scene_change_flag[0] or scene_change_flag[2]: + outputs, inter_count = do_inference_lowers( + None, flow_12, flow_21, None, img1, img2, inter_model, + read_count, inter_count, delta, outputs) + else: + outputs, inter_count = do_inference_lowers( + flow_10_up, flow_12, flow_21, flow_23_up, img1, img2, + inter_model, read_count, inter_count, delta, outputs) + + img_tensor_list.pop(0) + img_ori_list.pop(0) + + # for next group + img1 = img2 + img2 = img3 + img1_ori = img2_ori + img2_ori = img3_ori + flow_10 = flow_21 + flow_12 = flow_23 + flow_21 = flow_32 + + flow_10_up = flow_21_up + flow_12_up = flow_23_up + flow_21_up = flow_32_up + + # save scene change flag for next group + scene_change_flag[0] = scene_change_flag[1] + scene_change_flag[1] = scene_change_flag[2] + scene_change_flag[2] = False + + if read_count > 0: # the last remaining 3 images + img_ori_list.pop(0) + img_tensor_list.pop(0) + assert (len(img_tensor_list) == 2) + + if scene_change_flag[1]: + outputs, inter_count = do_inference_lowers( + None, + None, + None, + None, + img1, + img2, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + else: + outputs, inter_count = do_inference_lowers( + None, + flow_12, + flow_21, + None, + img1, + img2, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + + return outputs + + +def inference_highers(flow_model, refine_model, inter_model, video_len, + read_count, inter_count, delta, scene_change_flag, + img_tensor_list, img_ori_list, inputs, outputs): + # given a video with a resolution of 2k or above and output fps, execute the video frame interpolation function. + if inputs[read_count].size(2) % 2 != 0 or inputs[read_count].size( + 3) % 2 != 0: + raise RuntimeError('Video width and height must be even') + + height, width = inputs[read_count].size(2) // 2, inputs[read_count].size( + 3) // 2 + # We use four consecutive frames to do frame interpolation. flow_10 represents + # optical flow from frame0 to frame1. The similar goes for flow_12, flow_21 and + # flow_23. 
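Both interpolation drivers share the timestamp bookkeeping described in the comment blocks above: output timestamps advance by `delta = fps / out_fps`, and a timestamp that lands within `delta / 2` of an input frame simply reuses that frame instead of invoking the interpolation network. A pure-Python sketch of one frame window; the counter values are representative, not taken from a real run:

```python
# Sampling rule mirrored from do_inference_lowers / do_inference_highers.
fps, out_fps = 25.0, 50.0
delta = fps / out_fps          # 0.5 -> doubles the frame rate
read_count = 4 - 2             # window between frame1 and frame2 (4 frames read)
inter_count = 1.5              # next output timestamp carried over from the previous window
while inter_count <= read_count:
    t = round(inter_count + 1 - read_count, 2)
    if t < delta / 2:
        print('reuse frame1')
    elif (1 - t) < delta / 2:
        print('reuse frame2')
    else:
        print(f'interpolate at t={t}')
    inter_count += delta
# -> interpolate at t=0.5, then reuse frame2
```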
+ flow_10 = None + flow_12 = None + flow_21 = None + flow_23 = None + img_up_list = [] + with torch.no_grad(): + while (read_count < video_len): + img_up = inputs[read_count] + img_up = img_padding(img_up, height * 2, width * 2, pad_num=64) + img = F.interpolate( + img_up, scale_factor=0.5, mode='bilinear', align_corners=False) + + img_up_list.append(img_trans(img_up)) + img_ori_list.append(img) + img_tensor_list.append(img_trans(img)) + read_count += 1 + if len(img_tensor_list) == 2: + img0 = img_tensor_list[0] + img1 = img_tensor_list[1] + img0_ori = img_ori_list[0] + img1_ori = img_ori_list[1] + img0_up = img_up_list[0] + img1_up = img_up_list[1] + _, flow_01_up = flow_model( + img0_ori, img1_ori, iters=12, test_mode=True) + _, flow_10_up = flow_model( + img1_ori, img0_ori, iters=12, test_mode=True) + flow_01, flow_10 = refine_model(img0, img1, flow_01_up, + flow_10_up, 2) + scene_change_flag[0] = do_scene_detect( + flow_01[:, :, 0:height, 0:width], flow_10[:, :, 0:height, + 0:width], + img_ori_list[0][:, :, 0:height, 0:width], + img_ori_list[1][:, :, 0:height, 0:width]) + if scene_change_flag[0]: + outputs, inter_count = do_inference_highers( + None, + None, + None, + None, + img0, + img1, + img0_up, + img1_up, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + else: + outputs, inter_count = do_inference_highers( + None, + flow_01, + flow_10, + None, + img0, + img1, + img0_up, + img1_up, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + + if len(img_tensor_list) == 4: + if flow_12 is None or flow_21 is None: + img2 = img_tensor_list[2] + img2_ori = img_ori_list[2] + img2_up = img_up_list[2] + _, flow_12_up = flow_model( + img1_ori, img2_ori, iters=12, test_mode=True) + _, flow_21_up = flow_model( + img2_ori, img1_ori, iters=12, test_mode=True) + flow_12, flow_21 = refine_model(img1, img2, flow_12_up, + flow_21_up, 2) + scene_change_flag[1] = do_scene_detect( + flow_12[:, :, 0:height, + 0:width], flow_21[:, :, 0:height, 0:width], + img_ori_list[1][:, :, 0:height, 0:width], + img_ori_list[2][:, :, 0:height, 0:width]) + + img3 = img_tensor_list[3] + img3_ori = img_ori_list[3] + img3_up = img_up_list[3] + _, flow_23_up = flow_model( + img2_ori, img3_ori, iters=12, test_mode=True) + _, flow_32_up = flow_model( + img3_ori, img2_ori, iters=12, test_mode=True) + flow_23, flow_32 = refine_model(img2, img3, flow_23_up, + flow_32_up, 2) + scene_change_flag[2] = do_scene_detect( + flow_23[:, :, 0:height, 0:width], flow_32[:, :, 0:height, + 0:width], + img_ori_list[2][:, :, 0:height, 0:width], + img_ori_list[3][:, :, 0:height, 0:width]) + + if scene_change_flag[1]: + outputs, inter_count = do_inference_highers( + None, None, None, None, img1, img2, img1_up, img2_up, + inter_model, read_count, inter_count, delta, outputs) + elif scene_change_flag[0] or scene_change_flag[2]: + outputs, inter_count = do_inference_highers( + None, flow_12, flow_21, None, img1, img2, img1_up, + img2_up, inter_model, read_count, inter_count, delta, + outputs) + else: + outputs, inter_count = do_inference_highers( + flow_10_up, flow_12, flow_21, flow_23_up, img1, img2, + img1_up, img2_up, inter_model, read_count, inter_count, + delta, outputs) + + img_up_list.pop(0) + img_tensor_list.pop(0) + img_ori_list.pop(0) + + # for next group + img1 = img2 + img2 = img3 + img1_ori = img2_ori + img2_ori = img3_ori + img1_up = img2_up + img2_up = img3_up + flow_10 = flow_21 + flow_12 = flow_23 + flow_21 = flow_32 + + flow_10_up = flow_21_up + flow_12_up = 
flow_23_up + flow_21_up = flow_32_up + + # save scene change flag for next group + scene_change_flag[0] = scene_change_flag[1] + scene_change_flag[1] = scene_change_flag[2] + scene_change_flag[2] = False + + if read_count > 0: # the last remaining 3 images + img_ori_list.pop(0) + img_tensor_list.pop(0) + assert (len(img_tensor_list) == 2) + + if scene_change_flag[1]: + outputs, inter_count = do_inference_highers( + None, + None, + None, + None, + img1, + img2, + img1_up, + img2_up, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + else: + outputs, inter_count = do_inference_highers( + None, + flow_12, + flow_21, + None, + img1, + img2, + img1_up, + img2_up, + inter_model, + read_count, + inter_count, + delta, + outputs, + start_end_flag=True) + + return outputs + + +def convert(param): + return { + k.replace('module.', ''): v + for k, v in param.items() if 'module.' in k + } + + +__all__ = ['VideoFrameInterpolationPipeline'] + + +@PIPELINES.register_module( + Tasks.video_frame_interpolation, + module_name=Pipelines.video_frame_interpolation) +class VideoFrameInterpolationPipeline(Pipeline): + """ Video Frame Interpolation Pipeline. + Example: + ```python + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from modelscope.outputs import OutputKeys + + >>> video = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/video_frame_interpolation_test.mp4' + >>> video_frame_interpolation_pipeline = pipeline(Tasks.video_frame_interpolation, + 'damo/cv_raft_video-frame-interpolation') + >>> result = video_frame_interpolation_pipeline(video)[OutputKeys.OUTPUT_VIDEO] + >>> print('pipeline: the output video path is {}'.format(result)) + ``` + """ + + def __init__(self, + model: Union[VFINetForVideoFrameInterpolation, str], + preprocessor=None, + **kwargs): + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + self.net = self.model.model + self.net.to(self._device) + self.net.eval() + logger.info('load video frame-interpolation done') + + def preprocess(self, input: Input, out_fps: float = 0) -> Dict[str, Any]: + # input is a video file + video_reader = VideoReader(input) + inputs = [] + for frame in video_reader: + inputs.append(frame) + fps = video_reader.fps + + for i, img in enumerate(inputs): + img = torch.from_numpy(img.copy()).permute(2, 0, 1).float() + inputs[i] = img.unsqueeze(0) + + if out_fps == 0: + out_fps = 2 * fps + return {'video': inputs, 'fps': fps, 'out_fps': out_fps} + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + inputs = input['video'] + fps = input['fps'] + out_fps = input['out_fps'] + video_len = len(inputs) + + flow_model = self.net.flownet + refine_model = self.net.internet.ifnet + + read_count = 0 + inter_count = 0 + delta = fps / out_fps + scene_change_flag = [False, False, False] + img_tensor_list = [] + img_ori_list = [] + outputs = [] + height, width = inputs[read_count].size(2), inputs[read_count].size(3) + if height >= 1440 or width >= 2560: + inter_model = self.net.internet_Ds.internet + outputs = inference_highers(flow_model, refine_model, inter_model, + video_len, read_count, inter_count, + delta, scene_change_flag, + img_tensor_list, img_ori_list, inputs, + outputs) + else: + inter_model = self.net.internet.internet + outputs = inference_lowers(flow_model, refine_model, inter_model, + video_len, read_count, inter_count, + 
delta, scene_change_flag, + img_tensor_list, img_ori_list, inputs, + outputs) + + for i in range(len(outputs)): + outputs[i] = outputs[i][:, :, 0:height, 0:width] + return {'output': outputs, 'fps': out_fps} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + output_video_path = kwargs.get('output_video', None) + demo_service = kwargs.get('demo_service', True) + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + h, w = inputs['output'][0].shape[-2:] + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter(output_video_path, fourcc, + inputs['fps'], (w, h)) + for i in range(len(inputs['output'])): + img = inputs['output'][i] + img = img[0].permute(1, 2, 0).byte().cpu().numpy() + video_writer.write(img.astype(np.uint8)) + + video_writer.release() + if demo_service: + assert os.system( + 'ffmpeg -version') == 0, 'ffmpeg is not installed correctly!' + output_video_path_for_web = output_video_path[:-4] + '_web.mp4' + convert_cmd = f'ffmpeg -i {output_video_path} -vcodec h264 -crf 5 {output_video_path_for_web}' + subprocess.call(convert_cmd, shell=True) + return {OutputKeys.OUTPUT_VIDEO: output_video_path_for_web} + else: + return {OutputKeys.OUTPUT_VIDEO: output_video_path} diff --git a/modelscope/pipelines/cv/video_multi_object_tracking_pipeline.py b/modelscope/pipelines/cv/video_multi_object_tracking_pipeline.py new file mode 100644 index 00000000..0f02413c --- /dev/null +++ b/modelscope/pipelines/cv/video_multi_object_tracking_pipeline.py @@ -0,0 +1,81 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_multi_object_tracking.tracker.multitracker import \ + JDETracker +from modelscope.models.cv.video_multi_object_tracking.utils.utils import ( + LoadVideo, cfg_opt) +from modelscope.models.cv.video_single_object_tracking.utils.utils import \ + timestamp_format +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_multi_object_tracking, + module_name=Pipelines.video_multi_object_tracking) +class VideoMultiObjectTrackingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a multi object tracking pipeline + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) + logger.info(f'loading model from {ckpt_path}') + opt = cfg_opt() + self.opt = opt + self.tracker = JDETracker(opt, ckpt_path, self.device) + logger.info('init tracker done') + + def preprocess(self, input) -> Input: + self.video_path = input[0] + return input + + def forward(self, input: Input) -> Dict[str, Any]: + dataloader = LoadVideo(input, self.opt.img_size) + self.tracker.set_buffer_len(dataloader.frame_rate) + + results = [] + output_timestamps = [] + frame_id = 0 + for i, (path, img, img0) in enumerate(dataloader): + output_timestamps.append( + timestamp_format(seconds=frame_id / dataloader.frame_rate)) + blob = torch.from_numpy(img).unsqueeze(0) + online_targets = self.tracker.update(blob, img0) + online_tlwhs = [] + online_ids = [] + for t in online_targets: + tlwh = t.tlwh + tid = t.track_id + vertical = tlwh[2] / tlwh[3] > 1.6 + if tlwh[2] * tlwh[3] > self.opt.min_box_area and not vertical: + online_tlwhs.append([ + tlwh[0], tlwh[1], tlwh[0] + tlwh[2], tlwh[1] + tlwh[3] + ]) + online_ids.append(tid) + results.append([ + frame_id + 1, tid, tlwh[0], tlwh[1], tlwh[0] + tlwh[2], + tlwh[1] + tlwh[3] + ]) + frame_id += 1 + + return { + OutputKeys.BOXES: results, + OutputKeys.TIMESTAMPS: output_timestamps + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/video_stabilization_pipeline.py b/modelscope/pipelines/cv/video_stabilization_pipeline.py new file mode 100644 index 00000000..c0b50a04 --- /dev/null +++ b/modelscope/pipelines/cv/video_stabilization_pipeline.py @@ -0,0 +1,124 @@ +# Modified from https://github.com/Annbless/DUTCode +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import math +import os +import subprocess +import tempfile +from typing import Any, Dict, Optional, Union + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.metrics.video_stabilization_metric import warpprocess +from modelscope.models.cv.video_stabilization.DUTRAFTStabilizer import \ + DUTRAFTStabilizer +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.preprocessors.cv import VideoReader +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): + if not osp.isfile(filename): + raise FileNotFoundError(msg_tmpl.format(filename)) + + +__all__ = ['VideoStabilizationPipeline'] + + +@PIPELINES.register_module( + Tasks.video_stabilization, module_name=Pipelines.video_stabilization) +class VideoStabilizationPipeline(Pipeline): + """ Video Stabilization Pipeline. 
+ + Example: + + ```python + >>> import cv2 + >>> from modelscope.outputs import OutputKeys + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + + >>> test_video = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/video_stabilization_test_video.avi' + >>> video_stabilization = pipeline(Tasks.video_stabilization, model='damo/cv_dut-raft_video-stabilization_base') + >>> out_video_path = video_stabilization(test_video)[OutputKeys.OUTPUT_VIDEO] + >>> print('Pipeline: the output video path is {}'.format(out_video_path)) + ``` + """ + + def __init__(self, + model: Union[DUTRAFTStabilizer, str], + preprocessor=None, + **kwargs): + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + logger.info('load video stabilization model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + # read video + video_reader = VideoReader(input) + fps = video_reader.fps + width = video_reader.width + height = video_reader.height + + return { + 'vid_path': input, + 'fps': fps, + 'width': width, + 'height': height + } + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model._inference_forward(input['vid_path']) + results = warpprocess(results) + out_images = results['output'] + out_images = out_images.numpy().astype(np.uint8) + out_images = [ + np.transpose(out_images[idx], (1, 2, 0)) + for idx in range(out_images.shape[0]) + ] + base_crop_width = results['base_crop_width'] + + return { + 'output': out_images, + 'fps': input['fps'], + 'base_crop_width': base_crop_width + } + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + output_video_path = kwargs.get('output_video', None) + is_cvt_h264 = kwargs.get('is_cvt_h264', False) + + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + h, w = inputs['output'][0].shape[-3:-1] + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter(output_video_path, fourcc, + inputs['fps'], (w, h)) + for idx, frame in enumerate(inputs['output']): + horizontal_border = int(inputs['base_crop_width'] * w / 1280) + vertical_border = int(horizontal_border * h / w) + new_frame = frame[vertical_border:-vertical_border, + horizontal_border:-horizontal_border] + new_frame = cv2.resize(new_frame, (w, h)) + video_writer.write(new_frame) + video_writer.release() + + if is_cvt_h264: + assert os.system( + 'ffmpeg -version' + ) == 0, 'ffmpeg is not installed correctly, please refer to https://trac.ffmpeg.org/wiki/CompilationGuide.' 
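# Illustrative aside (not part of this diff): the crop arithmetic applied above to
# each stabilized frame, worked through with made-up numbers (1920x1080 frame and a
# hypothetical base_crop_width of 64).
h_demo, w_demo = 1080, 1920                                            # hypothetical frame size
base_crop_width_demo = 64                                              # hypothetical model output
horizontal_border_demo = int(base_crop_width_demo * w_demo / 1280)     # 96 px trimmed per side
vertical_border_demo = int(horizontal_border_demo * h_demo / w_demo)   # 54 px trimmed top/bottom
# Each frame is cropped by these borders and then resized back to (w_demo, h_demo).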
+ output_video_path_for_web = output_video_path[:-4] + '_web.mp4' + convert_cmd = f'ffmpeg -i {output_video_path} -vcodec h264 -crf 5 {output_video_path_for_web}' + subprocess.call(convert_cmd, shell=True) + return {OutputKeys.OUTPUT_VIDEO: output_video_path_for_web} + else: + return {OutputKeys.OUTPUT_VIDEO: output_video_path} diff --git a/modelscope/pipelines/cv/video_super_resolution_pipeline.py b/modelscope/pipelines/cv/video_super_resolution_pipeline.py new file mode 100644 index 00000000..87b73346 --- /dev/null +++ b/modelscope/pipelines/cv/video_super_resolution_pipeline.py @@ -0,0 +1,167 @@ +# The implementation here is modified based on RealBasicVSR, +# originally Apache 2.0 License and publicly avaialbe at +# https://github.com/ckkelvinchan/RealBasicVSR/blob/master/inference_realbasicvsr.py +import math +import os +import subprocess +import tempfile +from typing import Any, Dict, Optional, Union + +import cv2 +import numpy as np +import torch +from torchvision.utils import make_grid + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_super_resolution import \ + RealBasicVSRNetForVideoSR +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.cv import VideoReader +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +VIDEO_EXTENSIONS = ('.mp4', '.mov') + +logger = get_logger() + + +def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)): + """Convert torch Tensors into image numpy arrays. + After clamping to (min, max), image values will be normalized to [0, 1]. + For different tensor shapes, this function will have different behaviors: + 1. 4D mini-batch Tensor of shape (N x 3/1 x H x W): + Use `make_grid` to stitch images in the batch dimension, and then + convert it to numpy array. + 2. 3D Tensor of shape (3/1 x H x W) and 2D Tensor of shape (H x W): + Directly change to numpy array. + Note that the image channel in input tensors should be RGB order. This + function will convert it to cv2 convention, i.e., (H x W x C) with BGR + order. + Args: + tensor (Tensor | list[Tensor]): Input tensors. + out_type (numpy type): Output types. If ``np.uint8``, transform outputs + to uint8 type with range [0, 255]; otherwise, float type with + range [0, 1]. Default: ``np.uint8``. + min_max (tuple): min and max values for clamp. + Returns: + (Tensor | list[Tensor]): 3D ndarray of shape (H x W x C) or 2D ndarray + of shape (H x W). + """ + condition = torch.is_tensor(tensor) or (isinstance(tensor, list) and all( + torch.is_tensor(t) for t in tensor)) + if not condition: + raise TypeError( + f'tensor or list of tensors expected, got {type(tensor)}') + + if torch.is_tensor(tensor): + tensor = [tensor] + result = [] + for _tensor in tensor: + # Squeeze two times so that: + # 1. (1, 1, h, w) -> (h, w) or + # 3. (1, 3, h, w) -> (3, h, w) or + # 2. 
(n>1, 3/1, h, w) -> (n>1, 3/1, h, w) + _tensor = _tensor.squeeze(0).squeeze(0) + _tensor = _tensor.float().detach().cpu().clamp_(*min_max) + _tensor = (_tensor - min_max[0]) / (min_max[1] - min_max[0]) + n_dim = _tensor.dim() + if n_dim == 4: + img_np = make_grid( + _tensor, nrow=int(math.sqrt(_tensor.size(0))), + normalize=False).numpy() + img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) + elif n_dim == 3: + img_np = _tensor.numpy() + img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) + elif n_dim == 2: + img_np = _tensor.numpy() + else: + raise ValueError('Only support 4D, 3D or 2D tensor. ' + f'But received with dimension: {n_dim}') + if out_type == np.uint8: + # Unlike MATLAB, numpy.unit8() WILL NOT round by default. + img_np = (img_np * 255.0).round() + img_np = img_np.astype(out_type) + result.append(img_np) + result = result[0] if len(result) == 1 else result + return result + + +@PIPELINES.register_module( + Tasks.video_super_resolution, module_name=Pipelines.video_super_resolution) +class VideoSuperResolutionPipeline(Pipeline): + + def __init__(self, + model: Union[RealBasicVSRNetForVideoSR, str], + preprocessor=None, + **kwargs): + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + self.config = self.model.config + + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + + logger.info('load video super-resolution model done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + # input is a video file + video_reader = VideoReader(input) + inputs = [] + for frame in video_reader: + inputs.append(np.flip(frame, axis=2)) + fps = video_reader.fps + + for i, img in enumerate(inputs): + img = torch.from_numpy(img / 255.).permute(2, 0, 1).float() + inputs[i] = img.unsqueeze(0) + inputs = torch.stack(inputs, dim=1) + return {'video': inputs, 'fps': fps} + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + inputs = input['video'] + with torch.no_grad(): + if isinstance(self.config.model.max_seq_len, int): + outputs = [] + for i in range(0, inputs.size(1), + self.config.model.max_seq_len): + imgs = inputs[:, + i:i + self.config.model.max_seq_len, :, :, :] + imgs = imgs.to(self._device) + outputs.append( + self.model._inference_forward(imgs)['output'].cpu()) + outputs = torch.cat(outputs, dim=1) + else: + inputs = inputs.to(self._device) + outputs = self.model._inference_forward(inputs)['output'].cpu() + return {'output': outputs, 'fps': input['fps']} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + output_video_path = kwargs.get('output_video', None) + demo_service = kwargs.get('demo_service', True) + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + + h, w = inputs['output'].shape[-2:] + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter(output_video_path, fourcc, + inputs['fps'], (w, h)) + for i in range(0, inputs['output'].size(1)): + img = tensor2img(inputs['output'][:, i, :, :, :]) + video_writer.write(img.astype(np.uint8)) + video_writer.release() + + if demo_service: + assert os.system( + 'ffmpeg -version' + ) == 0, 'ffmpeg is not installed correctly, please refer to https://trac.ffmpeg.org/wiki/CompilationGuide.' 
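# Illustrative aside (not part of this diff): typical use of the tensor2img helper
# defined above on a single super-resolved frame; the random tensor is demo data only.
frame_demo = torch.rand(1, 3, 256, 256)        # one RGB frame, values in [0, 1]
img_demo = tensor2img(frame_demo)              # -> (256, 256, 3) uint8 array in BGR order
assert img_demo.dtype == np.uint8 and img_demo.shape == (256, 256, 3)
# img_demo can be written out directly with cv2.VideoWriter.write(img_demo).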
+ output_video_path_for_web = output_video_path[:-4] + '_web.mp4' + convert_cmd = f'ffmpeg -i {output_video_path} -vcodec h264 -crf 5 {output_video_path_for_web}' + subprocess.call(convert_cmd, shell=True) + return {OutputKeys.OUTPUT_VIDEO: output_video_path_for_web} + else: + return {OutputKeys.OUTPUT_VIDEO: output_video_path} diff --git a/modelscope/pipelines/cv/vision_middleware_pipeline.py b/modelscope/pipelines/cv/vision_middleware_pipeline.py new file mode 100644 index 00000000..0f0b8dc6 --- /dev/null +++ b/modelscope/pipelines/cv/vision_middleware_pipeline.py @@ -0,0 +1,66 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +import math +import os.path as osp +from typing import Any, Dict + +import numpy as np +import torch +import torchvision.transforms as transforms +from mmcv.parallel import collate, scatter + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.vision_middleware import VisionMiddlewareModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_segmentation, + module_name=Pipelines.vision_middleware_multi_task) +class VisionMiddlewarePipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a vision middleware pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + self.model = self.model.cuda() + self.model.eval() + self.transform = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + def preprocess(self, input: Input) -> Dict[str, Any]: + + img = LoadImage.convert_to_img(input) + + data = self.transform(img) + data = collate([data], samples_per_gpu=1) + if next(self.model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [next(self.model.parameters()).device])[0] + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + with torch.no_grad(): + # currently only support one task in pipeline + results = self.model(input, task_name='seg-voc') + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/vop_retrieval_pipeline.py b/modelscope/pipelines/cv/vop_retrieval_pipeline.py new file mode 100644 index 00000000..f0907027 --- /dev/null +++ b/modelscope/pipelines/cv/vop_retrieval_pipeline.py @@ -0,0 +1,122 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
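For the VisionMiddlewarePipeline introduced just above, a minimal usage sketch follows. The model id and image path are placeholders (the diff registers the pipeline under Pipelines.vision_middleware_multi_task without naming a hub model), and forward() currently runs only the fixed 'seg-voc' task.

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Hypothetical model id and image path, for illustration only.
segmenter = pipeline(Tasks.image_segmentation,
                     model='damo/cv_vitb16_vision-middleware_multi-task')

result = segmenter('data/test/images/image_segmentation.jpg')
print(result)  # raw 'seg-voc' outputs; postprocess() returns them unchanged
```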
+ +import gzip +import math +import os +import os.path as osp +import pickle +import random +from collections import defaultdict, deque +from typing import Any, Dict + +import numpy as np +import torch +from tqdm import tqdm + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.models.cv.vop_retrieval import (LengthAdaptiveTokenizer, VoP, + init_transform_dict, load_data, + load_frames_from_video) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.vop_retrieval, module_name=Pipelines.vop_retrieval) +class VopRetrievalPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a vop pipeline for retrieval + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + # [from pretrain] load model + self.model = Model.from_pretrained('damo/cv_vit-b32_retrieval_vop').to( + self.device) + logger.info('load model done') + + # others: load transform + self.local_pth = model + self.cfg = Config.from_file(osp.join(model, ModelFile.CONFIGURATION)) + self.img_transform = init_transform_dict( + self.cfg.hyperparam.input_res)['clip_test'] + logger.info('load transform done') + + # others: load tokenizer + bpe_path = gzip.open(osp.join( + model, + 'bpe_simple_vocab_16e6.txt.gz')).read().decode('utf-8').split('\n') + self.tokenizer = LengthAdaptiveTokenizer(self.cfg.hyperparam, bpe_path) + logger.info('load tokenizer done') + + # others: load dataset + self.database = load_data( + osp.join(model, 'VoP_msrvtt9k_features.pkl'), self.device) + logger.info('load database done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + if '.mp4' in input: + query = [] + for video_path in [input]: + video_path = osp.join(self.local_pth, video_path) + imgs, idxs = load_frames_from_video( + video_path, self.cfg.hyperparam.num_frames, + self.cfg.hyperparam.video_sample_type) + imgs = self.img_transform(imgs) + query.append(imgs) + query = torch.stack( + query, dim=0).to( + self.device, non_blocking=True) + mode = 'v2t' + else: + query = self.tokenizer( + input, return_tensors='pt', padding=True, truncation=True) + if isinstance(query, torch.Tensor): + query = query.to(self.device, non_blocking=True) + else: + query = { + key: val.to(self.device, non_blocking=True) + for key, val in query.items() + } + mode = 't2v' + else: + raise TypeError(f'input should be a str,' + f' but got {type(input)}') + result = {'input_data': query, 'mode': mode} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + text_embeds, vid_embeds_pooled, vid_ids, texts = self.database + with torch.no_grad(): + if input['mode'] == 't2v': + query_feats = self.model.get_text_features(input['input_data']) + score = query_feats @ vid_embeds_pooled.T + retrieval_idxs = torch.topk( + score, k=self.cfg.hyperparam.topk, + dim=-1)[1].cpu().numpy() + res = np.array(vid_ids)[retrieval_idxs] + elif input['mode'] == 'v2t': + query_feats = self.model.get_video_features( + input['input_data']) + score = query_feats @ text_embeds.T + retrieval_idxs = torch.topk( + score, k=self.cfg.hyperparam.topk, + dim=-1)[1].cpu().numpy() + res = 
np.array(texts)[retrieval_idxs] + results = {'output_data': res, 'mode': input['mode']} + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/__init__.py b/modelscope/pipelines/multi_modal/__init__.py index d5c171a3..e8ca1a3c 100644 --- a/modelscope/pipelines/multi_modal/__init__.py +++ b/modelscope/pipelines/multi_modal/__init__.py @@ -14,7 +14,11 @@ if TYPE_CHECKING: VideoMultiModalEmbeddingPipeline from .visual_question_answering_pipeline import VisualQuestionAnsweringPipeline from .asr_pipeline import AutomaticSpeechRecognitionPipeline - + from .mgeo_ranking_pipeline import MGeoRankingPipeline + from .document_vl_embedding_pipeline import DocumentVLEmbeddingPipeline + from .video_captioning_pipeline import VideoCaptioningPipeline + from .video_question_answering_pipeline import VideoQuestionAnsweringPipeline + from .diffusers_wrapped import StableDiffusionWrapperPipeline, ChineseStableDiffusionPipeline else: _import_structure = { 'image_captioning_pipeline': ['ImageCaptioningPipeline'], @@ -29,6 +33,13 @@ else: 'generative_multi_modal_embedding_pipeline': ['GEMMMultiModalEmbeddingPipeline'], 'asr_pipeline': ['AutomaticSpeechRecognitionPipeline'], + 'mgeo_ranking_pipeline': ['MGeoRankingPipeline'], + 'document_vl_embedding_pipeline': ['DocumentVLEmbeddingPipeline'], + 'video_captioning_pipeline': ['VideoCaptioningPipeline'], + 'video_question_answering_pipeline': + ['VideoQuestionAnsweringPipeline'], + 'diffusers_wrapped': + ['StableDiffusionWrapperPipeline', 'ChineseStableDiffusionPipeline'] } import sys diff --git a/modelscope/pipelines/multi_modal/asr_pipeline.py b/modelscope/pipelines/multi_modal/asr_pipeline.py index 3cb7439c..590a2496 100644 --- a/modelscope/pipelines/multi_modal/asr_pipeline.py +++ b/modelscope/pipelines/multi_modal/asr_pipeline.py @@ -5,9 +5,9 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks -from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, Preprocessor) from modelscope.utils.constant import Tasks @@ -45,6 +45,12 @@ class AutomaticSpeechRecognitionPipeline(Pipeline): preprocessor = MPlugPreprocessor(pipe_model.model_dir) super().__init__(model=pipe_model, preprocessor=preprocessor, **kwargs) + def _batch(self, data): + if isinstance(self.model, OfaForAllTasks): + return batch_process(self.model, data) + else: + return super(AutomaticSpeechRecognitionPipeline, self)._batch(data) + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/__init__.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/__init__.py new file mode 100644 index 00000000..0c9fc5e8 --- /dev/null +++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
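A usage sketch for the VopRetrievalPipeline above. The model id is the one used in the pipeline's own Model.from_pretrained call; the query text and video filename are illustrative (video files are resolved relative to the model directory in preprocess). A plain string is tokenized and retrieved text-to-video ('t2v'), while an input containing '.mp4' is sampled into frames and retrieved video-to-text ('v2t').

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

vop = pipeline(Tasks.vop_retrieval, model='damo/cv_vit-b32_retrieval_vop')

t2v = vop('a man is cooking in the kitchen')   # text query -> top-k video ids
print(t2v['mode'], t2v['output_data'])

v2t = vop('video10.mp4')                       # video query -> top-k captions
print(v2t['mode'], v2t['output_data'])
```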
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .stable_diffusion import StableDiffusionWrapperPipeline + from .stable_diffusion import ChineseStableDiffusionPipeline +else: + _import_structure = { + 'stable_diffusion': + ['StableDiffusionWrapperPipeline', 'ChineseStableDiffusionPipeline'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/diffusers_pipeline.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/diffusers_pipeline.py new file mode 100644 index 00000000..efec87d1 --- /dev/null +++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/diffusers_pipeline.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict, Generator, List, Union + +from modelscope.pipelines.base import Input, Pipeline +from modelscope.utils.constant import Hubs +from modelscope.utils.device import create_device +from modelscope.utils.hub import snapshot_download + + +class DiffusersPipeline(Pipeline): + + def __init__(self, model: str, device: str = 'gpu', **kwargs): + """ + use `model` to create a diffusers pipeline + Args: + model: model id on modelscope hub. + device: str = 'gpu' + """ + + self.device_name = device + self.cfg = None + self.preprocessor = None + self.framework = None + self.device = create_device(self.device_name) + self.hubs = kwargs.get('hubs', Hubs.modelscope) + + # make sure we download the model from modelscope hub + model_folder = model + if not os.path.isdir(model_folder): + if self.hubs != Hubs.modelscope: + raise NotImplementedError( + 'Only support model retrieval from ModelScope hub for now.' + ) + model_folder = snapshot_download(model) + + self.model = model_folder + self.models = [self.model] + self.has_multiple_models = len(self.models) > 1 + + def preprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs + + def __call__(self, input: Union[Input, List[Input]], *args, + **kwargs) -> Union[Dict[str, Any], Generator]: + + return self.postprocess( + self.forward(self.preprocess(input), *args, **kwargs)) diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/__init__.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/__init__.py new file mode 100644 index 00000000..6892877a --- /dev/null +++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .stable_diffusion_pipeline import StableDiffusionWrapperPipeline + from .chinese_stable_diffusion_pipeline import ChineseStableDiffusionPipeline +else: + _import_structure = { + 'stable_diffusion_pipeline': ['StableDiffusionWrapperPipeline'], + 'chinese_stable_diffusion_pipeline': + ['ChineseStableDiffusionPipeline'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/chinese_stable_diffusion_pipeline.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/chinese_stable_diffusion_pipeline.py new file mode 100644 index 00000000..8bf25ba3 --- /dev/null +++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/chinese_stable_diffusion_pipeline.py @@ -0,0 +1,186 @@ +# Copyright 2022 The HuggingFace Team. +# Copyright 2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. + +# The implementation here is modified based on diffusers, +# originally Apache License, Copyright 2022 The HuggingFace Team, +# and publicly available at +# https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py + +from typing import Any, Dict, List, Union + +import torch +from diffusers import StableDiffusionPipeline +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker +from diffusers.schedulers import (DDIMScheduler, DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, LMSDiscreteScheduler, + PNDMScheduler) +from transformers import (ChineseCLIPProcessor, ChineseCLIPTextModel, + CLIPFeatureExtractor) + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.multi_modal.diffusers_wrapped.diffusers_pipeline import \ + DiffusersPipeline +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.text_to_image_synthesis, + module_name=Pipelines.chinese_stable_diffusion) +class ChineseStableDiffusionPipeline(DiffusersPipeline): + + def __init__(self, model: str, device: str = 'gpu', **kwargs): + """ + use `model` to create a stable diffusion pipeline + Args: + model: model id on modelscope hub. + device: str = 'gpu' + """ + super().__init__(model, device, **kwargs) + + torch_dtype = kwargs.get('torch_dtype', torch.float16) + self.pipeline = _DiffuersChineseStableDiffusionPipeline.from_pretrained( + model, torch_dtype=torch_dtype).to(self.device) + + def forward(self, prompt, **kwargs): + return self.pipeline(prompt, **kwargs) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return {OutputKeys.OUTPUT_IMG: inputs.images} + + +class _DiffuersChineseStableDiffusionPipeline(StableDiffusionPipeline): + r""" + Pipeline for text-to-image generation using Chinese Stable Diffusion. + + This model inherits from [`StableDiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`ChineseCLIPTextModel`]): + Frozen text-encoder. Chinese Stable Diffusion uses the text portion of [ChineseCLIP] + (https://huggingface.co/docs/transformers/main/en/model_doc/chinese_clip#transformers.ChineseCLIPTextModel), + specifically the [chinese-clip-vit-huge-patch14] + (https://huggingface.co/OFA-Sys/chinese-clip-vit-huge-patch14) variant. + tokenizer (`ChineseCLIPProcessor`): + Tokenizer of class + [ChineseCLIPProcessor](https://huggingface.co/docs/transformers/main/en/model_doc/chinese_clip#transformers.ChineseCLIPProcessor). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + _optional_components = ['safety_checker', 'feature_extractor'] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: ChineseCLIPTextModel, + tokenizer: ChineseCLIPProcessor, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, ], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + requires_safety_checker=requires_safety_checker) + + def _encode_prompt(self, prompt, device, num_images_per_prompt, + do_classifier_free_guidance, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `list(int)`): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). 
+ """ + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + text_inputs = self.tokenizer( + text=prompt, + padding='max_length', + truncation=True, + max_length=52, + return_tensors='pt') + text_inputs = {k: v.to(device) for k, v in text_inputs.items()} + text_embeddings = self.text_encoder(**text_inputs) + text_embeddings = text_embeddings[0] + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = text_embeddings.shape + text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1) + text_embeddings = text_embeddings.view( + bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [''] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f'`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=' + f' {type(prompt)}.') + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f'`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:' + f' {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches' + ' the batch size of `prompt`.') + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + text=uncond_tokens, + padding='max_length', + truncation=True, + max_length=52, + return_tensors='pt') + uncond_input = {k: v.to(device) for k, v in uncond_input.items()} + uncond_embeddings = self.text_encoder(**uncond_input) + uncond_embeddings = uncond_embeddings[0] + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = uncond_embeddings.shape[1] + uncond_embeddings = uncond_embeddings.repeat( + 1, num_images_per_prompt, 1) + uncond_embeddings = uncond_embeddings.view( + batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + return text_embeddings diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/stable_diffusion_pipeline.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/stable_diffusion_pipeline.py new file mode 100644 index 00000000..0e89200f --- /dev/null +++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/stable_diffusion_pipeline.py @@ -0,0 +1,44 @@ +# Copyright © Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +import torch +from diffusers import StableDiffusionPipeline + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.multi_modal.diffusers_wrapped.diffusers_pipeline import \ + DiffusersPipeline +from modelscope.utils.constant import Tasks + + +# Wrap around the diffusers stable diffusion pipeline implementation +# for a unified ModelScope pipeline experience. Native stable diffusion +# pipelines will be implemented in later releases. 
+@PIPELINES.register_module( + Tasks.text_to_image_synthesis, + module_name=Pipelines.diffusers_stable_diffusion) +class StableDiffusionWrapperPipeline(DiffusersPipeline): + + def __init__(self, model: str, device: str = 'gpu', **kwargs): + """ + use `model` to create a stable diffusion pipeline + Args: + model: model id on modelscope hub. + device: str = 'gpu' + """ + super().__init__(model, device, **kwargs) + + torch_dtype = kwargs.get('torch_dtype', torch.float16) + + # build upon the diffuser stable diffusion pipeline + self.pipeline = StableDiffusionPipeline.from_pretrained( + model, torch_dtype=torch_dtype) + self.pipeline.to(self.device) + + def forward(self, prompt, **kwargs): + return self.pipeline(prompt, **kwargs) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return {OutputKeys.OUTPUT_IMG: inputs.images} diff --git a/modelscope/pipelines/multi_modal/document_vl_embedding_pipeline.py b/modelscope/pipelines/multi_modal/document_vl_embedding_pipeline.py new file mode 100644 index 00000000..754d2d2b --- /dev/null +++ b/modelscope/pipelines/multi_modal/document_vl_embedding_pipeline.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal.vldoc.model import VLDocForDocVLEmbedding +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.multi_modal import (Preprocessor, + VLDocPreprocessor) +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.document_vl_embedding, module_name=Pipelines.document_vl_embedding) +class DocumentVLEmbeddingPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """ The pipeline for multi-modal document embedding generation. + + Args: + model: model id on modelscope hub. + preprocessor: type `Preprocessor`. If None, `VLDocPreprocessor` is used. 
+ + Example: + ```python + >>> from modelscope.models import Model + >>> from modelscope.pipelines import pipeline + >>> model = Model.from_pretrained( + 'damo/multi-modal_convnext-roberta-base_vldoc-embedding') + >>> doc_VL_emb_pipeline = pipeline(task='document-vl-embedding', model=model) + >>> inp = { + 'images': ['data/demo.png'], + 'ocr_info_paths': ['data/demo.json'] + } + >>> result = doc_VL_emb_pipeline(inp) + ``` + """ + + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + if preprocessor is None: + if isinstance(self.model, VLDocForDocVLEmbedding): + self.preprocessor = VLDocPreprocessor(self.model.model_dir) + else: + raise NotImplementedError + + def forward(self, encodings: Dict[str, Any]) -> Dict[str, Any]: + for k, v in encodings.items(): + encodings[k] = encodings[k].to(self.device) + return self.model(**encodings) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py index e1d5c769..0a16c58f 100644 --- a/modelscope/pipelines/multi_modal/image_captioning_pipeline.py +++ b/modelscope/pipelines/multi_modal/image_captioning_pipeline.py @@ -5,9 +5,9 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks -from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, Preprocessor) from modelscope.utils.constant import Tasks @@ -39,17 +39,7 @@ class ImageCaptioningPipeline(Pipeline): def _batch(self, data): if isinstance(self.model, OfaForAllTasks): - # collate batch data due to the nested data structure - if isinstance(data, list): - batch_data = {} - batch_data['nsentences'] = len(data) - batch_data['samples'] = [d['samples'][0] for d in data] - batch_data['net_input'] = {} - for k in data[0]['net_input'].keys(): - batch_data['net_input'][k] = torch.cat( - [d['net_input'][k] for d in data]) - - return batch_data + return batch_process(self.model, data) elif isinstance(self.model, MPlugForAllTasks): from transformers.tokenization_utils_base import BatchEncoding batch_data = dict(train=data[0]['train']) @@ -60,7 +50,7 @@ class ImageCaptioningPipeline(Pipeline): batch_data['question'] = BatchEncoding(question) return batch_data else: - return super()._collate_batch(data) + return super(ImageCaptioningPipeline, self)._batch(data) def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/pipelines/multi_modal/mgeo_ranking_pipeline.py b/modelscope/pipelines/multi_modal/mgeo_ranking_pipeline.py new file mode 100644 index 00000000..da6c0d2f --- /dev/null +++ b/modelscope/pipelines/multi_modal/mgeo_ranking_pipeline.py @@ -0,0 +1,183 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
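Many hunks in this diff (image captioning, ASR, OCR recognition, sudoku, text2sql, visual entailment, visual grounding, visual question answering) replace per-pipeline batching code with a shared helper. The condensed sketch below restates that recurring override; the class name is illustrative, not a new pipeline.

```python
from modelscope.models.multi_modal import OfaForAllTasks
from modelscope.pipelines.base import Pipeline
from modelscope.pipelines.util import batch_process


class SomeOfaBackedPipeline(Pipeline):  # illustrative name only

    def _batch(self, data):
        # OFA preprocessors produce nested samples/net_input dicts, so batching is
        # delegated to the shared batch_process helper introduced in this diff;
        # other model types fall back to the default Pipeline._batch implementation.
        if isinstance(self.model, OfaForAllTasks):
            return batch_process(self.model, data)
        return super()._batch(data)
```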
+ +from typing import Any, Dict, Optional, Union + +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Tasks + +__all__ = ['MGeoRankingPipeline'] + + +@PIPELINES.register_module( + Tasks.text_ranking, module_name=Pipelines.mgeo_ranking) +class MGeoRankingPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + sequence_length=128, + **kwargs): + """Use `model` and `preprocessor` to create a nlp word segment pipeline + for prediction. + + Args: + model (str or Model): Supply either a local model dir which + supported the WS task, or a model id from the model hub, or a torch + model instance. preprocessor (Preprocessor): An optional + preprocessor instance, please make sure the preprocessor fits for + the model if supplied. kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + + if preprocessor is None: + self.preprocessor = Preprocessor.from_pretrained( + self.model.model_dir, + sequence_length=sequence_length, + **kwargs) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + return self.model(**inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """process the prediction results + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, Any]: the predicted text representation + """ + + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = inputs[OutputKeys.LOGITS].squeeze(-1).detach().cpu().numpy() + pred_list = sigmoid(logits).tolist() + return {OutputKeys.SCORES: pred_list} + + def get_gis(self, gis, inps): + gis_input_ids, gis_token_type_ids, gis_rel_type_ids = ([], [], []) + gis_absolute_position_ids, gis_relative_position_ids = ([], []) + gis_prov_ids, gis_city_ids, gis_dist_ids = ([], [], []) + china_version = False + if len(inps[0]) == 6: + for geom_id, geom_type, rel_type, absolute_position, relative_position, lxly in inps: + gis_input_ids.append(geom_id) + gis_token_type_ids.append(geom_type) + gis_rel_type_ids.append(rel_type) + gis_absolute_position_ids.append(absolute_position) + gis_relative_position_ids.append(relative_position) + elif len(inps[0]) == 9: + china_version = True + for geom_id, geom_type, rel_type, absolute_position, relative_position, \ + prov_id, city_id, dist_id, lxly in inps: + gis_input_ids.append(geom_id) + gis_token_type_ids.append(geom_type) + gis_rel_type_ids.append(rel_type) + gis_absolute_position_ids.append(absolute_position) + gis_relative_position_ids.append(relative_position) + gis_prov_ids.append(prov_id) + gis_city_ids.append(city_id) + gis_dist_ids.append(dist_id) + + gis.update(gis_input_ids, gis_token_type_ids, gis_rel_type_ids, + gis_absolute_position_ids, gis_relative_position_ids, + gis_prov_ids, gis_city_ids, gis_dist_ids, china_version) + for att in vars(gis).keys(): + if isinstance(getattr(gis, att), torch.Tensor): + setattr(gis, att, getattr(gis, att).to(self.device)) + return gis + + def _collate_fn(self, batch): + 
merged_batch = {} + gis_list = [] + gis_tp = [] + for k in batch: + if 'sentence1_gis' == k: + gis = batch['gis1'] + gis = self.get_gis(gis, batch['sentence1_gis']) + if gis.prov_ids is not None: + gis_list.append({ + 'input_ids': gis.input_ids, + 'attention_mask': gis.attention_mask, + 'token_type_ids': gis.token_type_ids, + 'rel_type_ids': gis.rel_type_ids, + 'absolute_position_ids': gis.absolute_position_ids, + 'relative_position_ids': gis.relative_position_ids, + 'prov_ids': gis.prov_ids, + 'city_ids': gis.city_ids, + 'dist_ids': gis.dist_ids + }) + else: + gis_list.append({ + 'input_ids': + gis.input_ids, + 'attention_mask': + gis.attention_mask, + 'token_type_ids': + gis.token_type_ids, + 'rel_type_ids': + gis.rel_type_ids, + 'absolute_position_ids': + gis.absolute_position_ids, + 'relative_position_ids': + gis.relative_position_ids + }) + gis_tp.append(torch.LongTensor([1]).to(self.device)) + elif 'sentence2_gis' == k: + gis = batch['gis2'] + gis = self.get_gis(gis, batch['sentence2_gis']) + if gis.prov_ids is not None: + gis_list.append({ + 'input_ids': gis.input_ids, + 'attention_mask': gis.attention_mask, + 'token_type_ids': gis.token_type_ids, + 'rel_type_ids': gis.rel_type_ids, + 'absolute_position_ids': gis.absolute_position_ids, + 'relative_position_ids': gis.relative_position_ids, + 'prov_ids': gis.prov_ids, + 'city_ids': gis.city_ids, + 'dist_ids': gis.dist_ids + }) + else: + gis_list.append({ + 'input_ids': + gis.input_ids, + 'attention_mask': + gis.attention_mask, + 'token_type_ids': + gis.token_type_ids, + 'rel_type_ids': + gis.rel_type_ids, + 'absolute_position_ids': + gis.absolute_position_ids, + 'relative_position_ids': + gis.relative_position_ids + }) + gis_tp.append(torch.LongTensor([0]).to(self.device)) + elif 'qid' in k or 'labels' in k: + merged_batch[k] = batch[k].to(self.device) + elif not k.startswith('gis'): + merged_batch[k] = batch[k].to(self.device) + if len(gis_list) > 0: + merged_batch['gis_list'] = gis_list + if len(gis_tp) > 0: + merged_batch['gis_tp'] = gis_tp + return merged_batch diff --git a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py index 3c4a3c3c..337742a7 100644 --- a/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py +++ b/modelscope/pipelines/multi_modal/ocr_recognition_pipeline.py @@ -5,9 +5,9 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import OfaForAllTasks -from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import OfaPreprocessor, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -34,6 +34,12 @@ class OcrRecognitionPipeline(Pipeline): if isinstance(self.model, OfaForAllTasks): self.preprocessor = OfaPreprocessor(self.model.model_dir) + def _batch(self, data): + if isinstance(self.model, OfaForAllTasks): + return batch_process(self.model, data) + else: + return super(OcrRecognitionPipeline, self)._batch(data) + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): diff --git a/modelscope/pipelines/multi_modal/sudoku_pipeline.py b/modelscope/pipelines/multi_modal/sudoku_pipeline.py new file mode 100644 index 00000000..2eefcd72 --- /dev/null +++ b/modelscope/pipelines/multi_modal/sudoku_pipeline.py @@ -0,0 +1,53 @@ +# Copyright 
(c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.pipelines.base import Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process +from modelscope.preprocessors import OfaPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module(Tasks.sudoku, module_name=Pipelines.ofa_sudoku) +class SudokuPipeline(Pipeline): + R""" + pipeline for sudoku solving + """ + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """ + use `model` and `preprocessor` to create a pipeline for solving sudoku + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + if preprocessor is None: + if isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(self.model.model_dir) + else: + raise 'no preprocessor is provided' + + def _batch(self, data): + if isinstance(self.model, OfaForAllTasks): + return batch_process(self.model, data) + else: + return super(SudokuPipeline, self)._batch(data) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/text2sql_pipeline.py b/modelscope/pipelines/multi_modal/text2sql_pipeline.py new file mode 100644 index 00000000..b586fab7 --- /dev/null +++ b/modelscope/pipelines/multi_modal/text2sql_pipeline.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal import OfaForAllTasks +from modelscope.pipelines.base import Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process +from modelscope.preprocessors import OfaPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module(Tasks.text2sql, module_name=Pipelines.ofa_text2sql) +class TextToSqlPipeline(Pipeline): + R""" + pipeline for text to sql task + """ + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """ + use `model` and `preprocessor` to create a pipeline for text2sql task + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + if preprocessor is None: + if isinstance(self.model, OfaForAllTasks): + self.preprocessor = OfaPreprocessor(self.model.model_dir) + + def _batch(self, data): + if isinstance(self.model, OfaForAllTasks): + return batch_process(self.model, data) + else: + return super(TextToSqlPipeline, self)._batch(data) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/video_captioning_pipeline.py b/modelscope/pipelines/multi_modal/video_captioning_pipeline.py new file mode 100644 index 00000000..e13e1ae5 --- /dev/null +++ b/modelscope/pipelines/multi_modal/video_captioning_pipeline.py @@ -0,0 +1,56 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.multi_modal import HiTeAForAllTasks +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_captioning, module_name=Pipelines.video_captioning) +class VideoCaptioningPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """ + use `model` and `preprocessor` to create a video captioning pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + self.model.eval() + if preprocessor is None: + if isinstance(self.model, HiTeAForAllTasks): + self.preprocessor = HiTeAPreprocessor(self.model.model_dir) + + def _batch(self, data): + if isinstance(self.model, HiTeAForAllTasks): + from transformers.tokenization_utils_base import BatchEncoding + batch_data = dict(train=data[0]['train']) + batch_data['video'] = torch.cat([d['video'] for d in data]) + question = {} + for k in data[0]['question'].keys(): + question[k] = torch.cat([d['question'][k] for d in data]) + batch_data['question'] = BatchEncoding(question) + return batch_data + else: + return super()._collate_batch(data) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/multi_modal/video_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/video_question_answering_pipeline.py new file mode 100644 index 00000000..63a730ac --- /dev/null +++ b/modelscope/pipelines/multi_modal/video_question_answering_pipeline.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.models.multi_modal import HiTeAForAllTasks +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import HiTeAPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks + +__all__ = ['VideoQuestionAnsweringPipeline'] + + +@PIPELINES.register_module( + Tasks.video_question_answering, + module_name=Pipelines.video_question_answering) +class VideoQuestionAnsweringPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Optional[Preprocessor] = None, + **kwargs): + """use `model` and `preprocessor` to create a video question answering pipeline for prediction + + Args: + model (HiTeAForVideoQuestionAnswering): a model instance + preprocessor (HiTeAForVideoQuestionAnsweringPreprocessor): a preprocessor instance + """ + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if preprocessor is None: + if isinstance(self.model, HiTeAForAllTasks): + self.preprocessor = HiTeAPreprocessor(self.model.model_dir) + self.model.eval() + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + + def postprocess(self, inputs: Dict[str, Tensor], + **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + return inputs diff --git a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py index 67661b39..703b8d1b 100644 --- a/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_entailment_pipeline.py @@ -1,10 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Optional, Union +import torch + from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import OfaForAllTasks from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import OfaPreprocessor, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -30,5 +33,16 @@ class VisualEntailmentPipeline(Pipeline): if preprocessor is None and isinstance(self.model, OfaForAllTasks): self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) + def _batch(self, data): + if isinstance(self.model, OfaForAllTasks): + return batch_process(self.model, data) + else: + return super(VisualEntailmentPipeline, self)._batch(data) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py index f8a79d55..36b6754b 100644 --- a/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_grounding_pipeline.py @@ -1,10 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
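# A minimal sketch of the per-sample structure that batch_process (added to
# modelscope/pipelines/util.py later in this patch) merges for OFA models; the
# 'input_ids' key and the tensor shapes are illustrative assumptions.
import torch

samples = [
    {'samples': [{'id': i}],
     'net_input': {'input_ids': torch.zeros(1, 16, dtype=torch.long)},
     'w_resize_ratios': torch.tensor([1.0]),
     'h_resize_ratios': torch.tensor([1.0])}
    for i in range(3)
]
batch = {
    'nsentences': len(samples),                                    # 3
    'samples': [s['samples'][0] for s in samples],                 # one raw sample each
    'net_input': {k: torch.cat([s['net_input'][k] for s in samples])
                  for k in samples[0]['net_input'].keys()},        # input_ids -> [3, 16]
    'w_resize_ratios': torch.cat([s['w_resize_ratios'] for s in samples]),
    'h_resize_ratios': torch.cat([s['h_resize_ratios'] for s in samples]),
}
# The _batch overrides added in this patch (visual grounding/entailment, VQA,
# summarization, text classification, sudoku, text2sql) all delegate to
# batch_process whenever the underlying model is OfaForAllTasks.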
from typing import Any, Dict, Optional, Union +import torch + from modelscope.metainfo import Pipelines from modelscope.models.multi_modal import OfaForAllTasks from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import OfaPreprocessor, Preprocessor from modelscope.utils.constant import Tasks from modelscope.utils.logger import get_logger @@ -30,5 +33,16 @@ class VisualGroundingPipeline(Pipeline): if preprocessor is None and isinstance(self.model, OfaForAllTasks): self.preprocessor = OfaPreprocessor(model_dir=self.model.model_dir) + def _batch(self, data): + if isinstance(self.model, OfaForAllTasks): + return batch_process(self.model, data) + else: + return super(VisualGroundingPipeline, self)._batch(data) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py index a30cf1c5..09b6e1ba 100644 --- a/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py +++ b/modelscope/pipelines/multi_modal/visual_question_answering_pipeline.py @@ -6,9 +6,9 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models import Model from modelscope.models.multi_modal import MPlugForAllTasks, OfaForAllTasks -from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import (MPlugPreprocessor, OfaPreprocessor, Preprocessor) from modelscope.utils.constant import Tasks @@ -39,6 +39,12 @@ class VisualQuestionAnsweringPipeline(Pipeline): self.preprocessor = MPlugPreprocessor(self.model.model_dir) self.model.eval() + def _batch(self, data): + if isinstance(self.model, OfaForAllTasks): + return batch_process(self.model, data) + else: + return super(VisualQuestionAnsweringPipeline, self)._batch(data) + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 707e2ac0..8fcf3e3f 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -33,6 +33,7 @@ if TYPE_CHECKING: from .codegeex_code_translation_pipeline import CodeGeeXCodeTranslationPipeline from .codegeex_code_generation_pipeline import CodeGeeXCodeGenerationPipeline from .translation_evaluation_pipeline import TranslationEvaluationPipeline + from .user_satisfaction_estimation_pipeline import UserSatisfactionEstimationPipeline else: _import_structure = { @@ -79,6 +80,8 @@ else: 'codegeex_code_generation_pipeline': ['CodeGeeXCodeGenerationPipeline'], 'translation_evaluation_pipeline': ['TranslationEvaluationPipeline'], + 'user_satisfaction_estimation_pipeline': + ['UserSatisfactionEstimationPipeline'] } import sys diff --git a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py index 9c5600fd..5901ab36 100644 --- a/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py +++ b/modelscope/pipelines/nlp/named_entity_recognition_pipeline.py @@ 
-29,7 +29,7 @@ class NamedEntityRecognitionPipeline(TokenClassificationPipeline): config_file: str = None, device: str = 'gpu', auto_collate=True, - sequence_length=128, + sequence_length=512, **kwargs): """Use `model` and `preprocessor` to create a nlp NER pipeline for prediction diff --git a/modelscope/pipelines/nlp/summarization_pipeline.py b/modelscope/pipelines/nlp/summarization_pipeline.py index 7c8355f9..5c5e5305 100644 --- a/modelscope/pipelines/nlp/summarization_pipeline.py +++ b/modelscope/pipelines/nlp/summarization_pipeline.py @@ -1,9 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import Any, Dict, Optional, Union +import torch + from modelscope.metainfo import Pipelines, Preprocessors from modelscope.pipelines.base import Model, Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Fields, Tasks from modelscope.utils.logger import get_logger @@ -48,5 +51,16 @@ class SummarizationPipeline(Pipeline): self.preprocessor = Preprocessor.from_pretrained( self.model.model_dir, **kwargs) + def _batch(self, data): + if self.model.__class__.__name__ == 'OfaForAllTasks': + return batch_process(self.model, data) + else: + return super(SummarizationPipeline, self)._batch(data) + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return super().forward(inputs, **forward_params) + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/nlp/text_classification_pipeline.py b/modelscope/pipelines/nlp/text_classification_pipeline.py index 06f95d7e..9a3b6901 100644 --- a/modelscope/pipelines/nlp/text_classification_pipeline.py +++ b/modelscope/pipelines/nlp/text_classification_pipeline.py @@ -2,12 +2,14 @@ from typing import Any, Dict, Union import numpy as np +import torch from modelscope.metainfo import Pipelines, Preprocessors from modelscope.models.base import Model from modelscope.outputs import OutputKeys, TextClassificationModelOutput from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.util import batch_process from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Fields, Tasks from modelscope.utils.logger import get_logger @@ -83,10 +85,17 @@ class TextClassificationPipeline(Pipeline): if hasattr(self.preprocessor, 'id2label'): self.id2label = self.preprocessor.id2label + def _batch(self, data): + if self.model.__class__.__name__ == 'OfaForAllTasks': + return batch_process(self.model, data) + else: + return super(TextClassificationPipeline, self)._batch(data) + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: if self.model.__class__.__name__ == 'OfaForAllTasks': - return super().forward(inputs, **forward_params) + with torch.no_grad(): + return super().forward(inputs, **forward_params) return self.model(**inputs, **forward_params) def postprocess(self, diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index 1e6d525a..4abaaca1 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -78,9 +78,14 @@ class TextErrorCorrectionPipeline(Pipeline): """ - pred_str = self.vocab.string( - inputs['predictions'], - '@@', - 
extra_symbols_to_ignore={self.vocab.pad()}) + sc_sent = [] + for sent in inputs['predictions']: + pred_str = self.vocab.string( + sent, '@@', extra_symbols_to_ignore={self.vocab.pad()}) + sc_sent.append(''.join(pred_str.split())) - return {OutputKeys.OUTPUT: ''.join(pred_str.split())} + # for consistent with old version + if len(sc_sent) == 1: + sc_sent = sc_sent[0] + + return {OutputKeys.OUTPUT: sc_sent} diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 63f241a2..9926ee78 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -36,7 +36,7 @@ class TokenClassificationPipeline(Pipeline): config_file: str = None, device: str = 'gpu', auto_collate=True, - sequence_length=128, + sequence_length=512, **kwargs): """use `model` and `preprocessor` to create a token classification pipeline for prediction @@ -116,9 +116,10 @@ class TokenClassificationPipeline(Pipeline): offset_mapping = torch.narrow( offset_mapping, 0, 0, masked_lengths) # index_select only move loc, not resize - predictions = torch.narrow( - predictions, 0, 0, - masked_lengths) # index_select only move loc, not resize + + if len(label_mask.shape) == 2: + label_mask = label_mask[0] + predictions = predictions.masked_select(label_mask) offset_mapping = torch_nested_numpify( torch_nested_detach(offset_mapping)) diff --git a/modelscope/pipelines/nlp/user_satisfaction_estimation_pipeline.py b/modelscope/pipelines/nlp/user_satisfaction_estimation_pipeline.py new file mode 100644 index 00000000..6abfca8b --- /dev/null +++ b/modelscope/pipelines/nlp/user_satisfaction_estimation_pipeline.py @@ -0,0 +1,110 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, Union + +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import DialogueClassificationUsePreprocessor +from modelscope.utils.constant import Tasks + +__all__ = ['UserSatisfactionEstimationPipeline'] + + +@PIPELINES.register_module( + group_key=Tasks.text_classification, + module_name=Pipelines.user_satisfaction_estimation) +class UserSatisfactionEstimationPipeline(Pipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: DialogueClassificationUsePreprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True): + """The inference pipeline for the user satisfaction estimation task. + + Args: + model (str or Model): Supply either a local model dir which supported user satisfaction estimation task, or + a model id from the model hub, or a torch model instance. + preprocessor (DialogueClassificationUsePreprocessor): An optional preprocessor instance. + device (str): device str, should be either cpu, cuda, gpu, gpu:X or cuda:X + auto_collate (bool): automatically to convert data to tensor or not. 
+ + Example: + >>> from modelscope.pipelines import pipeline + >>> pipeline_ins = pipeline('text-classification', + model='damo/nlp_user-satisfaction-estimation_chinese') + >>> input = [('返修退换货咨询|||', '手机有质量问题怎么办|||稍等,我看下', '开不开机了|||', + '说话|||谢谢哈')] + >>> print(pipeline_ins(input)) + """ + super().__init__( + model=model, + preprocessor=preprocessor, + config_file=config_file, + device=device, + auto_collate=auto_collate) + + if hasattr(self.preprocessor, 'id2label'): + self.id2label = self.preprocessor.id2label + + self.model.eval() + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + return self.model(**inputs, **forward_params) + + def postprocess(self, + inputs: Dict[str, Any], + topk: int = None) -> Dict[str, Any]: + """Process the prediction results + + Args: + inputs (`Dict[str, Any]` or `DialogueUseClassificationModelOutput`): The model output, please check + the `DialogueUseClassificationModelOutput` class for details. + topk (int): The topk probs to take + Returns: + Dict[str, Any]: the prediction results. + scores: The probabilities of each label. + labels: The real labels. + Label at index 0 is the largest probability. + """ + logits = inputs[OutputKeys.LOGITS].cpu().numpy() + if logits.shape[0] == 1: + logits = logits[0] + + def softmax(logits): + exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True)) + return exp / exp.sum(axis=-1, keepdims=True) + + probs = softmax(logits) + num_classes = probs.shape[-1] + topk = min(topk, num_classes) if topk is not None else num_classes + top_indices = np.argpartition(probs, -topk)[-topk:] + probs = np.take_along_axis(probs, top_indices, axis=-1).tolist() + + def map_to_label(_id): + if getattr(self, 'id2label', None) is not None: + if _id in self.id2label: + return self.id2label[_id] + elif str(_id) in self.id2label: + return self.id2label[str(_id)] + else: + raise Exception( + f'id {_id} not found in id2label: {self.id2label}') + else: + return _id + + v_func = np.vectorize(map_to_label) + top_indices = v_func(top_indices).tolist() + probs = list(reversed(probs)) + top_indices = list(reversed(top_indices)) + + return {OutputKeys.SCORES: probs, OutputKeys.LABELS: top_indices} diff --git a/modelscope/pipelines/util.py b/modelscope/pipelines/util.py index 99a11317..fbbf4084 100644 --- a/modelscope/pipelines/util.py +++ b/modelscope/pipelines/util.py @@ -2,6 +2,8 @@ import os.path as osp from typing import List, Optional, Union +import torch + from modelscope.hub.api import HubApi from modelscope.hub.file_download import model_file_download from modelscope.utils.config import Config @@ -81,3 +83,25 @@ def is_model(path: Union[str, List]): ) return all_true + + +def batch_process(model, data): + if model.__class__.__name__ == 'OfaForAllTasks': + # collate batch data due to the nested data structure + assert isinstance(data, list) + batch_data = { + 'nsentences': len(data), + 'samples': [d['samples'][0] for d in data], + 'net_input': {} + } + for k in data[0]['net_input'].keys(): + batch_data['net_input'][k] = torch.cat( + [d['net_input'][k] for d in data]) + if 'w_resize_ratios' in data[0]: + batch_data['w_resize_ratios'] = torch.cat( + [d['w_resize_ratios'] for d in data]) + if 'h_resize_ratios' in data[0]: + batch_data['h_resize_ratios'] = torch.cat( + [d['h_resize_ratios'] for d in data]) + + return batch_data diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index c538a580..df081036 100644 --- a/modelscope/preprocessors/__init__.py +++ 
b/modelscope/preprocessors/__init__.py @@ -8,14 +8,16 @@ if TYPE_CHECKING: from .builder import PREPROCESSORS, build_preprocessor from .common import Compose, ToTensor, Filter from .asr import WavToScp - from .audio import LinearAECAndFbank + from .audio import LinearAECAndFbank, AudioBrainPreprocessor from .image import (LoadImage, load_image, ImageColorEnhanceFinetunePreprocessor, ImageInstanceSegmentationPreprocessor, - ImageDenoisePreprocessor) + ImageDenoisePreprocessor, ImageDeblurPreprocessor) + from .cv import (ImageClassificationMmcvPreprocessor) from .kws import WavToLists from .tts import KanttsDataPreprocessor - from .multi_modal import (OfaPreprocessor, MPlugPreprocessor) + from .multi_modal import (OfaPreprocessor, MPlugPreprocessor, + HiTeAPreprocessor) from .nlp import ( DocumentSegmentationTransformersPreprocessor, FaqQuestionAnsweringTransformersPreprocessor, @@ -24,6 +26,7 @@ if TYPE_CHECKING: RelationExtractionTransformersPreprocessor, SentenceEmbeddingTransformersPreprocessor, TextClassificationTransformersPreprocessor, + TextGenerationSentencePiecePreprocessor, TokenClassificationTransformersPreprocessor, TextErrorCorrectionPreprocessor, TextGenerationT5Preprocessor, TextGenerationTransformersPreprocessor, Tokenize, @@ -35,7 +38,8 @@ if TYPE_CHECKING: DialogStateTrackingPreprocessor, ConversationalTextToSqlPreprocessor, TableQuestionAnsweringPreprocessor, NERPreprocessorViet, NERPreprocessorThai, WordSegmentationPreprocessorThai, - TranslationEvaluationPreprocessor) + TranslationEvaluationPreprocessor, + DialogueClassificationUsePreprocessor) from .video import ReadVideoData, MovieSceneSegmentationPreprocessor else: @@ -43,16 +47,19 @@ else: 'base': ['Preprocessor'], 'builder': ['PREPROCESSORS', 'build_preprocessor'], 'common': ['Compose', 'ToTensor', 'Filter'], - 'audio': ['LinearAECAndFbank'], + 'audio': ['LinearAECAndFbank', 'AudioBrainPreprocessor'], 'asr': ['WavToScp'], 'video': ['ReadVideoData', 'MovieSceneSegmentationPreprocessor'], 'image': [ 'LoadImage', 'load_image', 'ImageColorEnhanceFinetunePreprocessor', - 'ImageInstanceSegmentationPreprocessor', 'ImageDenoisePreprocessor' + 'ImageInstanceSegmentationPreprocessor', + 'ImageDenoisePreprocessor', 'ImageDeblurPreprocessor' ], + 'cv': ['ImageClassificationMmcvPreprocessor'], 'kws': ['WavToLists'], 'tts': ['KanttsDataPreprocessor'], - 'multi_modal': ['OfaPreprocessor', 'MPlugPreprocessor'], + 'multi_modal': + ['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor'], 'nlp': [ 'DocumentSegmentationTransformersPreprocessor', 'FaqQuestionAnsweringTransformersPreprocessor', @@ -61,6 +68,7 @@ else: 'TextRankingTransformersPreprocessor', 'RelationExtractionTransformersPreprocessor', 'SentenceEmbeddingTransformersPreprocessor', + 'TextGenerationSentencePiecePreprocessor', 'TextClassificationTransformersPreprocessor', 'TokenClassificationTransformersPreprocessor', 'TextErrorCorrectionPreprocessor', @@ -76,7 +84,8 @@ else: 'DialogStateTrackingPreprocessor', 'ConversationalTextToSqlPreprocessor', 'TableQuestionAnsweringPreprocessor', - 'TranslationEvaluationPreprocessor' + 'TranslationEvaluationPreprocessor', + 'DialogueClassificationUsePreprocessor' ], } diff --git a/modelscope/preprocessors/asr.py b/modelscope/preprocessors/asr.py index a06c9134..3de744a3 100644 --- a/modelscope/preprocessors/asr.py +++ b/modelscope/preprocessors/asr.py @@ -38,9 +38,9 @@ class WavToScp(Preprocessor): audio_in, audio_fs) return out - def forward(self, model: Dict[str, - Any], recog_type: str, audio_format: str, - audio_in: 
Union[str, bytes], audio_fs: int) -> Dict[str, Any]: + def forward(self, model: Dict[str, Any], recog_type: str, + audio_format: str, audio_in: Union[str, bytes], audio_fs: int, + cmd: Dict[str, Any]) -> Dict[str, Any]: assert len(recog_type) > 0, 'preprocess recog_type is empty' assert len(audio_format) > 0, 'preprocess audio_format is empty' assert len( @@ -56,39 +56,29 @@ class WavToScp(Preprocessor): assert len(model['model_config'] ) > 0, 'preprocess model[model_config] is empty' - rst = { - # the recognition model dir path - 'model_workspace': model['model_workspace'], - # the am model name - 'am_model': model['am_model'], - # the am model file path - 'am_model_path': model['am_model_path'], - # the asr type setting, eg: test dev train wav - 'recog_type': recog_type, - # the asr audio format setting, eg: wav, pcm, kaldi_ark, tfrecord - 'audio_format': audio_format, - # the recognition model config dict - 'model_config': model['model_config'], - # the sample rate of audio_in - 'audio_fs': audio_fs - } + cmd['model_workspace'] = model['model_workspace'] + cmd['am_model'] = model['am_model'] + cmd['am_model_path'] = model['am_model_path'] + cmd['recog_type'] = recog_type + cmd['audio_format'] = audio_format + cmd['model_config'] = model['model_config'] + cmd['audio_fs'] = audio_fs if isinstance(audio_in, str): # wav file path or the dataset path - rst['wav_path'] = audio_in + cmd['wav_path'] = audio_in - out = self.config_checking(rst) - out = self.env_setting(out) + cmd = self.env_setting(cmd) if audio_format == 'wav': - out['audio_lists'] = self.scp_generation_from_wav(out) + cmd['audio_lists'] = self.scp_generation_from_wav(cmd) elif audio_format == 'kaldi_ark': - out['audio_lists'] = self.scp_generation_from_ark(out) + cmd['audio_lists'] = self.scp_generation_from_ark(cmd) elif audio_format == 'tfrecord': - out['audio_lists'] = os.path.join(out['wav_path'], 'data.records') - elif audio_format == 'pcm': - out['audio_lists'] = audio_in + cmd['audio_lists'] = os.path.join(out['wav_path'], 'data.records') + elif audio_format == 'pcm' or audio_format == 'scp': + cmd['audio_lists'] = audio_in - return out + return cmd def config_checking(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """config checking @@ -113,34 +103,35 @@ class WavToScp(Preprocessor): if inputs['model_type'] == Frameworks.torch: assert inputs['model_config'].__contains__( 'batch_size'), 'batch_size does not exist' - assert inputs['model_config'].__contains__( - 'am_model_config'), 'am_model_config does not exist' - assert inputs['model_config'].__contains__( - 'asr_model_config'), 'asr_model_config does not exist' - am_model_config: str = os.path.join( - inputs['model_workspace'], - inputs['model_config']['am_model_config']) - assert os.path.exists( - am_model_config), 'am_model_config does not exist' - inputs['am_model_config'] = am_model_config - - asr_model_config: str = os.path.join( - inputs['model_workspace'], - inputs['model_config']['asr_model_config']) - assert os.path.exists( - asr_model_config), 'asr_model_config does not exist' + if inputs['model_config'].__contains__('am_model_config'): + am_model_config = os.path.join( + inputs['model_workspace'], + inputs['model_config']['am_model_config']) + assert os.path.exists( + am_model_config), 'am_model_config does not exist' + inputs['am_model_config'] = am_model_config + else: + inputs['am_model_config'] = '' + if inputs['model_config'].__contains__('asr_model_config'): + asr_model_config = os.path.join( + inputs['model_workspace'], + 
inputs['model_config']['asr_model_config']) + assert os.path.exists( + asr_model_config), 'asr_model_config does not exist' + inputs['asr_model_config'] = asr_model_config + else: + asr_model_config = '' + inputs['asr_model_config'] = '' if 'asr_model_wav_config' in inputs['model_config']: asr_model_wav_config: str = os.path.join( inputs['model_workspace'], inputs['model_config']['asr_model_wav_config']) + assert os.path.exists(asr_model_wav_config + ), 'asr_model_wav_config does not exist' else: - asr_model_wav_config: str = os.path.join( - inputs['model_workspace'], - inputs['model_config']['asr_model_config']) - assert os.path.exists( - asr_model_wav_config), 'asr_model_wav_config does not exist' + asr_model_wav_config: str = inputs['asr_model_config'] # the lm model file path if 'lm_model_name' in inputs['model_config']: @@ -163,17 +154,53 @@ class WavToScp(Preprocessor): else: inputs['lm_model_path'] = None inputs['lm_model_config'] = None - if inputs['audio_format'] == 'wav' or inputs[ - 'audio_format'] == 'pcm': - inputs['asr_model_config'] = asr_model_wav_config - else: - inputs['asr_model_config'] = asr_model_config + if 'audio_format' in inputs: + if inputs['audio_format'] == 'wav' or inputs[ + 'audio_format'] == 'pcm': + inputs['asr_model_config'] = asr_model_wav_config + else: + inputs['asr_model_config'] = asr_model_config if inputs['model_config'].__contains__('mvn_file'): mvn_file = os.path.join(inputs['model_workspace'], inputs['model_config']['mvn_file']) assert os.path.exists(mvn_file), 'mvn_file does not exist' inputs['mvn_file'] = mvn_file + if inputs['model_config'].__contains__('vad_model_name'): + vad_model_name = os.path.join( + inputs['model_workspace'], + inputs['model_config']['vad_model_name']) + assert os.path.exists( + vad_model_name), 'vad_model_name does not exist' + inputs['vad_model_name'] = vad_model_name + if inputs['model_config'].__contains__('vad_model_config'): + vad_model_config = os.path.join( + inputs['model_workspace'], + inputs['model_config']['vad_model_config']) + assert os.path.exists( + vad_model_config), 'vad_model_config does not exist' + inputs['vad_model_config'] = vad_model_config + if inputs['model_config'].__contains__('vad_mvn_file'): + vad_mvn_file = os.path.join( + inputs['model_workspace'], + inputs['model_config']['vad_mvn_file']) + assert os.path.exists( + vad_mvn_file), 'vad_mvn_file does not exist' + inputs['vad_mvn_file'] = vad_mvn_file + if inputs['model_config'].__contains__('punc_model_name'): + punc_model_name = os.path.join( + inputs['model_workspace'], + inputs['model_config']['punc_model_name']) + assert os.path.exists( + punc_model_name), 'punc_model_name does not exist' + inputs['punc_model_name'] = punc_model_name + if inputs['model_config'].__contains__('punc_model_config'): + punc_model_config = os.path.join( + inputs['model_workspace'], + inputs['model_config']['punc_model_config']) + assert os.path.exists( + punc_model_config), 'punc_model_config does not exist' + inputs['punc_model_config'] = punc_model_config elif inputs['model_type'] == Frameworks.tf: assert inputs['model_config'].__contains__( diff --git a/modelscope/preprocessors/audio.py b/modelscope/preprocessors/audio.py index f02381ad..3332b1e4 100644 --- a/modelscope/preprocessors/audio.py +++ b/modelscope/preprocessors/audio.py @@ -11,7 +11,34 @@ import torch from modelscope.fileio import File from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields +from 
modelscope.utils.constant import Fields, ModeKeys + + +class AudioBrainPreprocessor(Preprocessor): + """A preprocessor takes audio file path and reads it into tensor + + Args: + takes: the audio file field name + provides: the tensor field name + mode: process mode, default 'inference' + """ + + def __init__(self, + takes: str, + provides: str, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + super(AudioBrainPreprocessor, self).__init__(mode, *args, **kwargs) + self.takes = takes + self.provides = provides + import speechbrain as sb + self.read_audio = sb.dataio.dataio.read_audio + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + result = self.read_audio(data[self.takes]) + data[self.provides] = result + return data def load_kaldi_feature_transform(filename): diff --git a/modelscope/preprocessors/base.py b/modelscope/preprocessors/base.py index 61bb0222..d9b2836f 100644 --- a/modelscope/preprocessors/base.py +++ b/modelscope/preprocessors/base.py @@ -133,10 +133,20 @@ PREPROCESSOR_MAP = { Preprocessors.sequence_labeling_tokenizer, (Models.tcrf, Tasks.named_entity_recognition): Preprocessors.sequence_labeling_tokenizer, + + # cv + (Models.tinynas_detection, Tasks.image_object_detection): + Preprocessors.object_detection_tinynas_preprocessor, + (Models.tinynas_damoyolo, Tasks.image_object_detection): + Preprocessors.object_detection_tinynas_preprocessor, + (Models.tinynas_damoyolo, Tasks.domain_specific_object_detection): + Preprocessors.object_detection_tinynas_preprocessor, } class Preprocessor(ABC): + """Base of preprocessors. + """ def __init__(self, mode=ModeKeys.INFERENCE, *args, **kwargs): self._mode = mode @@ -214,7 +224,17 @@ class Preprocessor(ABC): model_dir = snapshot_download( model_name_or_path, revision=revision, - user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) + user_agent={Invoke.KEY: Invoke.PREPROCESSOR}, + ignore_file_pattern=[ + '.*.bin', + '.*.ts', + '.*.pt', + '.*.data-00000-of-00001', + '.*.onnx', + '.*.meta', + '.*.pb', + '.*.index', + ]) else: model_dir = model_name_or_path if cfg_dict is None: diff --git a/modelscope/preprocessors/cv/__init__.py b/modelscope/preprocessors/cv/__init__.py new file mode 100644 index 00000000..21324ed7 --- /dev/null +++ b/modelscope/preprocessors/cv/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .video_super_resolution import (VideoReader) + from .video_stabilization import (stabilization_preprocessor) + from .mmcls_preprocessor import ImageClassificationMmcvPreprocessor + +else: + _import_structure = { + 'video_super_resolution': ['VideoReader'], + 'video_stabilization': ['stabilization_preprocessor'], + 'mmcls_preprocessor': ['ImageClassificationMmcvPreprocessor'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/preprocessors/cv/mmcls_preprocessor.py b/modelscope/preprocessors/cv/mmcls_preprocessor.py new file mode 100644 index 00000000..36e7ac4d --- /dev/null +++ b/modelscope/preprocessors/cv/mmcls_preprocessor.py @@ -0,0 +1,68 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
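# A minimal usage sketch for the AudioBrainPreprocessor added above; the field
# names and the wav path are placeholders, and speechbrain must be installed.
from modelscope.preprocessors import AudioBrainPreprocessor

prep = AudioBrainPreprocessor(takes='wav_path', provides='waveform')
sample = {'wav_path': '/path/to/some.wav'}
sample = prep(sample)             # reads data['wav_path'] via speechbrain's read_audio
print(sample['waveform'].shape)   # a torch tensor of audio samples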
+import os +from typing import Any, Dict + +import numpy as np +from numpy import ndarray + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.hub import read_config +from modelscope.utils.type_assert import type_assert + + +@PREPROCESSORS.register_module( + Fields.cv, + module_name=Preprocessors.image_classification_mmcv_preprocessor) +class ImageClassificationMmcvPreprocessor(Preprocessor): + + def __init__(self, model_dir, **kwargs): + """Preprocess the image. + + What this preprocessor will do: + 1. Remove the `LoadImageFromFile` preprocessor(which will be called in the pipeline). + 2. Compose and instantiate other preprocessors configured in the file. + 3. Call the sub preprocessors one by one. + + This preprocessor supports two types of configuration: + 1. The mmcv config file, configured in a `config.py` + 2. The maas config file, configured in a `configuration.json` + By default, if the `config.py` exists, the preprocessor will use the mmcv config file. + + Args: + model_dir (str): The model dir to build the preprocessor from. + """ + + import mmcv + from mmcls.datasets.pipelines import Compose + from modelscope.models.cv.image_classification.utils import preprocess_transform + super().__init__(**kwargs) + + self.config_type = 'ms_config' + mm_config = os.path.join(model_dir, 'config.py') + if os.path.exists(mm_config): + cfg = mmcv.Config.fromfile(mm_config) + cfg.model.pretrained = None + config_type = 'mmcv_config' + else: + cfg = read_config(model_dir) + cfg.model.mm_model.pretrained = None + config_type = 'ms_config' + + if config_type == 'mmcv_config': + if cfg.data.test.pipeline[0]['type'] == 'LoadImageFromFile': + cfg.data.test.pipeline.pop(0) + self.preprocessors = Compose(cfg.data.test.pipeline) + else: + if cfg.preprocessor.val[0]['type'] == 'LoadImageFromFile': + cfg.preprocessor.val.pop(0) + data_pipeline = preprocess_transform(cfg.preprocessor.val) + self.preprocessors = Compose(data_pipeline) + + @type_assert(object, object) + def __call__(self, data: np.ndarray) -> Dict[str, ndarray]: + data = dict(img=data) + data = self.preprocessors(data) + return data diff --git a/modelscope/preprocessors/cv/timer.py b/modelscope/preprocessors/cv/timer.py new file mode 100644 index 00000000..90d56f9a --- /dev/null +++ b/modelscope/preprocessors/cv/timer.py @@ -0,0 +1,111 @@ +# The implementation is adopted from mmcv, +# made publicly available under the Apache 2.0 License at +# https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/timer.py +from time import time + + +class TimerError(Exception): + + def __init__(self, message): + self.message = message + super(TimerError, self).__init__(message) + + +class Timer: + """A flexible Timer class. 
+ :Example: + >>> import time + >>> import mmcv + >>> with mmcv.Timer(): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + 1.000 + >>> with mmcv.Timer(print_tmpl='it takes {:.1f} seconds'): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + it takes 1.0 seconds + >>> timer = mmcv.Timer() + >>> time.sleep(0.5) + >>> print(timer.since_start()) + 0.500 + >>> time.sleep(0.5) + >>> print(timer.since_last_check()) + 0.500 + >>> print(timer.since_start()) + 1.000 + """ + + def __init__(self, start=True, print_tmpl=None): + self._is_running = False + self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}' + if start: + self.start() + + @property + def is_running(self): + """bool: indicate whether the timer is running""" + return self._is_running + + def __enter__(self): + self.start() + return self + + def __exit__(self, type, value, traceback): + print(self.print_tmpl.format(self.since_last_check())) + self._is_running = False + + def start(self): + """Start the timer.""" + if not self._is_running: + self._t_start = time() + self._is_running = True + self._t_last = time() + + def since_start(self): + """Total time since the timer is started. + Returns (float): Time in seconds. + """ + if not self._is_running: + raise TimerError('timer is not running') + self._t_last = time() + return self._t_last - self._t_start + + def since_last_check(self): + """Time since the last checking. + Either :func:`since_start` or :func:`since_last_check` is a checking + operation. + Returns (float): Time in seconds. + """ + if not self._is_running: + raise TimerError('timer is not running') + dur = time() - self._t_last + self._t_last = time() + return dur + + +_g_timers = {} # global timers + + +def check_time(timer_id): + """Add check points in a single line. + This method is suitable for running a task on a list of items. A timer will + be registered when the method is called for the first time. + :Example: + >>> import time + >>> import mmcv + >>> for i in range(1, 6): + >>> # simulate a code block + >>> time.sleep(i) + >>> mmcv.check_time('task1') + 2.000 + 3.000 + 4.000 + 5.000 + Args: + timer_id (str): Timer identifier. 
+ """ + if timer_id not in _g_timers: + _g_timers[timer_id] = Timer() + return 0 + else: + return _g_timers[timer_id].since_last_check() diff --git a/modelscope/preprocessors/cv/util.py b/modelscope/preprocessors/cv/util.py new file mode 100644 index 00000000..cf7eeb57 --- /dev/null +++ b/modelscope/preprocessors/cv/util.py @@ -0,0 +1,107 @@ +# The implementation is adopted from mmcv, +# made publicly available under the Apache 2.0 License at +# https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils +import os +import os.path as osp +import sys +from collections.abc import Iterable +from shutil import get_terminal_size + +from .timer import Timer + + +def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): + if not osp.isfile(filename): + raise FileNotFoundError(msg_tmpl.format(filename)) + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == '': + return + dir_name = osp.expanduser(dir_name) + os.makedirs(dir_name, mode=mode, exist_ok=True) + + +class ProgressBar: + """A progress bar which can print the progress.""" + + def __init__(self, task_num=0, bar_width=50, start=True, file=sys.stdout): + self.task_num = task_num + self.bar_width = bar_width + self.completed = 0 + self.file = file + if start: + self.start() + + @property + def terminal_width(self): + width, _ = get_terminal_size() + return width + + def start(self): + if self.task_num > 0: + self.file.write(f'[{" " * self.bar_width}] 0/{self.task_num}, ' + 'elapsed: 0s, ETA:') + else: + self.file.write('completed: 0, elapsed: 0s') + self.file.flush() + self.timer = Timer() + + def update(self, num_tasks=1): + assert num_tasks > 0 + self.completed += num_tasks + elapsed = self.timer.since_start() + if elapsed > 0: + fps = self.completed / elapsed + else: + fps = float('inf') + if self.task_num > 0: + percentage = self.completed / float(self.task_num) + eta = int(elapsed * (1 - percentage) / percentage + 0.5) + msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \ + f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \ + f'ETA: {eta:5}s' + + bar_width = min(self.bar_width, + int(self.terminal_width - len(msg)) + 2, + int(self.terminal_width * 0.6)) + bar_width = max(2, bar_width) + mark_width = int(bar_width * percentage) + bar_chars = '>' * mark_width + ' ' * (bar_width - mark_width) + self.file.write(msg.format(bar_chars)) + else: + self.file.write( + f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,' + f' {fps:.1f} tasks/s') + self.file.flush() + + +def track_progress(func, tasks, bar_width=50, file=sys.stdout, **kwargs): + """Track the progress of tasks execution with a progress bar. + Tasks are done with a simple for-loop. + Args: + func (callable): The function to be applied to each task. + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + bar_width (int): Width of progress bar. + Returns: + list: The task results. 
+ """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, Iterable): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be an iterable object or a (iterator, int) tuple') + prog_bar = ProgressBar(task_num, bar_width, file=file) + results = [] + for task in tasks: + results.append(func(task, **kwargs)) + prog_bar.update() + prog_bar.file.write('\n') + return results diff --git a/modelscope/preprocessors/cv/video_stabilization.py b/modelscope/preprocessors/cv/video_stabilization.py new file mode 100644 index 00000000..862cea06 --- /dev/null +++ b/modelscope/preprocessors/cv/video_stabilization.py @@ -0,0 +1,45 @@ +# Part of the implementation is borrowed and modified from DUTCode, +# publicly available at https://github.com/Annbless/DUTCode + +import cv2 +import numpy as np +import torch +import torch.nn as nn + +from modelscope.preprocessors.cv import VideoReader + + +def stabilization_preprocessor(input, cfg): + video_reader = VideoReader(input) + inputs = [] + for frame in video_reader: + inputs.append(np.flip(frame, axis=2)) + fps = video_reader.fps + w = video_reader.width + h = video_reader.height + rgb_images = [] + images = [] + ori_images = [] + for i, frame in enumerate(inputs): + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + image = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + image = image * (1. / 255.) + image = cv2.resize(image, (cfg.MODEL.WIDTH, cfg.MODEL.HEIGHT)) + images.append(image.reshape(1, 1, cfg.MODEL.HEIGHT, cfg.MODEL.WIDTH)) + rgb_image = cv2.resize(frame, (cfg.MODEL.WIDTH, cfg.MODEL.HEIGHT)) + rgb_images.append( + np.expand_dims(np.transpose(rgb_image, (2, 0, 1)), 0)) + ori_images.append(np.expand_dims(np.transpose(frame, (2, 0, 1)), 0)) + x = np.concatenate(images, 1).astype(np.float32) + x = torch.from_numpy(x).unsqueeze(0) + x_rgb = np.concatenate(rgb_images, 0).astype(np.float32) + x_rgb = torch.from_numpy(x_rgb).unsqueeze(0) + + return { + 'ori_images': ori_images, + 'x': x, + 'x_rgb': x_rgb, + 'fps': fps, + 'width': w, + 'height': h + } diff --git a/modelscope/preprocessors/cv/video_super_resolution.py b/modelscope/preprocessors/cv/video_super_resolution.py new file mode 100644 index 00000000..f7fbbc32 --- /dev/null +++ b/modelscope/preprocessors/cv/video_super_resolution.py @@ -0,0 +1,265 @@ +# The implementation is adopted from mmcv, +# made publicly available under the Apache 2.0 License at +# https://github.com/open-mmlab/mmcv/blob/master/mmcv/video/io.py +import os.path as osp +from collections import OrderedDict + +import cv2 +from cv2 import (CAP_PROP_FOURCC, CAP_PROP_FPS, CAP_PROP_FRAME_COUNT, + CAP_PROP_FRAME_HEIGHT, CAP_PROP_FRAME_WIDTH, + CAP_PROP_POS_FRAMES, VideoWriter_fourcc) + +from .util import check_file_exist, mkdir_or_exist, track_progress + + +class Cache: + + def __init__(self, capacity): + self._cache = OrderedDict() + self._capacity = int(capacity) + if capacity <= 0: + raise ValueError('capacity must be a positive integer') + + @property + def capacity(self): + return self._capacity + + @property + def size(self): + return len(self._cache) + + def put(self, key, val): + if key in self._cache: + return + if len(self._cache) >= self.capacity: + self._cache.popitem(last=False) + self._cache[key] = val + + def get(self, key, default=None): + val = self._cache[key] if key in self._cache else default + return val + + +class VideoReader: + """Video class with similar usage to a list 
object. + This video warpper class provides convenient apis to access frames. + There exists an issue of OpenCV's VideoCapture class that jumping to a + certain frame may be inaccurate. It is fixed in this class by checking + the position after jumping each time. + Cache is used when decoding videos. So if the same frame is visited for + the second time, there is no need to decode again if it is stored in the + cache. + :Example: + >>> import mmcv + >>> v = mmcv.VideoReader('sample.mp4') + >>> len(v) # get the total frame number with `len()` + 120 + >>> for img in v: # v is iterable + >>> mmcv.imshow(img) + >>> v[5] # get the 6th frame + """ + + def __init__(self, filename, cache_capacity=10): + # Check whether the video path is a url + if not filename.startswith(('https://', 'http://')): + check_file_exist(filename, 'Video file not found: ' + filename) + self._vcap = cv2.VideoCapture(filename) + assert cache_capacity > 0 + self._cache = Cache(cache_capacity) + self._position = 0 + # get basic info + self._width = int(self._vcap.get(CAP_PROP_FRAME_WIDTH)) + self._height = int(self._vcap.get(CAP_PROP_FRAME_HEIGHT)) + self._fps = self._vcap.get(CAP_PROP_FPS) + self._frame_cnt = int(self._vcap.get(CAP_PROP_FRAME_COUNT)) + self._fourcc = self._vcap.get(CAP_PROP_FOURCC) + + @property + def vcap(self): + """:obj:`cv2.VideoCapture`: The raw VideoCapture object.""" + return self._vcap + + @property + def opened(self): + """bool: Indicate whether the video is opened.""" + return self._vcap.isOpened() + + @property + def width(self): + """int: Width of video frames.""" + return self._width + + @property + def height(self): + """int: Height of video frames.""" + return self._height + + @property + def resolution(self): + """tuple: Video resolution (width, height).""" + return (self._width, self._height) + + @property + def fps(self): + """float: FPS of the video.""" + return self._fps + + @property + def frame_cnt(self): + """int: Total frames of the video.""" + return self._frame_cnt + + @property + def fourcc(self): + """str: "Four character code" of the video.""" + return self._fourcc + + @property + def position(self): + """int: Current cursor position, indicating frame decoded.""" + return self._position + + def _get_real_position(self): + return int(round(self._vcap.get(CAP_PROP_POS_FRAMES))) + + def _set_real_position(self, frame_id): + self._vcap.set(CAP_PROP_POS_FRAMES, frame_id) + pos = self._get_real_position() + for _ in range(frame_id - pos): + self._vcap.read() + self._position = frame_id + + def read(self): + """Read the next frame. + If the next frame have been decoded before and in the cache, then + return it directly, otherwise decode, cache and return it. + Returns: + ndarray or None: Return the frame if successful, otherwise None. + """ + # pos = self._position + if self._cache: + img = self._cache.get(self._position) + if img is not None: + ret = True + else: + if self._position != self._get_real_position(): + self._set_real_position(self._position) + ret, img = self._vcap.read() + if ret: + self._cache.put(self._position, img) + else: + ret, img = self._vcap.read() + if ret: + self._position += 1 + return img + + def get_frame(self, frame_id): + """Get frame by index. + Args: + frame_id (int): Index of the expected frame, 0-based. + Returns: + ndarray or None: Return the frame if successful, otherwise None. 
+ """ + if frame_id < 0 or frame_id >= self._frame_cnt: + raise IndexError( + f'"frame_id" must be between 0 and {self._frame_cnt - 1}') + if frame_id == self._position: + return self.read() + if self._cache: + img = self._cache.get(frame_id) + if img is not None: + self._position = frame_id + 1 + return img + self._set_real_position(frame_id) + ret, img = self._vcap.read() + if ret: + if self._cache: + self._cache.put(self._position, img) + self._position += 1 + return img + + def current_frame(self): + """Get the current frame (frame that is just visited). + Returns: + ndarray or None: If the video is fresh, return None, otherwise + return the frame. + """ + if self._position == 0: + return None + return self._cache.get(self._position - 1) + + def cvt2frames(self, + frame_dir, + file_start=0, + filename_tmpl='{:06d}.jpg', + start=0, + max_num=0, + show_progress=True): + """Convert a video to frame images. + Args: + frame_dir (str): Output directory to store all the frame images. + file_start (int): Filenames will start from the specified number. + filename_tmpl (str): Filename template with the index as the + placeholder. + start (int): The starting frame index. + max_num (int): Maximum number of frames to be written. + show_progress (bool): Whether to show a progress bar. + """ + mkdir_or_exist(frame_dir) + if max_num == 0: + task_num = self.frame_cnt - start + else: + task_num = min(self.frame_cnt - start, max_num) + if task_num <= 0: + raise ValueError('start must be less than total frame number') + if start > 0: + self._set_real_position(start) + + def write_frame(file_idx): + img = self.read() + if img is None: + return + filename = osp.join(frame_dir, filename_tmpl.format(file_idx)) + cv2.imwrite(filename, img) + + if show_progress: + track_progress(write_frame, range(file_start, + file_start + task_num)) + else: + for i in range(task_num): + write_frame(file_start + i) + + def __len__(self): + return self.frame_cnt + + def __getitem__(self, index): + if isinstance(index, slice): + return [ + self.get_frame(i) + for i in range(*index.indices(self.frame_cnt)) + ] + # support negative indexing + if index < 0: + index += self.frame_cnt + if index < 0: + raise IndexError('index out of range') + return self.get_frame(index) + + def __iter__(self): + self._set_real_position(0) + return self + + def __next__(self): + img = self.read() + if img is not None: + return img + else: + raise StopIteration + + next = __next__ + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._vcap.release() diff --git a/modelscope/preprocessors/image.py b/modelscope/preprocessors/image.py index f0401f16..aca3023f 100644 --- a/modelscope/preprocessors/image.py +++ b/modelscope/preprocessors/image.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import io +import os from typing import Any, Dict, Union import cv2 @@ -11,6 +12,7 @@ from PIL import Image, ImageOps from modelscope.fileio import File from modelscope.metainfo import Preprocessors from modelscope.utils.constant import Fields +from modelscope.utils.hub import read_config from modelscope.utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS @@ -105,6 +107,55 @@ def load_image(image_path_or_url: str) -> Image.Image: return loader(image_path_or_url)['img'] +@PREPROCESSORS.register_module( + Fields.cv, module_name=Preprocessors.object_detection_tinynas_preprocessor) +class ObjectDetectionTinynasPreprocessor(Preprocessor): + + def __init__(self, size_divisible=32, **kwargs): + """Preprocess the image. + + What this preprocessor will do: + 1. Transpose the image matrix to make the channel the first dim. + 2. If the size_divisible is gt than 0, it will be used to pad the image. + 3. Expand an extra image dim as dim 0. + + Args: + size_divisible (int): The number will be used as a length unit to pad the image. + Formula: int(math.ceil(shape / size_divisible) * size_divisible) + Default 32. + """ + + super().__init__(**kwargs) + self.size_divisible = size_divisible + + @type_assert(object, object) + def __call__(self, data: np.ndarray) -> Dict[str, ndarray]: + """Preprocess the image. + + Args: + data: The input image with 3 dimensions. + + Returns: + The processed data in dict. + {'img': np.ndarray} + + """ + image = data.astype(np.float32) + image = image.transpose((2, 0, 1)) + shape = image.shape # c, h, w + if self.size_divisible > 0: + import math + stride = self.size_divisible + shape = list(shape) + shape[1] = int(math.ceil(shape[1] / stride) * stride) + shape[2] = int(math.ceil(shape[2] / stride) * stride) + shape = tuple(shape) + pad_img = np.zeros(shape).astype(np.float32) + pad_img[:, :image.shape[1], :image.shape[2]] = image + pad_img = np.expand_dims(pad_img, 0) + return {'img': pad_img} + + @PREPROCESSORS.register_module( Fields.cv, module_name=Preprocessors.image_color_enhance_preprocessor) class ImageColorEnhanceFinetunePreprocessor(Preprocessor): @@ -139,7 +190,7 @@ class ImageColorEnhanceFinetunePreprocessor(Preprocessor): @PREPROCESSORS.register_module( - Fields.cv, module_name=Preprocessors.image_denoie_preprocessor) + Fields.cv, module_name=Preprocessors.image_denoise_preprocessor) class ImageDenoisePreprocessor(Preprocessor): def __init__(self, model_dir: str, *args, **kwargs): @@ -171,6 +222,39 @@ class ImageDenoisePreprocessor(Preprocessor): return data +@PREPROCESSORS.register_module( + Fields.cv, module_name=Preprocessors.image_deblur_preprocessor) +class ImageDeblurPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """ + + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + self.model_dir: str = model_dir + + from .common import Filter + + # TODO: `Filter` should be moved to configurarion file of each model + self._transforms = [Filter(reserved_keys=['input', 'target'])] + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + """process the raw input data + + Args: + data Dict[str, Any] + + Returns: + Dict[str, Any]: the preprocessed data + """ + for t in self._transforms: + data = t(data) + + return data + + @PREPROCESSORS.register_module( Fields.cv, module_name=Preprocessors.image_portrait_enhancement_preprocessor) diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 
6d326df3..a4f77684 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -3,7 +3,9 @@ import os.path as osp from io import BytesIO from typing import Any, Dict, List, Tuple, Union +import decord import json +import numpy as np import torch from PIL import Image from timm.data import create_transform @@ -12,6 +14,8 @@ from torchvision.transforms import Compose, Normalize, Resize, ToTensor from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import Preprocessors from modelscope.pipelines.base import Input +from modelscope.pipelines.cv.cmdssl_video_embedding_pipeline import ( + VCenterCrop, VCompose, VNormalize, VRescale, VToTensor) from modelscope.preprocessors import load_image from modelscope.utils.config import Config from modelscope.utils.constant import (Fields, Invoke, ModeKeys, ModelFile, @@ -22,10 +26,7 @@ from .ofa import * # noqa from .ofa.utils.collate import collate_fn from .ofa.utils.constant import OFA_TASK_KEY_MAPPING -__all__ = [ - 'OfaPreprocessor', - 'MPlugPreprocessor', -] +__all__ = ['OfaPreprocessor', 'MPlugPreprocessor', 'HiTeAPreprocessor'] @PREPROCESSORS.register_module( @@ -55,7 +56,9 @@ class OfaPreprocessor(Preprocessor): Tasks.text_classification: OfaTextClassificationPreprocessor, Tasks.text_summarization: OfaSummarizationPreprocessor, Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor, - Tasks.auto_speech_recognition: OfaASRPreprocessor + Tasks.auto_speech_recognition: OfaASRPreprocessor, + Tasks.sudoku: OfaSudokuPreprocessor, + Tasks.text2sql: OfaTextToSqlPreprocessor } model_dir = model_dir if osp.exists(model_dir) else snapshot_download( model_dir, user_agent={Invoke.KEY: Invoke.PREPROCESSOR}) @@ -387,3 +390,213 @@ class MPlugPreprocessor(Preprocessor): if self.cfg.task == Tasks.image_text_retrieval: output['index'] = index return output + + +@PREPROCESSORS.register_module( + Fields.multi_modal, module_name=Preprocessors.vldoc_preprocessor) +class VLDocPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + *args, + **kwargs): + """Preprocess data for the model `VLDocForDocVLEmbedding`. + + Args: + model_dir (str): model path in model hub. + mode (str): model mode, in ('train', 'eval', 'inference'). + """ + super().__init__(*args, **kwargs) + + self.model_dir = model_dir + self.mode = mode + + model_cfg_path = osp.join(model_dir, 'config.json') + with open(model_cfg_path, 'r', encoding='utf-8') as f: + model_cfg = json.load(f) + + from modelscope.models.multi_modal.vldoc.tokenization import VLDocXLMTokenizer + tokenizer_path = osp.join(model_dir, ModelFile.TOKENIZER_FOLDER) + self.tokenizer = VLDocXLMTokenizer.from_pretrained(tokenizer_path) + + from modelscope.models.multi_modal.vldoc.processing import Processor, ImageProcessor + self.img_proc = ImageProcessor( + do_preprocess=True, + do_resize=True, + image_size={ + 'height': model_cfg['image_size'][0], + 'width': model_cfg['image_size'][1], + }, + do_normalize=True, + apply_ocr=False) + self.proc = Processor( + max_seq_length=model_cfg['max_seq_length'], + max_block_num=model_cfg['max_block_num'], + img_processor=self.img_proc, + tokenizer=self.tokenizer, + width=model_cfg['image_size'][1], + height=model_cfg['image_size'][0], + ) + + def __call__(self, input: Dict[str, Any], *args, + **kwargs) -> Dict[str, Any]: + """ + Args: + input: { + 'images': ['img_path1', 'img_path2', ...], + 'ocr_info_paths': ['json_path1', 'json_path2', ...] 
+ } + Return: + encodings: Dict[str, Tensor] + """ + + ocr_infos = [] + for one_ocr_info_path in input['ocr_info_paths']: + with open(one_ocr_info_path, 'r') as f: + ocr_info = json.load(f) + ocr_info = ocr_info['form'] + ocr_infos.append(ocr_info) + + proc_input = {'images': input['images'], 'ocr_infos': ocr_infos} + encodings = self.proc(**proc_input) + + return encodings + + +@PREPROCESSORS.register_module( + Fields.multi_modal, module_name=Preprocessors.hitea_tasks_preprocessor) +class HiTeAPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + tokenizer_max_length: int = 25, + *args, + **kwargs): + super().__init__(*args, **kwargs) + self.model_dir = model_dir + self.mode = mode + self.tokenizer_max_length = tokenizer_max_length + + self._tokenizer = None + self._patch_resize_transform = None + self._num_frames = None + self._video_map = {} + + @property + def tokenizer(self): + from transformers import BertTokenizer + + if self._tokenizer is None: + self._tokenizer = BertTokenizer.from_pretrained(self.model_dir) + return self._tokenizer + + @property + def patch_resize_transform(self): + if self._patch_resize_transform is None: + from torchvision import transforms + from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig + + config = HiTeAConfig.from_yaml_file( + osp.join(self.model_dir, CONFIG_NAME)) + + mean = (0.48145466, 0.4578275, 0.40821073) + std = (0.26862954, 0.26130258, 0.27577711) + + self._patch_resize_transform = transforms.Compose([ + transforms.Resize((config.image_res, config.image_res), + interpolation=Image.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=mean, std=std), + ]) + return self._patch_resize_transform + + @property + def num_frames(self): + if self._num_frames is None: + from torchvision import transforms + from modelscope.models.multi_modal.mplug import CONFIG_NAME, HiTeAConfig + + config = HiTeAConfig.from_yaml_file( + osp.join(self.model_dir, CONFIG_NAME)) + + self._num_frames = config.num_frames + return self._num_frames + + def video_open(self, path: str) -> Tuple[decord.VideoReader, int]: + if path not in self._video_map: + index = len(self._video_map) + vr = decord.VideoReader(path, ctx=decord.cpu(0)) + self._video_map[path] = (vr, index) + return self._video_map[path] + + def sample_frames(self, num_frames: int, vlen: int) -> List[int]: + acc_samples = min(num_frames, vlen) + # split the video into `acc_samples` intervals, and sample from each interval. 
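        # Worked example (illustrative): with num_frames=4 and vlen=12,
        # intervals = [0, 3, 6, 9, 12], ranges = [(0, 2), (3, 5), (6, 8), (9, 11)],
        # and the midpoints computed below give frame_indices = [1, 4, 7, 10].
        # When vlen < num_frames, the indices are padded with the last frame index.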
+ intervals = np.linspace( + start=0, stop=vlen, num=acc_samples + 1).astype(int) + ranges = [] + for idx, interv in enumerate(intervals[:-1]): + ranges.append((interv, intervals[idx + 1] - 1)) + + frame_indices = [(x[0] + x[1]) // 2 for x in ranges] + + if len(frame_indices) < num_frames: # padded with last frame + padded_frame_indices = [frame_indices[-1]] * num_frames + padded_frame_indices[:len(frame_indices)] = frame_indices + frame_indices = padded_frame_indices + return frame_indices + + def __call__( + self, data: Union[decord.VideoReader, tuple, + Dict[str, Any]]) -> Dict[str, Any]: + self.cfg = Config.from_file( + osp.join(self.model_dir, ModelFile.CONFIGURATION)) + + if isinstance(data, (decord.VideoReader, str)): + video = data + elif isinstance(data, tuple): + video = data[0] + else: + video = data['video'] + index = 0 + if isinstance(video, str): + video, index = self.video_open(video) + frame_indices = self.sample_frames(self.num_frames, len(video)) + video.seek(0) + video = torch.from_numpy(video.get_batch(frame_indices).asnumpy()) + video = [ + self.patch_resize_transform(Image.fromarray(f)) + for f in video.numpy() + ] + video = torch.stack(video, dim=0) + question = '' if self.cfg.task == Tasks.video_captioning \ + else data[1 if isinstance(data, tuple) + else ('text' if 'text' in data else 'question')] + question = self.tokenizer( + question.lower(), + padding='max_length', + truncation=True, + max_length=self.tokenizer_max_length, + return_tensors='pt') + + if self.mode == ModeKeys.INFERENCE: + video = torch.stack([video], dim=0) + return {'video': video, 'question': question} + else: + answer = data['answer'] + answer = self.tokenizer( + answer, + padding='max_length', + truncation=True, + max_length=self.tokenizer_max_length, + return_tensors='pt') + output = { + 'video': video, + 'question_input_ids': question.input_ids.squeeze(), + 'question_attention_mask': question.attention_mask.squeeze(), + 'answer_input_ids': answer.input_ids.squeeze(), + 'answer_attention_mask': answer.attention_mask.squeeze(), + } + return output diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index c6fa2025..4add627e 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -6,7 +6,6 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .text_error_correction import TextErrorCorrectionPreprocessor from .text_generation_preprocessor import TextGenerationJiebaPreprocessor - from .sentence_piece_preprocessor import SentencePiecePreprocessor from .bert_seq_cls_tokenizer import Tokenize from .document_segmentation_preprocessor import DocumentSegmentationTransformersPreprocessor from .faq_question_answering_preprocessor import FaqQuestionAnsweringTransformersPreprocessor @@ -15,7 +14,8 @@ if TYPE_CHECKING: from .relation_extraction_preprocessor import RelationExtractionTransformersPreprocessor from .text_classification_preprocessor import TextClassificationTransformersPreprocessor from .sentence_embedding_preprocessor import SentenceEmbeddingTransformersPreprocessor - from .text_generation_preprocessor import TextGenerationTransformersPreprocessor, TextGenerationT5Preprocessor + from .text_generation_preprocessor import TextGenerationTransformersPreprocessor, \ + TextGenerationT5Preprocessor, TextGenerationSentencePiecePreprocessor, SentencePiecePreprocessor from .token_classification_preprocessor import TokenClassificationTransformersPreprocessor, \ 
WordSegmentationBlankSetToLabelPreprocessor from .token_classification_thai_preprocessor import WordSegmentationPreprocessorThai, NERPreprocessorThai @@ -29,9 +29,9 @@ if TYPE_CHECKING: from .space_T_cn import TableQuestionAnsweringPreprocessor from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor from .translation_evaluation_preprocessor import TranslationEvaluationPreprocessor + from .dialog_classification_use_preprocessor import DialogueClassificationUsePreprocessor else: _import_structure = { - 'sentence_piece_preprocessor': ['SentencePiecePreprocessor'], 'bert_seq_cls_tokenizer': ['Tokenize'], 'document_segmentation_preprocessor': ['DocumentSegmentationTransformersPreprocessor'], @@ -48,7 +48,10 @@ else: ['SentenceEmbeddingTransformersPreprocessor'], 'text_generation_preprocessor': [ 'TextGenerationTransformersPreprocessor', - 'TextGenerationJiebaPreprocessor', 'TextGenerationT5Preprocessor' + 'TextGenerationJiebaPreprocessor', + 'TextGenerationT5Preprocessor', + 'TextGenerationSentencePiecePreprocessor', + 'SentencePiecePreprocessor', ], 'token_classification_preprocessor': [ 'TokenClassificationTransformersPreprocessor', @@ -79,6 +82,8 @@ else: 'space_T_cn': ['TableQuestionAnsweringPreprocessor'], 'translation_evaluation_preprocessor': ['TranslationEvaluationPreprocessor'], + 'dialog_classification_use_preprocessor': + ['DialogueClassificationUsePreprocessor'] } import sys diff --git a/modelscope/preprocessors/nlp/dialog_classification_use_preprocessor.py b/modelscope/preprocessors/nlp/dialog_classification_use_preprocessor.py new file mode 100644 index 00000000..27b9cdaa --- /dev/null +++ b/modelscope/preprocessors/nlp/dialog_classification_use_preprocessor.py @@ -0,0 +1,64 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict, List, Tuple + +import torch +from torch.nn.utils.rnn import pad_sequence +from transformers import BertTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields +from modelscope.utils.hub import parse_label_mapping + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.dialog_use_preprocessor) +class DialogueClassificationUsePreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + label2id: Dict = None, + max_length: int = None): + """The preprocessor for user satisfaction estimation task, based on transformers' tokenizer. + + Args: + model_dir: The model dir containing the essential files to build the tokenizer. + label2id: The dict with label-id mappings, default the label_mapping.json file in the model_dir. + max_length: The max length of dialogue, default 30. + """ + super().__init__() + self.model_dir: str = model_dir + self.tokenizer = BertTokenizer.from_pretrained(self.model_dir) + self.max_seq_len = min(max_length, + 30) if max_length is not None else 30 + self.label2id = label2id + if self.label2id is None and self.model_dir is not None: + self.label2id = parse_label_mapping(self.model_dir) + + @property + def id2label(self): + """Return the id2label mapping according to the label2id mapping. + + @return: The id2label mapping if exists. 
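        For example (made-up labels, shown purely for illustration):
            >>> preprocessor.label2id = {'unsatisfied': 0, 'satisfied': 1}
            >>> preprocessor.id2label
            {0: 'unsatisfied', 1: 'satisfied'}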
+ """ + if self.label2id is not None: + return {id: label for label, id in self.label2id.items()} + return None + + def __call__(self, data: List[Tuple[str]]) -> Dict[str, Any]: + input_ids = [] + for pair in data: + ids = [] + for sent in str(pair).split('|||'): + ids += self.tokenizer.encode(sent)[1:] + if len(ids) >= self.max_seq_len - 1: + ids = ids[:self.max_seq_len - 2] + [102] + break + input_ids.append([101] + ids) # [CLS] + (max_len-1) tokens + input_ids = [torch.tensor(utt, dtype=torch.long) for utt in input_ids] + input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0) + input_ids = input_ids.view(1, len(data), -1) + rst = {'input_ids': input_ids} + return rst diff --git a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py index be922bf7..29c6c58f 100644 --- a/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py +++ b/modelscope/preprocessors/nlp/document_segmentation_preprocessor.py @@ -232,7 +232,6 @@ class DocumentSegmentationTransformersPreprocessor(Preprocessor): continue output_samples = {} - output_samples['input_ids'] = new_input_ids output_samples['token_type_ids'] = new_token_type_ids output_samples['attention_mask'] = new_attention_mask diff --git a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py index bdf8b30f..eb8c501f 100644 --- a/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py +++ b/modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py @@ -2,6 +2,8 @@ from typing import Any, Dict +import torch + from modelscope.metainfo import Preprocessors from modelscope.preprocessors import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS @@ -19,6 +21,7 @@ class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor): tokenizer='BertTokenizer', query_set='query_set', support_set='support_set', + query_label='query_label', label_in_support_set='label', text_in_support_set='text', sequence_length=None, @@ -49,6 +52,7 @@ class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor): else: self.max_len = kwargs.get('max_seq_length', 50) self.label_dict = None + self.query_label = query_label self.query_set = query_set self.support_set = support_set self.label_in_support_set = label_in_support_set @@ -78,6 +82,10 @@ class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor): @type_assert(object, Dict) def __call__(self, data: Dict[str, Any], **preprocessor_param) -> Dict[str, Any]: + invoke_mode = preprocessor_param.get('mode', None) + if self.mode in (ModeKeys.TRAIN, + ModeKeys.EVAL) and invoke_mode != ModeKeys.INFERENCE: + return data tmp_max_len = preprocessor_param.get( 'sequence_length', preprocessor_param.get('max_seq_length', self.max_len)) @@ -111,11 +119,28 @@ class FaqQuestionAnsweringTransformersPreprocessor(Preprocessor): supportset_labels_ids = [ label_dict.index(label) for label in supportset_labels_ori ] - return { - 'query': queryset_padded, - 'support': supportset_padded, - 'support_labels': supportset_labels_ids + + query_atttention_mask = torch.ne( + torch.tensor(queryset_padded, dtype=torch.int32), + self.tokenizer.pad_token_id) + support_atttention_mask = torch.ne( + torch.tensor(supportset_padded, dtype=torch.int32), + self.tokenizer.pad_token_id) + + result = { + 'query': torch.LongTensor(queryset_padded), + 'support': torch.LongTensor(supportset_padded), + 'query_attention_mask': 
query_atttention_mask, + 'support_attention_mask': support_atttention_mask, + 'support_labels': torch.LongTensor(supportset_labels_ids) } + if self.query_label in data: + query_label = data[self.query_label] + query_label_ids = [ + label_dict.index(label) for label in query_label + ] + result['labels'] = torch.LongTensor(query_label_ids) + return result def batch_encode(self, sentence_list: list, max_length=None): if not max_length: diff --git a/modelscope/preprocessors/nlp/mgeo_ranking_preprocessor.py b/modelscope/preprocessors/nlp/mgeo_ranking_preprocessor.py new file mode 100644 index 00000000..c7f3677f --- /dev/null +++ b/modelscope/preprocessors/nlp/mgeo_ranking_preprocessor.py @@ -0,0 +1,199 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +import torch +from transformers import AutoTokenizer + +from modelscope.metainfo import Preprocessors +from modelscope.preprocessors import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.type_assert import type_assert + + +class GisUtt: + + def __init__(self, pad_token_id, cls_token_id): + self.pad_token_id = pad_token_id + self.cls_token_id = cls_token_id + self.input_ids = None + self.attention_mask = None + self.token_type_ids = None + self.rel_type_ids = None + self.absolute_position_ids = None + self.relative_position_ids = None + self.prov_ids = None + self.city_ids = None + self.dist_ids = None + self.max_length = 32 + + def update(self, gis_input_ids, gis_token_type_ids, gis_rel_type_ids, + gis_absolute_position_ids, gis_relative_position_ids, + gis_prov_ids, gis_city_ids, gis_dist_ids, china_version): + gis_input_ids = [[self.cls_token_id] + f for f in gis_input_ids] + gis_token_type_ids = [[self.pad_token_id] + f + for f in gis_token_type_ids] + gis_rel_type_ids = [[self.pad_token_id] + f for f in gis_rel_type_ids] + gis_absolute_position_ids = [[[self.pad_token_id] * 4] + f + for f in gis_absolute_position_ids] + gis_relative_position_ids = [[[self.pad_token_id] * 4] + f + for f in gis_relative_position_ids] + if china_version: + gis_prov_ids = [[self.pad_token_id] + f for f in gis_prov_ids] + gis_city_ids = [[self.pad_token_id] + f for f in gis_city_ids] + gis_dist_ids = [[self.pad_token_id] + f for f in gis_dist_ids] + + gis_input_ids = [f[:self.max_length] for f in gis_input_ids] + gis_token_type_ids = [f[:self.max_length] for f in gis_token_type_ids] + gis_rel_type_ids = [f[:self.max_length] for f in gis_rel_type_ids] + gis_absolute_position_ids = [ + f[:self.max_length] for f in gis_absolute_position_ids + ] + gis_relative_position_ids = [ + f[:self.max_length] for f in gis_relative_position_ids + ] + if china_version: + gis_prov_ids = [f[:self.max_length] for f in gis_prov_ids] + gis_city_ids = [f[:self.max_length] for f in gis_city_ids] + gis_dist_ids = [f[:self.max_length] for f in gis_dist_ids] + + max_length = max([len(item) for item in gis_input_ids]) + self.input_ids = torch.tensor([ + f + [self.pad_token_id] * (max_length - len(f)) + for f in gis_input_ids + ], + dtype=torch.long) # noqa: E126 + self.attention_mask = torch.tensor( + [ + [1] * len(f) + [0] * # noqa: W504 + (max_length - len(f)) for f in gis_input_ids + ], + dtype=torch.long) # noqa: E126 + self.token_type_ids = torch.tensor([ + f + [self.pad_token_id] * (max_length - len(f)) + for f in gis_token_type_ids + ], + dtype=torch.long) # noqa: E126 + self.rel_type_ids = torch.tensor([ + f + [self.pad_token_id] * (max_length 
- len(f)) + for f in gis_rel_type_ids + ], + dtype=torch.long) # noqa: E126 + + self.absolute_position_ids = torch.tensor( + [ + f + [[self.pad_token_id] * 4] * (max_length - len(f)) + for f in gis_absolute_position_ids + ], + dtype=torch.long) # noqa: E126 + self.relative_position_ids = torch.tensor( + [ + f + [[self.pad_token_id] * 4] * (max_length - len(f)) + for f in gis_relative_position_ids + ], + dtype=torch.long) # noqa: E126 + if china_version: + self.prov_ids = torch.tensor([ + f + [self.pad_token_id] * (max_length - len(f)) + for f in gis_prov_ids + ], + dtype=torch.long) # noqa: E126 + self.city_ids = torch.tensor([ + f + [self.pad_token_id] * (max_length - len(f)) + for f in gis_city_ids + ], + dtype=torch.long) # noqa: E126 + self.dist_ids = torch.tensor([ + f + [self.pad_token_id] * (max_length - len(f)) + for f in gis_dist_ids + ], + dtype=torch.long) # noqa: E126 + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.mgeo_ranking) +class MGeoRankingTransformersPreprocessor(Preprocessor): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + first_sequence='source_sentence', + second_sequence='sentences_to_compare', + first_sequence_gis='first_sequence_gis', + second_sequence_gis='second_sequence_gis', + label='labels', + qid='qid', + max_length=None, + **kwargs): + """The tokenizer preprocessor class for the text ranking preprocessor. + + Args: + model_dir(str, `optional`): The model dir used to parse the label mapping, can be None. + first_sequence(str, `optional`): The key of the first sequence. + second_sequence(str, `optional`): The key of the second sequence. + label(str, `optional`): The keys of the label columns, default `labels`. + qid(str, `optional`): The qid info. + mode: The mode for the preprocessor. + max_length: The max sequence length which the model supported, + will be passed into tokenizer as the 'max_length' param. 
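        Example (a rough sketch only: the model path is a placeholder and the
        optional gis fields are omitted):
            >>> preprocessor = MGeoRankingTransformersPreprocessor(
            ...     '/path/to/mgeo-ranking-model')  # placeholder path
            >>> features = preprocessor({
            ...     'source_sentence': ['阿里巴巴西溪园区'],
            ...     'sentences_to_compare': ['文一西路969号', '望京东园区'],
            ... })
            >>> # features holds one tokenized (source, candidate) pair per candidate sentence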
+ """ + super().__init__(mode) + self.model_dir = model_dir + self.first_sequence = first_sequence + self.second_sequence = second_sequence + self.first_sequence_gis = first_sequence_gis + self.second_sequence_gis = second_sequence_gis + + self.label = label + self.qid = qid + self.sequence_length = max_length if max_length is not None else kwargs.get( + 'sequence_length', 128) + kwargs.pop('sequence_length', None) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, dict) + def __call__(self, + data: Dict, + padding='longest', + truncation=True, + **kwargs) -> Dict[str, Any]: + sentence1 = data.get(self.first_sequence) + sentence2 = data.get(self.second_sequence) + labels = data.get(self.label) + qid = data.get(self.qid) + sentence1_gis = data.get(self.first_sequence_gis) + sentence2_gis = data.get(self.second_sequence_gis) + if sentence1_gis is not None: + sentence1_gis *= len(sentence2) + + if isinstance(sentence2, str): + sentence2 = [sentence2] + if isinstance(sentence1, str): + sentence1 = [sentence1] + sentence1 = sentence1 * len(sentence2) + kwargs['max_length'] = kwargs.get( + 'max_length', kwargs.pop('sequence_length', self.sequence_length)) + if 'return_tensors' not in kwargs: + kwargs['return_tensors'] = 'pt' + feature = self.tokenizer( + sentence1, + sentence2, + padding=padding, + truncation=truncation, + **kwargs) + if labels is not None: + feature['labels'] = labels + if qid is not None: + feature['qid'] = qid + if sentence1_gis is not None: + feature['sentence1_gis'] = sentence1_gis + gis = GisUtt(0, 1) + feature['gis1'] = gis + + if sentence2_gis is not None: + feature['sentence2_gis'] = sentence2_gis + gis = GisUtt(0, 1) + feature['gis2'] = gis + + return feature diff --git a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py index 77d65dec..5930e007 100644 --- a/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py +++ b/modelscope/preprocessors/nlp/sentence_embedding_preprocessor.py @@ -21,7 +21,7 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): first_sequence='source_sentence', second_sequence='sentences_to_compare', mode=ModeKeys.INFERENCE, - use_fast: bool = None, + use_fast: bool = True, max_length: int = None, **kwargs): """The preprocessor for sentence embedding task, based on transformers' tokenizer. @@ -70,12 +70,14 @@ class SentenceEmbeddingTransformersPreprocessor(Preprocessor): Returns: Dict[str, Any]: the preprocessed data """ - source_sentence = data[self.first_sequence] - compare_sentences = data[self.second_sequence] - sentences = [source_sentence[0]] - for sent in compare_sentences: - sentences.append(sent) - + source_sentences = data[self.first_sequence] + if self.second_sequence in data: + compare_sentences = data[self.second_sequence] + sentences = [source_sentences[0]] + for sent in compare_sentences: + sentences.append(sent) + else: + sentences = source_sentences if 'return_tensors' not in kwargs: kwargs[ 'return_tensors'] = 'pt' if self.mode == ModeKeys.INFERENCE else None diff --git a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py b/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py deleted file mode 100644 index fbaa7ace..00000000 --- a/modelscope/preprocessors/nlp/sentence_piece_preprocessor.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-import os -import os.path as osp - -import sentencepiece as spm -import torch - -from modelscope.metainfo import Preprocessors -from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys - - -@PREPROCESSORS.register_module( - Fields.nlp, module_name=Preprocessors.sentence_piece) -class SentencePiecePreprocessor(Preprocessor): - - def __init__(self, - model_dir: str, - mode=ModeKeys.INFERENCE, - *args, - **kwargs): - """The preprocessor for the sentence piece tokenizer. - - Args: - model_dir: The model dir contains the essential files used by the `SentencePieceProcessor`. - mode: The mode for the preprocessor. - """ - - super().__init__(mode) - self.tokenizer = None - for file_name in os.listdir(model_dir): - if file_name.endswith('.model'): - m_file = osp.join(model_dir, file_name) - self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) - break - assert self.tokenizer is not None, 'Can not find .model file' - - def __call__(self, data: str) -> torch.Tensor: - return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) - - def decode(self, tokens, **kwargs): - """Decode the tokens to real text. - - Args: - tokens: The output tokens from model's `forward` and `generate` - - Returns: - The actual text. - """ - return self.tokenizer.decode(tokens) diff --git a/modelscope/preprocessors/nlp/text_error_correction.py b/modelscope/preprocessors/nlp/text_error_correction.py index 357a946f..e3a1433d 100644 --- a/modelscope/preprocessors/nlp/text_error_correction.py +++ b/modelscope/preprocessors/nlp/text_error_correction.py @@ -3,6 +3,8 @@ import os.path as osp from typing import Any, Dict +import torch + from modelscope.metainfo import Preprocessors from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS @@ -24,6 +26,8 @@ class TextErrorCorrectionPreprocessor(Preprocessor): """ super().__init__(*args, **kwargs) self.vocab = Dictionary.load(osp.join(model_dir, 'dict.src.txt')) + self.max_length = 100 + 1 # 1 is eos token + self.padding_value = 2 def __call__(self, data: str) -> Dict[str, Any]: """process the raw input data @@ -44,7 +48,12 @@ class TextErrorCorrectionPreprocessor(Preprocessor): text = ' '.join([x for x in data]) inputs = self.vocab.encode_line( text, append_eos=True, add_if_not_exist=False) - lengths = inputs.size() - sample = dict() - sample['net_input'] = {'src_tokens': inputs, 'src_lengths': lengths} - return sample + lengths = inputs.size()[0] + + padding = torch.tensor([self.padding_value] * # noqa: W504 + (self.max_length - lengths)) + inputs = torch.unsqueeze(torch.cat([padding, inputs]), dim=0) + lengths = torch.tensor([lengths]) + out = {'src_tokens': inputs, 'src_lengths': lengths} + + return out diff --git a/modelscope/preprocessors/nlp/text_generation_preprocessor.py b/modelscope/preprocessors/nlp/text_generation_preprocessor.py index c1c32bf0..5f30b70a 100644 --- a/modelscope/preprocessors/nlp/text_generation_preprocessor.py +++ b/modelscope/preprocessors/nlp/text_generation_preprocessor.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import os import os.path as osp from typing import Any, Dict, Optional, Union @@ -161,12 +162,7 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): output = self.nlp_tokenizer(sequence1, **kwargs) if self.mode != ModeKeys.INFERENCE: if sequence2 is not None: - self.nlp_tokenizer.tokenize_kwargs[ - 'max_length'] = self.tgt_length - labels = self.nlp_tokenizer(sequence2)['input_ids'] - self.nlp_tokenizer.tokenize_kwargs[ - 'max_length'] = self.src_length - + labels = self._get_labels_from_tgt(sequence2) src_input_ids = output['input_ids'] src_attention_mask = output['attention_mask'] else: @@ -181,6 +177,12 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase): } return output + def _get_labels_from_tgt(self, sequence: str) -> torch.Tensor: + self.nlp_tokenizer.tokenize_kwargs['max_length'] = self.tgt_length + labels = self.nlp_tokenizer(sequence)['input_ids'] + self.nlp_tokenizer.tokenize_kwargs['max_length'] = self.src_length + return labels + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) @@ -192,16 +194,11 @@ class TextGenerationJiebaPreprocessor(TextGenerationPreprocessorBase): model_dir: str, mode: str = ModeKeys.INFERENCE, src_txt='src_txt', - tgt_txt=None, + tgt_txt='tgt_txt', sequence_length: int = 128, use_fast=None): from modelscope.models.nlp.gpt3 import JiebaBPETokenizer super().__init__(mode, src_txt, tgt_txt) - if self.tgt_txt is not None: - logger.warning( - f'TextGenerationJiebaPreprocessor currently does not support training, ' - f'the {self.tgt_txt} of the tgt_txt field will be ignored.') - self.src_txt = src_txt self.tokenizer = JiebaBPETokenizer( osp.join(model_dir, 'tokenizer.json')) self.max_length = sequence_length @@ -250,9 +247,70 @@ class TextGenerationJiebaPreprocessor(TextGenerationPreprocessorBase): 'tokens': tokens[:-1], 'labels': tokens[1:], 'prompt_length': prompt_length, + 'is_pair': int(sequence2 is not None), } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_piece) +class TextGenerationSentencePiecePreprocessor(TextGenerationPreprocessorBase): + + def __init__(self, + model_dir: str, + mode: str = ModeKeys.INFERENCE, + src_txt='src_txt', + tgt_txt=None, + **kwargs): + """ + + Args: + model_dir: The model dir of the sentence piece model. + mode: The preprocessor mode, currently either mode will have the same behaviour. + src_txt: The key of input text, if input format is dict. + tgt_txt: The key of target text, used in training. 
+ + Examples: + >>> from modelscope.utils.hub import snapshot_download + >>> from modelscope.preprocessors import TextGenerationSentencePiecePreprocessor + >>> model_dir = snapshot_download('langboat/mengzi-gpt-neo-base') + >>> preprocessor = TextGenerationSentencePiecePreprocessor(model_dir) + >>> print(preprocessor('test word')) + """ + if 'first_sequence' in kwargs: + src_txt = kwargs.pop('first_sequence') + + import sentencepiece as spm + super().__init__(mode, src_txt, tgt_txt) + self.tokenizer = None + for file_name in os.listdir(model_dir): + if file_name.endswith('.model'): + m_file = osp.join(model_dir, file_name) + self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) + break + assert self.tokenizer is not None, 'Can not find .model file' + + def __call__(self, data: Union[Dict, str], **kwargs): + text_a, text_b = parse_text_and_label(data, self.mode, self.src_txt, + self.tgt_txt)[0:2] + + return self._tokenize_text(text_a, text_b, **kwargs) + + def _tokenize_text(self, sequence1, sequence2=None, **kwargs): + return torch.tensor( + self.tokenizer.encode([sequence1]), dtype=torch.long) + + def decode(self, tokens, **kwargs): + """Decode the tokens to real text. + + Args: + tokens: The output tokens from model's `forward` and `generate` + + Returns: + The actual text. + """ + return self.tokenizer.decode(tokens) + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.text2text_gen_preprocessor) class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor): @@ -283,3 +341,6 @@ class TextGenerationT5Preprocessor(TextGenerationTransformersPreprocessor): padding=kwargs.pop('padding', 'max_length'), return_token_type_ids=kwargs.pop('return_token_type_ids', False), **kwargs) + + +SentencePiecePreprocessor = TextGenerationSentencePiecePreprocessor diff --git a/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py index f2ea73f6..a45c7309 100644 --- a/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py +++ b/modelscope/preprocessors/nlp/token_classification_thai_preprocessor.py @@ -1,13 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, Tuple, Union - -import torch +from typing import Any, Dict, Union from modelscope.metainfo import Preprocessors -from modelscope.outputs import OutputKeys from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, ModeKeys +from modelscope.utils.constant import Fields from modelscope.utils.type_assert import type_assert from .token_classification_preprocessor import \ TokenClassificationTransformersPreprocessor diff --git a/modelscope/preprocessors/nlp/transformers_tokenizer.py b/modelscope/preprocessors/nlp/transformers_tokenizer.py index 2cec4b93..9a14ef9a 100644 --- a/modelscope/preprocessors/nlp/transformers_tokenizer.py +++ b/modelscope/preprocessors/nlp/transformers_tokenizer.py @@ -82,7 +82,7 @@ class NLPTokenizer: model_dir) if model_dir is not None else tokenizer() if model_type in (Models.structbert, Models.gpt3, Models.palm, - Models.plug): + Models.plug, Models.megatron_bert): from transformers import BertTokenizer, BertTokenizerFast tokenizer = BertTokenizerFast if self.use_fast else BertTokenizer return tokenizer.from_pretrained( diff --git a/modelscope/preprocessors/ofa/__init__.py b/modelscope/preprocessors/ofa/__init__.py index ad6c3c48..a4faf3ff 100644 --- a/modelscope/preprocessors/ofa/__init__.py +++ b/modelscope/preprocessors/ofa/__init__.py @@ -3,7 +3,9 @@ from .asr import OfaASRPreprocessor from .image_captioning import OfaImageCaptioningPreprocessor from .image_classification import OfaImageClassificationPreprocessor from .ocr_recognition import OfaOcrRecognitionPreprocessor +from .sudoku import OfaSudokuPreprocessor from .summarization import OfaSummarizationPreprocessor +from .text2sql import OfaTextToSqlPreprocessor from .text_classification import OfaTextClassificationPreprocessor from .text_to_image_synthesis import OfaTextToImageSynthesisPreprocessor from .visual_entailment import OfaVisualEntailmentPreprocessor diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index c2b61c5e..b8fd9ede 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -22,6 +22,9 @@ from .utils.random_help import set_torch_seed class OfaBasePreprocessor: + r""" + OFA base preprocessor for + """ def __init__(self, cfg, model_dir, mode, *args, **kwargs): """preprocess the data via the vocab.txt from the `model_dir` path @@ -45,6 +48,8 @@ class OfaBasePreprocessor: # there will be no need to use param: use_bpe tokenizer.add_tokens([''.format(i) for i in range(8192)]) tokenizer.add_tokens([''.format(i) for i in range(1000)]) + if self.cfg.model.get('multimodal_type', 'default') == 'text2sql': + tokenizer.add_tokens(['>=', '<=']) self.tokenizer = tokenizer self.bos_item = torch.LongTensor([tokenizer.bos_token_id]) self.pad_item = torch.LongTensor([tokenizer.pad_token_id]) @@ -100,6 +105,20 @@ class OfaBasePreprocessor: self.test_audio_feature_transforms = None def tokenize_text(self, text, add_bos=True, add_eos=True): + r""" + Using `OFATokenizer` to tokenize text input. + + Args: + text (`str`): Input text. + add_bos ('bool', **optional**, default to `True`) + Whether or not to add beginning of sentence token in + the front of sentence. + add_eos ('bool', **optional**, default to `True`) + Whether or not to add ending of sentence token in + the end of sentence. 
+ Returns: + A list of tokens with the max length of `max_src_length + 2` + """ if text is None: return None inputs = self.tokenizer( @@ -116,6 +135,27 @@ class OfaBasePreprocessor: @staticmethod def pre_caption(caption, max_words=None): + r""" + Preprocessing for text sentence. + + step 1. Get the lower case of input text. + step 2. Remove the words within `,.!?*#:;~ ` in the beginning + of the sentence. + step 3. Replace the words within `-/` or pattern `\s{2,}` with word ` ` + and replace tag `` with `person`. + step 4. Remove the `\n` in the end of the sentence. + step 5. Split the sentence with token ` `, If `max_words` is not None, + make a length truncation. + + Args: + caption (`str`): Input text. + max_words (`int`, **optional**, default `None`): + The max length of input text. If None, do nothing, else + make a truncation. + + Returns: + A sequence of `str`. + """ caption = caption.lower().lstrip(',.!?*#:;~').replace('-', ' ') \ .replace('/', ' ').replace('', 'person') @@ -136,6 +176,27 @@ class OfaBasePreprocessor: @staticmethod def pre_question(question, max_ques_words): + r""" + Preprocessing for text sentence. + Note that this function is very similar to `pre_caption`, should be merged in the future version. + + step 1. Get the lower case of input text. + step 2. Remove the words within `,.!?*#:;~ ` in the beginning + of the sentence. + step 3. Replace the words within `-/` or pattern `\s{2,}` with word ` `. + step 4. Remove the `\n` in the end of the sentence. + step 5. Split the sentence with token ` `, If `max_words` is not None, + make a length truncation. + + Args: + question (`str`): Input text. + max_ques_words (`int`, **optional**, default `None`): + The max length of input text. If None, do nothing, else + make a truncation. + + Returns: + A sequence of `str`. + """ question = question.lower().lstrip(',.!?*#:;~').replace('-', ' ').replace( '/', ' ') @@ -156,6 +217,9 @@ class OfaBasePreprocessor: return question def add_constraint_mask(self, sample): + r""" + Add constraint mask. + """ target_itm = sample['target'] len_label_itm = target_itm.ne(self.pad_item).sum(dim=0).item() if self.constraint_trie: @@ -171,6 +235,18 @@ class OfaBasePreprocessor: sample['constraint_mask'] = constraint_mask def get_img_pil(self, path_or_url_or_pil): + r""" + Get the pillow image. If the input is not a pillow image ,it will load + image from a local path or an external url. + + Args: + path_or_url_or_pil (`Union[str, Image]`): + Can be: + - A path or url reference to an image + - A pillow image. + Returns: + A pillow image. + """ image = path_or_url_or_pil if isinstance(path_or_url_or_pil, Image.Image) \ else load_image(path_or_url_or_pil) return image diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index 5fb83908..38a65681 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -9,6 +9,9 @@ from .base import OfaBasePreprocessor class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for image captioning task. + """ def __init__(self, cfg, @@ -42,6 +45,24 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Preprocess the data using the logic of `_build_infer_sample` + and make sure the label data in the result. + step 2. Preprocess the label data. 
Contains: + - remove tokens within `!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~` and tripe + - tokenize the label as `target` value without `bos` token. + - add `bos` token and remove `eos` token of `target` as `prev_output_tokens`. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image`, `prompt` + and `label`, `image` refers the image input data, `prompt` refers the text + input data the `label` is the supervised data for training. + Return: + A dict object, contains source, image, mask, label, target tokens, + and previous output tokens data. + """ sample = self._build_infer_sample(data) target = sample['label'] target = target.translate(self.transtab).strip() @@ -53,6 +74,21 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Get the pillow image. + step 2. Do some transforms for the pillow image as the image input, + such as resize, normalize, to tensor etc. + step 3. Tokenize the prompt as text input. + step 4. Determine Whether or not to add labels to the sample. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` and `prompt`, + the former refers the image input data, and the later refers the text input data. + Return: + A dict object, contains source, image, mask and label data. + """ image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') diff --git a/modelscope/preprocessors/ofa/image_classification.py b/modelscope/preprocessors/ofa/image_classification.py index 038a9e15..750fd42a 100644 --- a/modelscope/preprocessors/ofa/image_classification.py +++ b/modelscope/preprocessors/ofa/image_classification.py @@ -18,6 +18,9 @@ Image.MAX_IMAGE_PIXELS = None class OfaImageClassificationPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for image classification task. + """ def __init__(self, cfg, @@ -84,6 +87,25 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Preprocess the data using the logic of `_build_infer_sample` + and make sure the label data in the result. + step 2. Preprocess the label data. Contains: + - add ` ` before the label value and add `ref_dict` value + - tokenize the label as `target` value without `bos` token. + - add `bos` token and remove `eos` token of `target` as `prev_output_tokens`. + - add constraints mask. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image`, + `prompt` and `label`, `image` refers the image input data, `prompt` + refers the text input data the `label` is the supervised data for training. + Return: + A dict object, contains source, image, mask, label, target tokens, + and previous output tokens data. + """ sample = self._build_infer_sample(data) target = ' {}'.format(sample['label']) sample['ref_dict'] = {sample['label']: 1.0} @@ -105,6 +127,21 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Get the pillow image. + step 2. Do some transforms for the pillow image as the image input, + such as resize, normalize, to tensor etc. + step 3. Tokenize the prompt as text input. 
+ step 4. Determine Whether or not to add labels to the sample. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` and `prompt`, + the former refers the image input data, and the later refers the text input data. + Return: + A dict object, contains source, image, mask and label data. + """ image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') @@ -112,7 +149,8 @@ class OfaImageClassificationPreprocessor(OfaBasePreprocessor): sample = { 'source': inputs, 'patch_image': patch_image, - 'patch_mask': torch.tensor([True]) + 'patch_mask': torch.tensor([True]), + 'decoder_prompt': self.bos_item, } if 'text' in self.column_map and self.column_map['text'] in data: sample['label'] = data[self.column_map['text']] diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index e15be93f..059bba28 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -13,6 +13,9 @@ from .base import OfaBasePreprocessor def ocr_resize(img, patch_image_size, is_document=False): + r""" + Image resize function for OCR tasks. + """ img = img.convert('RGB') width, height = img.size @@ -54,6 +57,9 @@ def ocr_resize(img, patch_image_size, is_document=False): class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for OCR recognition tasks. + """ def __init__(self, cfg, @@ -87,6 +93,24 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Preprocess the data using the logic of `_build_infer_sample` + and make sure the label data in the result. + step 2. Preprocess the label data. Contains: + - do tripe to the label value. + - tokenize the label as `target` value without `bos` token. + - add `bos` token and remove `eos` token of `target` as `prev_output_tokens`. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image`, `prompt` and `label`, + the former refers the image input data, and the later refers the text input data + the `label` is the supervised data for training. + Return: + A dict object, contains source, image, mask, label, target tokens, + and previous output tokens data. + """ sample = self._build_infer_sample(data) target = sample['label'] target_token_list = target.strip().split() @@ -97,6 +121,21 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Get the pillow image. + step 2. Do some transforms for the pillow image as the image input, + such as resize, normalize, to tensor etc. + step 3. Tokenize the prompt as text input. + step 4. Determine Whether or not to add labels to the sample. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` and `prompt`, + the former refers the image input data, and the later refers the text input data. + Return: + A dict object, contains source, image, image patch mask and label data. 
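        A rough usage sketch (illustrative only; `preprocessor` is assumed to be
        an `OfaOcrRecognitionPreprocessor` built from a downloaded OFA OCR model,
        and the image path is a placeholder):
            >>> sample = preprocessor._build_infer_sample({'image': 'scene_text.jpg'})
            >>> # sample holds 'source', 'patch_image' and 'patch_mask',
            >>> # plus 'label' when a text column is present in the input data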
+ """ image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', '图片上的文字是什么?') diff --git a/modelscope/preprocessors/ofa/sudoku.py b/modelscope/preprocessors/ofa/sudoku.py new file mode 100644 index 00000000..83c7f65c --- /dev/null +++ b/modelscope/preprocessors/ofa/sudoku.py @@ -0,0 +1,110 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +import numpy as np +import torch + +from modelscope.utils.constant import ModeKeys +from .base import OfaBasePreprocessor + + +class OfaSudokuPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for sudoku tasks + """ + + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + cfg(modelscope.utils.config.ConfigDict) : model config + model_dir (str): model path, + mode: preprocessor mode (model mode) + """ + super(OfaSudokuPreprocessor, self).__init__(cfg, model_dir, mode, + *args, **kwargs) + + self.instruction_text = self.cfg.model.get('prompt', + ' solve the sudoku .') + self.seg_embedding = self.cfg.get('seg_embedding', False) + self.max_struct_length = self.cfg.get('max_struct_length', 256) + if self.seg_embedding: + self.input_puzzle_row = [] + self.input_puzzle_col = [] + for idx in range(9): + for jdx in range(9): + self.input_puzzle_row.append(jdx + 1) + self.input_puzzle_col.append(idx + 1) + if not (idx == 8 and jdx == 8): + self.input_puzzle_row.append(0) + self.input_puzzle_col.append(0) + self.input_puzzle_col = torch.tensor(self.input_puzzle_col) + self.input_puzzle_row = torch.tensor(self.input_puzzle_row) + + instruct_seg = torch.zeros_like( + self.tokenize_text(self.instruction_text)) + input_puzzle_col = torch.cat([self.input_puzzle_col, instruct_seg]) + input_puzzle_row = torch.cat([self.input_puzzle_row, instruct_seg]) + self.input_puzzle_col = torch.cat( + [self.bos_item, input_puzzle_col, self.eos_item]) + self.input_puzzle_row = torch.cat( + [self.bos_item, input_puzzle_row, self.eos_item]) + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + build sample for training tasks. + + step 1. execute the `_build_infer_sample` function to get a batch sample + for inference. + step 2. process the label data for training. + """ + sample = self._build_infer_sample(data) + target = sample['label'] + target_token_list = target.lower().strip().split() + target = ' '.join(target_token_list[:self.max_tgt_length]) + sample['target'] = self.tokenize_text(target, add_bos=False) + sample['prev_output_tokens'] = torch.cat( + [self.bos_item, sample['target'][:-1]]) + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + build sample for inference tasks. + + step 1. Get the input random masked sudoku text input, which shold be + generated like below pseudo code. + >>> sudo = np.random.randint(1, 9, size=(9, 9)) # a pseudo sudoku + >>> sudo_text = " | ".join(" : ".join(str(c) for c in row) \ + >>> for row in sudo) + step 2. Limit the length, tokenize the input text and add the bos token + to the front of the input as source input. + step 3. Add a pseodo ids for every input. 
+ """ + assert 'text' in self.column_map and 'text' in data, \ + 'there must be `text` column in task key map and source data' + text = data[self.column_map['text']] # equal data['text'] + text = ' '.join(text.lower().strip().split()[:self.max_struct_length]) + src_item = self.tokenize_text(text + self.instruction_text) + src_item = src_item[:(self.max_src_length + self.max_struct_length)] + + sample = {'id': 0.0, 'source': src_item} + + if self.seg_embedding: + sample['seg_row_tokens'] = self.input_puzzle_row + sample['seg_col_tokens'] = self.input_puzzle_col + + if 'solution' in self.column_map and self.column_map[ + 'solution'] in data: + sample['label'] = ' {}'.format(data[self.column_map['solution']]) + return sample diff --git a/modelscope/preprocessors/ofa/summarization.py b/modelscope/preprocessors/ofa/summarization.py index d33e9d25..4eb45e32 100644 --- a/modelscope/preprocessors/ofa/summarization.py +++ b/modelscope/preprocessors/ofa/summarization.py @@ -8,6 +8,9 @@ from .base import OfaBasePreprocessor class OfaSummarizationPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for summarization tasks. + """ def __init__(self, cfg, @@ -32,6 +35,26 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Preprocess the data using the logic of `_build_infer_sample` + and make sure the label data in the result. + step 2. Preprocess the label data. Contains: + - Get the lower case of label, and using `pre_caption` function + to do the str preprocessing as new input label. + - Tokenize the new input label as `target` for model input. + - Add noise to the `target` + - Calculate the `prev_output_tokens` from noise `target` for model input. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image`, `prompt` and + `label`, `image` refers the image input data, `prompt` refers the text input data + and the `label` is the supervised data for training. + Return: + A dict object, contains source, image, mask, label, target tokens, + and previous output tokens data. + """ sample = self._build_infer_sample(data) target_str = sample['label'].lower() target = super().pre_caption(target_str, max_words=self.max_tgt_length) @@ -44,6 +67,22 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Preprocessing the input text via `pre_cation` function, see more + details from the doc of `pre_cation`. + step 2. Uniform the unknown token, such as `` -> `unk` and `` -> `unk`. + step 3. Get the prompt from input, concatenate with the input text, as new input. + step 4. Tokenize the input text and generate the decoder prompt. + step 5. Determine Whether or not to add labels to the sample. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` and `prompt`, + the former refers the image input data, and the later refers the text input data. + Return: + A dict object, contains text, decoder prompt and label data. + """ source = super().pre_caption( data[self.column_map['text']], max_words=self.max_src_length) source = source.replace('[unk]', 'unk').replace('', 'unk') @@ -66,6 +105,19 @@ class OfaSummarizationPreprocessor(OfaBasePreprocessor): return sample def add_noise_to_tgt(self, target): + r""" + Add noise token to the target sentence. + + step 1. 
Sampling from uniform distribution to randomly select the + noise indices. + step 2. Sampling from normal distribution as noise token to replace + the relative token in the target. + + Args: + target: A sequence of tokens. + Returns: + A sequence of tokens. + """ noise_indices = torch.FloatTensor( target.size(0)).uniform_() < self.cfg.model.get( 'noise_ratio', 0.0) diff --git a/modelscope/preprocessors/ofa/text2sql.py b/modelscope/preprocessors/ofa/text2sql.py new file mode 100644 index 00000000..63d3dff8 --- /dev/null +++ b/modelscope/preprocessors/ofa/text2sql.py @@ -0,0 +1,446 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +import random +import re +from typing import Any, Dict, List + +import torch + +from modelscope.utils.constant import ModeKeys +from .base import OfaBasePreprocessor +from .utils.bridge_content_encoder import get_database_matches +from .utils.get_tables import dump_db_json_schema + + +class OfaTextToSqlPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for text to sql tasks + """ + + def __init__(self, + cfg, + model_dir, + mode=ModeKeys.INFERENCE, + *args, + **kwargs): + """preprocess the data + + Args: + cfg(modelscope.utils.config.ConfigDict) : model config + model_dir (str): model path, + mode: preprocessor mode (model mode) + """ + super(OfaTextToSqlPreprocessor, self).__init__(cfg, model_dir, mode, + *args, **kwargs) + + self.instruction_text = self.cfg.model.get('prompt', + ' . generating sql code.') + self.max_struct_length = self.cfg.get('max_struct_length', 256) + self.separator = '\t' + self.db_schema_cache = {} + self.database_path = os.path.join( + os.path.abspath(model_dir), 'database') + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.mode == ModeKeys.TRAIN: + return self._build_train_sample(data) + else: + return self._build_infer_sample(data) + + def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + build sample for training tasks. + + step 1. Get the input question and database id from text input + step 2. Get the database structure input + step 3. Add a pseudo ids for every input. + step 4. Calculate the target and previous output items. 
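        The raw `text` column packs the SQL query, the natural language question
        and the database id, joined by the tab separator. A toy example
        (illustrative only; `preprocessor` is an assumed instance and the default
        column mapping is assumed):
            >>> text = '\t'.join([
            ...     'SELECT name FROM singer WHERE age > 20',          # query
            ...     'What are the names of singers older than 20 ?',   # question
            ...     'concert_singer',                                  # database id
            ... ])
            >>> sample = preprocessor._build_train_sample({'text': text})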
+ """ + assert 'text' in self.column_map and 'text' in data, \ + 'there must be `text` column in task key map and source data' + text = data[self.column_map['text']] # equal data['text'] + texts = text.split(self.separator) + assert len( + texts + ) == 3, 'invalid input, should contain query, question and database id' + query, question, db_id = texts + + # construct struct input + if db_id not in self.db_schema_cache: + self.db_schema_cache[db_id] = dump_db_json_schema( + self.database_path + '/' + db_id + '/' + db_id + '.sqlite', + db_id) + + question = ' '.join(question.strip().split()[:self.max_src_length]) + + seq_inputs = seq2seq_input(query, question, db_id, self.database_path, + self.db_schema_cache[db_id], self.cfg.model, + True) + struct_in = seq_inputs['struct_in'] + text = seq_inputs['text_in'] + seq_out = seq_inputs['seq_out'] + db_struct = seq_inputs['db_struct'] + + text = '{} ; structured knowledge: {}'.format( + text, struct_in) + self.instruction_text + src_item = self.tokenize_text(text + self.instruction_text) + src_item = src_item[:(self.max_src_length + self.max_struct_length + + 20)] + + tgt_item = self.tokenize_text( + ' {}'.format(seq_out), add_bos=False, + add_eos=False)[:self.max_tgt_length] + target_item = torch.cat([tgt_item, self.eos_item]) + prev_output_item = torch.cat([self.bos_item, tgt_item]) + + sample = { + 'id': 0.0, + 'source': src_item, + 'target': target_item, + 'prev_output_tokens': prev_output_item, + 'db_struct': db_struct + } + + return sample + + def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + build sample for inference tasks. + + step 1. Get the input question and database id from text input + step 2. Get the database structure input + step 3. Add a pseudo ids for every input. 
+ """ + assert 'text' in self.column_map and 'text' in data, \ + 'there must be `text` column in task key map and source data' + text = data[self.column_map['text']] # equal data['text'] + db_id = data.get(self.column_map['database'], 'culture_company') + db_id = db_id.strip() + + # construct struct input + if db_id not in self.db_schema_cache: + self.db_schema_cache[db_id] = dump_db_json_schema( + self.database_path + '/' + db_id + '/' + db_id + '.sqlite', + db_id) + + text = ' '.join(text.strip().split()[:self.max_src_length]) + + seq_inputs = seq2seq_input(None, text, db_id, self.database_path, + self.db_schema_cache[db_id], self.cfg.model) + struct_in = seq_inputs['struct_in'] + db_struct = seq_inputs['db_struct'] + text = '{} ; structured knowledge: {}'.format( + text, struct_in) + self.instruction_text + src_item = self.tokenize_text(text + self.instruction_text) + src_item = src_item[:(self.max_src_length + self.max_struct_length + + 20)] + + sample = {'id': 0.0, 'source': src_item, 'db_struct': db_struct} + + if 'solution' in self.column_map and self.column_map[ + 'solution'] in data: + sample['label'] = ' {}'.format(data[self.column_map['solution']]) + return sample + + +def seq2seq_input(query, + question, + db_id, + db_path, + schema, + args, + is_train=False): + ex = form_input_for_construction(query, question, db_id, db_path, schema) + serialized_schema = spider_add_serialized_schema( + ex, args)['serialized_schema'].strip() + if not is_train: + return { + 'struct_in': serialized_schema, + 'text_in': question, + 'db_struct': ex + } + question, seq_out = spider_pre_process_one_function(ex, args) + return { + 'struct_in': serialized_schema, + 'text_in': question, + 'seq_out': seq_out, + 'db_struct': ex + } + + +def spider_pre_process_one_function(item: dict, args): + prefix = '' + + seq_out = spider_get_target( + query=item['query'], + db_id=item['db_id'], + normalize_query=True, + target_with_db_id=args.target_with_db_id, + ) + + return prefix + item['question'].strip(), seq_out + + +def spider_get_target( + query: str, + db_id: str, + normalize_query: bool, + target_with_db_id: bool, +) -> str: + _normalize = normalize if normalize_query else (lambda x: x) + return f'{db_id} | {_normalize(query)}' if target_with_db_id else _normalize( + query) + + +def normalize(query: str) -> str: + + def comma_fix(s): + # Remove spaces in front of commas + return s.replace(' , ', ', ') + + def white_space_fix(s): + # Remove double and triple spaces + return ' '.join(s.split()) + + def lower(s): + # Convert everything except text between (single or double) quotation marks to lower case + return re.sub(r"\b(? dict: + if getattr(args, 'schema_serialization_with_nl'): + serialized_schema = serialize_schema_natural_language( + question=ex['question'], + db_path=ex['db_path'], + db_id=ex['db_id'], + db_column_names=ex['db_column_names'], + db_table_names=ex['db_table_names'], + db_primary_keys=ex['db_primary_keys'], + db_foreign_keys=ex['db_foreign_keys'], + schema_serialization_with_db_content=args. + schema_serialization_with_db_content, + normalize_query=True, + ) + else: + serialized_schema = serialize_schema( + question=ex['question'], + db_path=ex['db_path'], + db_id=ex['db_id'], + db_column_names=ex['db_column_names'], + db_table_names=ex['db_table_names'], + schema_serialization_type='peteshaw', + schema_serialization_randomized=False, + schema_serialization_with_db_id=True, + schema_serialization_with_db_content=args. 
+ schema_serialization_with_db_content, + normalize_query=True, + ) + return {'serialized_schema': serialized_schema} + + +def serialize_schema_natural_language( + question: str, + db_path: str, + db_id: str, + db_column_names: Dict[str, str], + db_table_names: List[str], + db_primary_keys, + db_foreign_keys, + schema_serialization_with_db_content: bool = False, + normalize_query: bool = True, +) -> str: + overall_description = f'{db_id} contains tables such as ' \ + f'{", ".join([name.lower() if normalize_query else name for name in db_table_names])}.' + + def table_description_primary_key_template(primary_key): + return f'{primary_key} is the primary key.' + + def table_description(name, column_names): + return f'Table {name} has columns such as {", ".join(column_names)}.' + + def value_description(cv_pairs): + return f'{"".join(["The {} contains values such as {}.".format(column, value) for column, value in cv_pairs])}' + + def foreign_key_description(table_1, column_1, table_2, column_2): + return f'The {column_1} of {table_1} is the foreign key of {column_2} of {table_2}.' + + db_primary_keys = db_primary_keys['column_id'] + db_foreign_keys = list( + zip(db_foreign_keys['column_id'], db_foreign_keys['other_column_id'])) + + descriptions = [overall_description] + db_table_name_strs = [] + db_column_name_strs = [] + value_sep = ', ' + for table_id, table_name in enumerate(db_table_names): + table_name_str = table_name.lower() if normalize_query else table_name + db_table_name_strs.append(table_name_str) + columns = [] + column_value_pairs = [] + primary_keys = [] + for column_id, (x, y) in enumerate( + zip(db_column_names['table_id'], + db_column_names['column_name'])): + if column_id == 0: + continue + column_str = y.lower() if normalize_query else y + db_column_name_strs.append(column_str) + if x == table_id: + columns.append(column_str) + if column_id in db_primary_keys: + primary_keys.append(column_str) + if schema_serialization_with_db_content: + matches = get_database_matches( + question=question, + table_name=table_name, + column_name=y, + db_path=(db_path + '/' + db_id + '/' + db_id + + '.sqlite'), + ) + if matches: + column_value_pairs.append( + (column_str, value_sep.join(matches))) + + table_description_columns_str = table_description( + table_name_str, columns) + descriptions.append(table_description_columns_str) + table_description_primary_key_str = table_description_primary_key_template( + ', '.join(primary_keys)) + descriptions.append(table_description_primary_key_str) + if len(column_value_pairs) > 0: + value_description_str = value_description(column_value_pairs) + descriptions.append(value_description_str) + + for x, y in db_foreign_keys: + # get the table and column of x + x_table_name = db_table_name_strs[db_column_names['table_id'][x]] + x_column_name = db_column_name_strs[x] + # get the table and column of y + y_table_name = db_table_name_strs[db_column_names['table_id'][y]] + y_column_name = db_column_name_strs[y] + foreign_key_description_str = foreign_key_description( + x_table_name, x_column_name, y_table_name, y_column_name) + descriptions.append(foreign_key_description_str) + return ' '.join(descriptions) + + +def serialize_schema( + question: str, + db_path: str, + db_id: str, + db_column_names: Dict[str, str], + db_table_names: List[str], + schema_serialization_type: str = 'peteshaw', + schema_serialization_randomized: bool = False, + schema_serialization_with_db_id: bool = True, + schema_serialization_with_db_content: bool = False, + normalize_query: bool 
= True, +) -> str: + if schema_serialization_type == 'verbose': + db_id_str = 'Database: {db_id}. ' + table_sep = '. ' + table_str = 'Table: {table}. Columns: {columns}' + column_sep = ', ' + column_str_with_values = '{column} ({values})' + column_str_without_values = '{column}' + value_sep = ', ' + elif schema_serialization_type == 'peteshaw': + # see https://github.com/google-research/language/blob/master/language/nqg/tasks/spider/append_schema.py#L42 + db_id_str = ' | {db_id}' + table_sep = '' + table_str = ' | {table} : {columns}' + column_sep = ' , ' + column_str_with_values = '{column} ( {values} )' + column_str_without_values = '{column}' + value_sep = ' , ' + else: + raise NotImplementedError + + def get_column_str(table_name: str, column_name: str) -> str: + column_name_str = column_name.lower( + ) if normalize_query else column_name + if schema_serialization_with_db_content: + # print("testing") + matches = get_database_matches( + question=question, + table_name=table_name, + column_name=column_name, + db_path=(db_path + '/' + db_id + '/' + db_id + '.sqlite'), + ) + if matches: + return column_str_with_values.format( + column=column_name_str, values=value_sep.join(matches)) + else: + return column_str_without_values.format(column=column_name_str) + else: + return column_str_without_values.format(column=column_name_str) + + tables = [ + table_str.format( + table=table_name.lower() if normalize_query else table_name, + columns=column_sep.join( + map( + lambda y: get_column_str( + table_name=table_name, column_name=y[1]), + filter( + lambda y: y[0] == table_id, + zip( + db_column_names['table_id'], + db_column_names['column_name'], + ), + ), + )), + ) for table_id, table_name in enumerate(db_table_names) + ] + if schema_serialization_randomized: + random.shuffle(tables) + if schema_serialization_with_db_id: + serialized_schema = db_id_str.format( + db_id=db_id) + table_sep.join(tables) + else: + serialized_schema = table_sep.join(tables) + return serialized_schema + + +def form_input_for_construction(query, question, db_id, db_path, schema): + return { + 'query': + query, + 'question': + question, + 'db_id': + db_id, + 'db_path': + db_path, + 'db_table_names': + schema['table_names_original'], + 'db_column_names': { + 'table_id': [ + table_id + for table_id, column_name in schema['column_names_original'] + ], + 'column_name': [ + column_name + for table_id, column_name in schema['column_names_original'] + ] + }, + 'db_column_types': + schema['column_types'], + 'db_primary_keys': [{ + 'column_id': column_id + } for column_id in schema['primary_keys']], + 'db_foreign_keys': { + 'column_id': [ + column_id + for column_id, other_column_id in schema['foreign_keys'] + ], + 'other_column_id': [ + other_column_id + for column_id, other_column_id in schema['foreign_keys'] + ] + }, + } diff --git a/modelscope/preprocessors/ofa/text_classification.py b/modelscope/preprocessors/ofa/text_classification.py index 24c4f67e..a246e9eb 100644 --- a/modelscope/preprocessors/ofa/text_classification.py +++ b/modelscope/preprocessors/ofa/text_classification.py @@ -8,6 +8,9 @@ from .base import OfaBasePreprocessor class OfaTextClassificationPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for text classification tasks. + """ def __init__(self, cfg, @@ -32,6 +35,18 @@ class OfaTextClassificationPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_instruction(self, data): + r""" + Building text classification task's instruction. 
+ + The `data` should contain the keys `text` and `text2`, and the final instruction + is like ` can text1 " {} " imply text2 " {} "?`, where the first `{}` refers to + the value of `text` and the latter refers to `text2`. + + step 1. Preprocess the input texts `text` and `text2` in `data`. + - Lowercase, strip and truncate them to at most `max_src_length` tokens. + step 2. Use the instruction template to generate the final instruction. + step 3. Tokenize the instruction as the result. + """ text1 = ' '.join( data['text'].lower().strip().split()[:self.max_src_length]) text2 = ' '.join( @@ -42,6 +57,26 @@ class OfaTextClassificationPreprocessor(OfaBasePreprocessor): return instruction_itm def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Build the text classification instruction using `_build_instruction`. + step 2. If the `label` is not text, convert it to text using `label2ans`. + step 3. Tokenize the label data. + step 4. Concatenate the instruction and label tokens as the target item. + - pad out the instruction tokens in the target item to get `target`. + - remove the eos token from the target item to get `prev_output_tokens`. + step 5. Add the constraint mask. + + Args: + data (`Dict[str, Any]`): Input data, which should contain the keys `text`, `text2` + and `label`. `text` and `text2` each refer to a text input, and the goal of this + task is to decide whether `text` implies `text2`; `label` is the supervision + used for training. + Return: + A dict object containing the source text input, target tokens, previous output + tokens and the constraint mask. + """ instruction_itm = self._build_instruction(data) assert 'label' in data, 'there must be a `label` column in the train phase' label = data['label'] @@ -65,16 +100,33 @@ class OfaTextClassificationPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Build the text classification instruction using `_build_instruction`. + step 2. Optionally add `prefix_token`. + step 3. Optionally add the `label` data. + + Args: + data (`Dict[str, Any]`): Input data, which should contain the keys `text` and `text2`. + Both refer to a text input, and the goal of this task is to decide whether + `text` implies `text2`. + Return: + A dict object containing the source text input, prefix tokens and label data. + """ instruction_itm = self._build_instruction(data) if self.prompt_type == 'none': prefix_token = [] + decoder_prompt = self.bos_item elif self.prompt_type == 'prev_output': prefix_token = instruction_itm[:-1] # remove eos + decoder_prompt = instruction_itm[:-1] else: raise NotImplementedError sample = { 'source': instruction_itm, 'prefix_token': prefix_token, + 'decoder_prompt': decoder_prompt, } if 'label' in data: sample['label'] = self.label2ans[data['label']] diff --git a/modelscope/preprocessors/ofa/text_to_image_synthesis.py b/modelscope/preprocessors/ofa/text_to_image_synthesis.py index 2f6000eb..a6b2dd10 100644 --- a/modelscope/preprocessors/ofa/text_to_image_synthesis.py +++ b/modelscope/preprocessors/ofa/text_to_image_synthesis.py @@ -8,6 +8,9 @@ from .base import OfaBasePreprocessor class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for text to image synthesis tasks.
+ """ def __init__(self, cfg, @@ -27,6 +30,24 @@ class OfaTextToImageSynthesisPreprocessor(OfaBasePreprocessor): self.max_src_length = 64 def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building samples for inference. + + step 1. Preprocessing for str input. + - do lower, strip and restrict the total length by `max_src_length`. + step 2. Building text to image synthesis instruction. The template of + the instruction is like `what is the complete image? caption: {}`, + while the `{}` will be replaced by the result of step 1. + step 3. Tokenize the instruction as model's inputs. + + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `text`, + which refer to the description of synthesis image. + Return: + A dict object, contains source text input, patch images with `None` value + patch masks and code masks with `Tensor([False])` value. + """ source = ' '.join( data['text'].lower().strip().split()[:self.max_src_length]) source = 'what is the complete image? caption: {}'.format(source) diff --git a/modelscope/preprocessors/ofa/utils/bridge_content_encoder.py b/modelscope/preprocessors/ofa/utils/bridge_content_encoder.py new file mode 100644 index 00000000..cae7bc4a --- /dev/null +++ b/modelscope/preprocessors/ofa/utils/bridge_content_encoder.py @@ -0,0 +1,266 @@ +""" + Copyright (c) 2020, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause + + Encode DB content. +""" + +import difflib +import functools +import sqlite3 +from typing import List, Optional, Tuple + +from rapidfuzz import fuzz + +# fmt: off +_stopwords = { + 'who', 'ourselves', 'down', 'only', 'were', 'him', 'at', "weren't", 'has', + 'few', "it's", 'm', 'again', 'd', 'haven', 'been', 'other', 'we', 'an', + 'own', 'doing', 'ma', 'hers', 'all', "haven't", 'in', 'but', "shouldn't", + 'does', 'out', 'aren', 'you', "you'd", 'himself', "isn't", 'most', 'y', + 'below', 'is', "wasn't", 'hasn', 'them', 'wouldn', 'against', 'this', + 'about', 'there', 'don', "that'll", 'a', 'being', 'with', 'your', 'theirs', + 'its', 'any', 'why', 'now', 'during', 'weren', 'if', 'should', 'those', + 'be', 'they', 'o', 't', 'of', 'or', 'me', 'i', 'some', 'her', 'do', 'will', + 'yours', 'for', 'mightn', 'nor', 'needn', 'the', 'until', "couldn't", 'he', + 'which', 'yourself', 'to', "needn't", "you're", 'because', 'their', + 'where', 'it', "didn't", 've', 'whom', "should've", 'can', "shan't", 'on', + 'had', 'have', 'myself', 'am', "don't", 'under', 'was', "won't", 'these', + 'so', 'as', 'after', 'above', 'each', 'ours', 'hadn', 'having', 'wasn', + 's', 'doesn', "hadn't", 'than', 'by', 'that', 'both', 'herself', 'his', + "wouldn't", 'into', "doesn't", 'before', 'my', 'won', 'more', 'are', + 'through', 'same', 'how', 'what', 'over', 'll', 'yourselves', 'up', + 'mustn', "mustn't", "she's", 're', 'such', 'didn', "you'll", 'shan', + 'when', "you've", 'themselves', "mightn't", 'she', 'from', 'isn', 'ain', + 'between', 'once', 'here', 'shouldn', 'our', 'and', 'not', 'too', 'very', + 'further', 'while', 'off', 'couldn', "hasn't", 'itself', 'then', 'did', + 'just', "aren't" +} +# fmt: on + +_commonwords = {'no', 'yes', 'many'} + + +def is_number(s: str) -> bool: + try: + float(s.replace(',', '')) + return True + except ValueError: + return False + + +def is_stopword(s: str) -> bool: + return s.strip() in _stopwords + + +def is_commonword(s: str) -> bool: + return s.strip() in _commonwords 
+ + +def is_common_db_term(s: str) -> bool: + return s.strip() in ['id'] + + +class Match(object): + + def __init__(self, start: int, size: int) -> None: + self.start = start + self.size = size + + +def is_span_separator(c: str) -> bool: + return c in "'\"()`,.?! " + + +def split(s: str) -> List[str]: + return [c.lower() for c in s.strip()] + + +def prefix_match(s1: str, s2: str) -> bool: + i, j = 0, 0 + for i in range(len(s1)): + if not is_span_separator(s1[i]): + break + for j in range(len(s2)): + if not is_span_separator(s2[j]): + break + if i < len(s1) and j < len(s2): + return s1[i] == s2[j] + elif i >= len(s1) and j >= len(s2): + return True + else: + return False + + +def get_effective_match_source(s: str, start: int, end: int) -> Match: + _start = -1 + + for i in range(start, start - 2, -1): + if i < 0: + _start = i + 1 + break + if is_span_separator(s[i]): + _start = i + break + + if _start < 0: + return None + + _end = -1 + for i in range(end - 1, end + 3): + if i >= len(s): + _end = i - 1 + break + if is_span_separator(s[i]): + _end = i + break + + if _end < 0: + return None + + while _start < len(s) and is_span_separator(s[_start]): + _start += 1 + while _end >= 0 and is_span_separator(s[_end]): + _end -= 1 + + return Match(_start, _end - _start + 1) + + +def get_matched_entries( + s: str, + field_values: List[str], + m_theta: float = 0.85, + s_theta: float = 0.85 +) -> Optional[List[Tuple[str, Tuple[str, str, float, float, int]]]]: + if not field_values: + return None + + if isinstance(s, str): + n_grams = split(s) + else: + n_grams = s + + matched = dict() + for field_value in field_values: + if not isinstance(field_value, str): + continue + fv_tokens = split(field_value) + sm = difflib.SequenceMatcher(None, n_grams, fv_tokens) + match = sm.find_longest_match(0, len(n_grams), 0, len(fv_tokens)) + if match.size > 0: + source_match = get_effective_match_source(n_grams, match.a, + match.a + match.size) + if source_match and source_match.size > 1: + match_str = field_value[match.b:match.b + match.size] + source_match_str = s[source_match.start:source_match.start + + source_match.size] + c_match_str = match_str.lower().strip() + c_source_match_str = source_match_str.lower().strip() + c_field_value = field_value.lower().strip() + if (c_match_str and not is_number(c_match_str) + and not is_common_db_term(c_match_str)): + if (is_stopword(c_match_str) + or is_stopword(c_source_match_str) + or is_stopword(c_field_value)): + continue + if c_source_match_str.endswith(c_match_str + "'s"): + match_score = 1.0 + else: + if prefix_match(c_field_value, c_source_match_str): + match_score = ( + fuzz.ratio(c_field_value, c_source_match_str) + / 100) + else: + match_score = 0 + if (is_commonword(c_match_str) + or is_commonword(c_source_match_str) + or is_commonword(c_field_value) + ) and match_score < 1: # noqa + continue + s_match_score = match_score + if match_score >= m_theta and s_match_score >= s_theta: + if field_value.isupper( + ) and match_score * s_match_score < 1: + continue + matched[match_str] = ( + field_value, + source_match_str, + match_score, + s_match_score, + match.size, + ) + + if not matched: + return None + else: + return sorted( + matched.items(), + key=lambda x: (1e16 * x[1][2] + 1e8 * x[1][3] + x[1][4]), + reverse=True, + ) + + +@functools.lru_cache(maxsize=1000, typed=False) +def get_column_picklist(table_name: str, column_name: str, + db_path: str) -> list: + fetch_sql = 'SELECT DISTINCT `{}` FROM `{}`'.format( + column_name, table_name) + try: + conn = 
sqlite3.connect(db_path) + conn.text_factory = bytes + c = conn.cursor() + c.execute(fetch_sql) + picklist = set() + for x in c.fetchall(): + if isinstance(x[0], str): + picklist.add(x[0].encode('utf-8')) + elif isinstance(x[0], bytes): + try: + picklist.add(x[0].decode('utf-8')) + except UnicodeDecodeError: + picklist.add(x[0].decode('latin-1')) + else: + picklist.add(x[0]) + picklist = list(picklist) + finally: + conn.close() + return picklist + + +def get_database_matches( + question: str, + table_name: str, + column_name: str, + db_path: str, + top_k_matches: int = 2, + match_threshold: float = 0.85, +) -> List[str]: + picklist = get_column_picklist( + table_name=table_name, column_name=column_name, db_path=db_path) + matches = [] + if picklist and isinstance(picklist[0], str): + matched_entries = get_matched_entries( + s=question, + field_values=picklist, + m_theta=match_threshold, + s_theta=match_threshold, + ) + if matched_entries: + num_values_inserted = 0 + for _match_str, ( + field_value, + _s_match_str, + match_score, + s_match_score, + _match_size, + ) in matched_entries: + if 'name' in column_name and match_score * s_match_score < 1: + continue + if table_name != 'sqlite_sequence': # Spider database artifact + matches.append(field_value) + num_values_inserted += 1 + if num_values_inserted >= top_k_matches: + break + return matches diff --git a/modelscope/preprocessors/ofa/utils/collate.py b/modelscope/preprocessors/ofa/utils/collate.py index b5dacd04..4f96eee0 100644 --- a/modelscope/preprocessors/ofa/utils/collate.py +++ b/modelscope/preprocessors/ofa/utils/collate.py @@ -7,6 +7,9 @@ import torch def collate_fn(samples, pad_idx, eos_idx): + r""" + convert the sample to batch tensor. + """ if len(samples) == 0: return {} @@ -22,7 +25,7 @@ def collate_fn(samples, pad_idx, eos_idx): if samples[0].get('source', None) is not None: batch['net_input']['input_ids'] = merge('source') if samples[0].get('id', None) is not None: - batch['id'] = np.array([s.get['id'] for s in samples]) + batch['id'] = np.array([s.get('id') for s in samples]) if samples[0].get('target', None) is not None: batch['target'] = merge('target') tgt_lengths = torch.LongTensor( @@ -88,6 +91,20 @@ def collate_fn(samples, pad_idx, eos_idx): batch['phone_length'] = torch.tensor( [s['phone_target'].size(0) for s in samples], dtype=torch.long) + # for sudoku + if samples[0].get('db_struct', None) is not None: + db_struct = [sample['db_struct'] for sample in samples] + batch['db_struct'] = db_struct + if samples[0].get('mask_ratio', None) is not None: + mask_ratio = [sample['mask_ratio'] for sample in samples] + batch['mask_ratio'] = mask_ratio + if samples[0].get('seg_col_tokens', None) is not None: + seg_col_tokens = merge('seg_col_tokens') + batch['net_input']['seg_col_tokens'] = seg_col_tokens + if samples[0].get('seg_row_tokens', None) is not None: + seg_row_tokens = merge('seg_row_tokens') + batch['net_input']['seg_row_tokens'] = seg_row_tokens + return batch diff --git a/modelscope/preprocessors/ofa/utils/constant.py b/modelscope/preprocessors/ofa/utils/constant.py index 8a33092e..4a896aab 100644 --- a/modelscope/preprocessors/ofa/utils/constant.py +++ b/modelscope/preprocessors/ofa/utils/constant.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
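+# Review annotation (not part of the original change): OFA_TASK_KEY_MAPPING below
+# lists, for each OFA task, the input keys its preprocessor expects, e.g.
+# Tasks.text2sql -> ['text', 'database'] and Tasks.sudoku -> ['text'].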
+ from modelscope.utils.constant import Tasks OFA_TASK_KEY_MAPPING = { @@ -11,4 +13,6 @@ OFA_TASK_KEY_MAPPING = { Tasks.visual_entailment: ['image', 'text', 'text2'], Tasks.text_to_image_synthesis: ['text'], Tasks.auto_speech_recognition: ['wav', 'text'], + Tasks.sudoku: ['text'], + Tasks.text2sql: ['text', 'database'], } diff --git a/modelscope/preprocessors/ofa/utils/get_tables.py b/modelscope/preprocessors/ofa/utils/get_tables.py new file mode 100644 index 00000000..e6be4191 --- /dev/null +++ b/modelscope/preprocessors/ofa/utils/get_tables.py @@ -0,0 +1,88 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import sqlite3 +import sys +import traceback + +EXIST = {'atis', 'geo', 'advising', 'yelp', 'restaurants', 'imdb', 'academic'} + + +def convert_fk_index(data): + fk_holder = [] + for fk in data['foreign_keys']: + tn, col, ref_tn, ref_col = fk[0][0], fk[0][1], fk[1][0], fk[1][1] + ref_cid, cid = None, None + try: + tid = data['table_names_original'].index(tn) + ref_tid = data['table_names_original'].index(ref_tn) + + for i, (tab_id, + col_org) in enumerate(data['column_names_original']): + if tab_id == ref_tid and ref_col == col_org: + ref_cid = i + elif tid == tab_id and col == col_org: + cid = i + if ref_cid and cid: + fk_holder.append([cid, ref_cid]) + except ValueError: + traceback.print_exc() + print('table_names_original: ', data['table_names_original']) + print('finding tab name: ', tn, ref_tn) + sys.exit() + return fk_holder + + +def dump_db_json_schema(db, f): + """read table and column info""" + conn = sqlite3.connect(db) + conn.execute('pragma foreign_keys=ON') + cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';") + + data = { + 'db_id': f, + 'table_names_original': [], + 'table_names': [], + 'column_names_original': [(-1, '*')], + 'column_names': [(-1, '*')], + 'column_types': ['text'], + 'primary_keys': [], + 'foreign_keys': [], + } + + fk_holder = [] + for i, item in enumerate(cursor.fetchall()): + table_name = item[0] + data['table_names_original'].append(table_name) + data['table_names'].append(table_name.lower().replace('_', ' ')) + fks = conn.execute( + "PRAGMA foreign_key_list('{}') ".format(table_name)).fetchall() + # print("db:{} table:{} fks:{}".format(f,table_name,fks)) + fk_holder.extend([[(table_name, fk[3]), (fk[2], fk[4])] for fk in fks]) + cur = conn.execute("PRAGMA table_info('{}') ".format(table_name)) + for j, col in enumerate(cur.fetchall()): + data['column_names_original'].append((i, col[1])) + data['column_names'].append((i, col[1].lower().replace('_', ' '))) + # varchar, '' -> text, int, numeric -> integer, + col_type = col[2].lower() + if ('char' in col_type or col_type == '' or 'text' in col_type + or 'var' in col_type): + data['column_types'].append('text') + elif ('int' in col_type or 'numeric' in col_type + or 'decimal' in col_type or 'number' in col_type + or 'id' in col_type or 'real' in col_type + or 'double' in col_type or 'float' in col_type): + data['column_types'].append('number') + elif 'date' in col_type or 'time' in col_type or 'year' in col_type: + data['column_types'].append('time') + elif 'boolean' in col_type: + data['column_types'].append('boolean') + else: + data['column_types'].append('others') + + if col[5] == 1: + data['primary_keys'].append(len(data['column_names']) - 1) + + data['foreign_keys'] = fk_holder + data['foreign_keys'] = convert_fk_index(data) + + return data diff --git a/modelscope/preprocessors/ofa/utils/random_help.py b/modelscope/preprocessors/ofa/utils/random_help.py index 
e0dca54e..071c49f5 100644 --- a/modelscope/preprocessors/ofa/utils/random_help.py +++ b/modelscope/preprocessors/ofa/utils/random_help.py @@ -9,6 +9,9 @@ except ImportError: def get_rng_state(): + r""" + Get the random number generator state of torch, xla and cuda. + """ state = {'torch_rng_state': torch.get_rng_state()} if xm is not None: state['xla_rng_state'] = xm.get_rng_state() @@ -18,6 +21,9 @@ def set_rng_state(state): + r""" + Set the random number generator state of torch, xla and cuda. + """ torch.set_rng_state(state['torch_rng_state']) if xm is not None: xm.set_rng_state(state['xla_rng_state']) @@ -26,6 +32,9 @@ class set_torch_seed(object): + r""" + Set the random seed for torch, xla and cuda. + """ def __init__(self, seed): assert isinstance(seed, int) diff --git a/modelscope/preprocessors/ofa/visual_entailment.py b/modelscope/preprocessors/ofa/visual_entailment.py index fff5bbd3..8e35ed6d 100644 --- a/modelscope/preprocessors/ofa/visual_entailment.py +++ b/modelscope/preprocessors/ofa/visual_entailment.py @@ -11,6 +11,9 @@ from .base import OfaBasePreprocessor class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for visual entailment tasks. + """ def __init__(self, cfg, @@ -44,6 +47,30 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Preprocess the data using the logic of `_build_infer_sample` + and make sure the label data is in the result. + step 2. Preprocess the label data to generate `target` and + `prev_output_tokens`. + - tokenize the label data. + - calculate the target item. + 1) if `prompt_type` is `none`, use the tokenized label data. + 2) if `prompt_type` is `src`, concatenate the `source` data + and the tokenized label data. + 3) if `prompt_type` is `prev_output`, concatenate the `source` + data without the eos token and the tokenized label data. + step 3. Add the constraint mask. + + Args: + data (`Dict[str, Any]`): Input data, which should contain the key `text`; + `text2` and `label` are optional. + Return: + A dict object containing the source text input, patch images, patch masks + with `Tensor([True])` value, decoder prompt, label, target, previous + output tokens and constraint mask. + """ sample = self._build_infer_sample(data) target = ' {}'.format(sample['label']) sample['ref_dict'] = {sample['label']: 1.0} @@ -82,6 +109,32 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Preprocess the image as the model's image input. + - get the pillow image input from `data`. + - apply transforms to the pillow image, such as resize, normalize, etc. + step 2. Build the instruction as the model's source text input. + - use the text input to build the instruction. Two input forms are + supported: + 1) only `text` is present in data. This setting handles tasks that judge + whether the input `text` describes the input image. + 2) both `text` and `text2` are present in data. This setting handles tasks + that judge whether `text` together with the input image implies `text2`. + - tokenize the instruction above. + step 3. Calculate the decoder prompt input. + step 4. Optionally add the label data.
+ + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `text` + `text2` and `label` are optional. + Return: + A dict object, contains source text input, patch images, patch masks + with `Tensor([True])` value, decoder prompt and label. + """ image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) if 'text2' not in data: @@ -101,10 +154,10 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): text = prompt.format(caption, hypothesis) inputs = self.tokenize_text(text) if self.prompt_type == 'none': + prefix_token = [] decoder_prompt = self.bos_item - elif self.prompt_type == 'src': - decoder_prompt = inputs elif self.prompt_type == 'prev_output': + prefix_token = inputs[:-1] # remove eos decoder_prompt = inputs[:-1] else: raise NotImplementedError @@ -112,6 +165,7 @@ class OfaVisualEntailmentPreprocessor(OfaBasePreprocessor): 'source': inputs, 'patch_image': patch_image, 'patch_mask': torch.tensor([True]), + 'prefix_token': prefix_token, 'decoder_prompt': decoder_prompt, } if 'relation' in self.column_map and self.column_map[ diff --git a/modelscope/preprocessors/ofa/visual_grounding.py b/modelscope/preprocessors/ofa/visual_grounding.py index 2da79670..733b341f 100644 --- a/modelscope/preprocessors/ofa/visual_grounding.py +++ b/modelscope/preprocessors/ofa/visual_grounding.py @@ -13,6 +13,9 @@ from .utils import transforms as T class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for visual grounding tasks. + """ def __init__(self, cfg, @@ -60,6 +63,36 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Preprocessing the image input for model's image input. + - get the pillow image. + - calculate the target boxes using for getting the exact area + in the pillow image for input text by input `region_coord`. in + training setting, `region_coord` will be a label data. + - getting the target image as patch images and do some transforms + such as resize, normalize etc. + step 2. Preprocessing the text input for model's source text input. + - do the str preprocessing to text input by function `pre_caption`. + - build the instruction. the default instruction is + ` which region does the text " {} " describe?`, `{}` refer to the + text input. + - tokenize the instruction as source text input. + step 3. Preprocessing the patch image boxes for model's target text input. + - quantize the coordinate of selected patch images + - concatenate the quantization results by blank + - tokenize the result above as target text input. + step 4. Get the previous output tokens using target item without eos token. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` + `text` and `region_coord`. + Return: + A dict object, contains source text input, patch images, patch masks + with `Tensor([True])` value, target, previous output tokens, + width scale ratio, height scale ratio and region coordinate. + """ image = self.get_img_pil(data[self.column_map['image']]) w, h = image.size boxes_target = { @@ -114,6 +147,29 @@ class OfaVisualGroundingPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Preprocessing image input for model's image input. + - get pillow image from data. 
+ - do some transforms to the pillow image, such as resize, normalize etc. + step 2. Preprocessing the text input for model's text input. + - do the str preprocessing to text input by function `pre_caption`. + - build the instruction. the default instruction is + ` which region does the text " {} " describe?`, `{}` refer to the + text input. + - tokenize the instruction as source text input. + step 3. Whether or not to add label data which refer to a region coordinate + in this task. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` + `text`. + Return: + A dict object, contains source text input, patch images, patch masks + with `Tensor([True])` value, width scale ratio, height scale ratio + and label. + """ image = self.get_img_pil(data[self.column_map['image']]) w, h = image.size patch_image = self.patch_resize_transform(image) diff --git a/modelscope/preprocessors/ofa/visual_question_answering.py b/modelscope/preprocessors/ofa/visual_question_answering.py index f5afabe3..11045dcd 100644 --- a/modelscope/preprocessors/ofa/visual_question_answering.py +++ b/modelscope/preprocessors/ofa/visual_question_answering.py @@ -11,6 +11,9 @@ from .base import OfaBasePreprocessor class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): + r""" + OFA preprocessor for question answer tasks. + """ def __init__(self, cfg, @@ -44,6 +47,31 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): return self._build_infer_sample(data) def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building training samples. + + step 1. Preprocess the data using the logic of `_build_infer_sample` + and make sure the label data in the result. + step 2. Preprocessing the label data to generate `target` and `prev_output_token`. + - add blank in the front out label data and tokenize it as `target` item. + - if `prompt_type` is `None`, add the bos token as previous output tokens, + add eos tokens as target items. + - if `prompt_type` is `src`, concatenate source text input with target item as + previous output tokens, remove the bos token and add eos token as target items. + - if `prompt_type` is `prev_output`, just like the `prompt_type` is src, the + difference is that it will remove the eos token in source text input in this + setting. + - padding the source item as final target item. + step 3. Add constraint mask. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` + `text` and `label`. + Return: + A dict object, contains source text input, patch images, patch masks + with `Tensor([True])`, decoder prompt, label, target previous output tokens + and constraint mask. + """ sample = self._build_infer_sample(data) tgt_item = self.tokenize_text( ' {}'.format(sample['label']), add_bos=False, add_eos=False) @@ -81,6 +109,29 @@ class OfaVisualQuestionAnsweringPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: + r""" + Building inference samples. + + step 1. Preprocessing image input for model's image input. + - get pillow image from data. + - do some transforms to the pillow image, such as resize, normalize etc. + step 2. Preprocessing the text input for model's text input. + - add blank in the front of input text. + - tokenize the result above as source text input. + step 3. Calculating the decoder prompt. + - if `prompt_type` is `None`, using bos token. 
+ - if `prompt_type` is `src`, using source text input + - if `prompt_type` is `prev_output`, using source text input without eos token. + step 4. Whether or not to add label data which refer to an answer to the question + in this task. + + Args: + data (`Dict[str, Any]`): Input data, should contains the key of `image` + `text`. + Return: + A dict object, contains source text input, patch images, patch masks + with `Tensor([True])`, decoder prompt and label. + """ image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) text = data[self.column_map['text']] diff --git a/modelscope/preprocessors/video.py b/modelscope/preprocessors/video.py index 794033b5..0f2c034a 100644 --- a/modelscope/preprocessors/video.py +++ b/modelscope/preprocessors/video.py @@ -92,18 +92,21 @@ def _interval_based_sampling(vid_length, vid_fps, target_fps, clip_idx, num_clips, num_frames, interval, minus_interval): """ Generates the frame index list using interval based sampling. + Args: - vid_length (int): the length of the whole video (valid selection range). - vid_fps (int): the original video fps - target_fps (int): the normalized video fps - clip_idx (int): -1 for random temporal sampling, and positive values for sampling specific - clip from the video - num_clips (int): the total clips to be sampled from each video. - combined with clip_idx, the sampled video is the "clip_idx-th" video from - "num_clips" videos. - num_frames (int): number of frames in each sampled clips. - interval (int): the interval to sample each frame. + vid_length (int): the length of the whole video (valid selection range). + vid_fps (int): the original video fps + target_fps (int): the normalized video fps + clip_idx (int): + -1 for random temporal sampling, and positive values for sampling specific + clip from the video + num_clips (int): + the total clips to be sampled from each video. combined with clip_idx, + the sampled video is the "clip_idx-th" video from "num_clips" videos. + num_frames (int): number of frames in each sampled clips. + interval (int): the interval to sample each frame. minus_interval (bool): control the end index + Returns: index (tensor): the sampled frame indexes """ diff --git a/modelscope/trainers/audio/__init__.py b/modelscope/trainers/audio/__init__.py index ec18aea8..967f56fc 100644 --- a/modelscope/trainers/audio/__init__.py +++ b/modelscope/trainers/audio/__init__.py @@ -7,11 +7,15 @@ if TYPE_CHECKING: print('TYPE_CHECKING...') from .tts_trainer import KanttsTrainer from .ans_trainer import ANSTrainer + from .kws_nearfield_trainer import KWSNearfieldTrainer + from .kws_farfield_trainer import KWSFarfieldTrainer else: _import_structure = { 'tts_trainer': ['KanttsTrainer'], - 'ans_trainer': ['ANSTrainer'] + 'ans_trainer': ['ANSTrainer'], + 'kws_nearfield_trainer': ['KWSNearfieldTrainer'], + 'kws_farfield_trainer': ['KWSFarfieldTrainer'], } import sys diff --git a/modelscope/trainers/audio/asr_trainer.py b/modelscope/trainers/audio/asr_trainer.py new file mode 100644 index 00000000..4ea25863 --- /dev/null +++ b/modelscope/trainers/audio/asr_trainer.py @@ -0,0 +1,171 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
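+# Review annotation (not part of the original change): ASRTrainer is a thin wrapper
+# around FunASR. It converts an MsDataset split into kaldi-style wav.scp/text files
+# (prepare_data), parses the model's configuration.json into the dict expected by
+# funasr.bin.build_trainer (parse_cfg), and delegates train() to the FunASR trainer.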
+import os +import shutil +import tempfile +from typing import Dict, Optional, Union + +import json +from funasr.bin import build_trainer + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, + DEFAULT_DATASET_REVISION, + DEFAULT_MODEL_REVISION, ModelFile, + Tasks, TrainerStages) +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@TRAINERS.register_module(module_name=Trainers.speech_asr_trainer) +class ASRTrainer(BaseTrainer): + DATA_DIR = 'data' + + def __init__(self, + model: str, + work_dir: str = None, + distributed: bool = False, + dataset_type: str = 'small', + data_dir: Optional[Union[MsDataset, str]] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + batch_bins: Optional[int] = None, + max_epoch: Optional[int] = None, + lr: Optional[float] = None, + mate_params: Optional[dict] = None, + **kwargs): + """ASR Trainer. + + Args: + model (str) : model name + work_dir (str): output dir for saving results + distributed (bool): whether to enable DDP training + dataset_type (str): choose which dataset type to use + data_dir (str): the path of data + model_revision (str): set model version + batch_bins (str): batch size + max_epoch (int): the maximum epoch number for training + lr (float): learning rate + mate_params (dict): for saving other training args + Examples: + >>> import os + >>> from modelscope.metainfo import Trainers + >>> from modelscope.msdatasets import MsDataset + >>> from modelscope.trainers import build_trainer + >>> ds_dict = MsDataset.load('speech_asr_aishell1_trainsets') + >>> kwargs = dict( + >>> model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', + >>> data_dir=ds_dict, + >>> work_dir="./checkpoint") + >>> trainer = build_trainer( + >>> Trainers.speech_asr_trainer, default_args=kwargs) + >>> trainer.train() + + """ + if not work_dir: + self.work_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.work_dir): + os.makedirs(self.work_dir) + else: + self.work_dir = work_dir + + if not os.path.exists(self.work_dir): + raise Exception(f'{self.work_dir} not exists') + + logger.info(f'Set workdir to {self.work_dir}') + + self.data_dir = os.path.join(self.work_dir, self.DATA_DIR) + self.raw_dataset_path = '' + self.distributed = distributed + self.dataset_type = dataset_type + + shutil.rmtree(self.data_dir, ignore_errors=True) + + os.makedirs(self.data_dir, exist_ok=True) + + if os.path.exists(model): + model_dir = model + else: + model_dir = self.get_or_download_model_dir(model, model_revision) + self.model_dir = model_dir + self.model_cfg = os.path.join(self.model_dir, 'configuration.json') + self.cfg_dict = self.parse_cfg(self.model_cfg) + + if 'raw_data_dir' not in data_dir: + self.train_data_dir, self.dev_data_dir = self.load_dataset_raw_path( + data_dir, self.data_dir) + else: + self.data_dir = data_dir['raw_data_dir'] + self.trainer = build_trainer.build_trainer( + modelscope_dict=self.cfg_dict, + data_dir=self.data_dir, + output_dir=self.work_dir, + distributed=self.distributed, + dataset_type=self.dataset_type, + batch_bins=batch_bins, + max_epoch=max_epoch, + lr=lr, + mate_params=mate_params) + + def parse_cfg(self, cfg_file): + cur_dir = os.path.dirname(cfg_file) + cfg_dict = dict() + with open(cfg_file, 'r', encoding='utf-8') as f: + config = json.load(f) + cfg_dict['mode'] = 
config['model']['model_config']['mode'] + cfg_dict['model_dir'] = cur_dir + cfg_dict['am_model_file'] = os.path.join( + cur_dir, config['model']['am_model_name']) + cfg_dict['am_model_config'] = os.path.join( + cur_dir, config['model']['model_config']['am_model_config']) + cfg_dict['finetune_config'] = os.path.join(cur_dir, + 'finetune.yaml') + cfg_dict['cmvn_file'] = os.path.join( + cur_dir, config['model']['model_config']['mvn_file']) + cfg_dict['seg_dict'] = os.path.join(cur_dir, 'seg_dict') + if 'init_model' in config['model']['model_config']: + cfg_dict['init_model'] = os.path.join( + cur_dir, config['model']['model_config']['init_model']) + else: + cfg_dict['init_model'] = cfg_dict['am_model_file'] + return cfg_dict + + def load_dataset_raw_path(self, dataset, output_data_dir): + if 'train' not in dataset: + raise Exception( + 'dataset {0} does not contain a train split'.format(dataset)) + train_data_dir = self.prepare_data( + dataset, output_data_dir, split='train') + if 'validation' not in dataset: + raise Exception( + 'dataset {0} does not contain a dev split'.format(dataset)) + dev_data_dir = self.prepare_data( + dataset, output_data_dir, split='validation') + return train_data_dir, dev_data_dir + + def prepare_data(self, dataset, out_base_dir, split='train'): + out_dir = os.path.join(out_base_dir, split) + shutil.rmtree(out_dir, ignore_errors=True) + os.makedirs(out_dir, exist_ok=True) + data_cnt = len(dataset[split]) + fp_wav_scp = open(os.path.join(out_dir, 'wav.scp'), 'w') + fp_text = open(os.path.join(out_dir, 'text'), 'w') + for i in range(data_cnt): + content = dataset[split][i] + wav_file = content['Audio:FILE'] + text = content['Text:LABEL'] + fp_wav_scp.write('\t'.join([os.path.basename(wav_file), wav_file]) + + '\n') + fp_text.write('\t'.join([os.path.basename(wav_file), text]) + '\n') + fp_text.close() + fp_wav_scp.close() + return out_dir + + def train(self, *args, **kwargs): + self.trainer.run() + + def evaluate(self, checkpoint_path: str, *args, + **kwargs) -> Dict[str, float]: + raise NotImplementedError diff --git a/modelscope/trainers/audio/kws_nearfield_trainer.py b/modelscope/trainers/audio/kws_nearfield_trainer.py new file mode 100644 index 00000000..ba3f5f5f --- /dev/null +++ b/modelscope/trainers/audio/kws_nearfield_trainer.py @@ -0,0 +1,471 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
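+# Review annotation (not part of the original change): KWSNearfieldTrainer resolves
+# rank/world-size from the environment, builds kws_nearfield_dataset loaders for the
+# train/CV lists, loads the FSMN base checkpoint, runs the epoch loop with
+# executor_train/executor_cv, and evaluates an averaged checkpoint with executor_test
+# followed by compute_det.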
+import copy +import datetime +import math +import os +import random +import re +import sys +from shutil import copyfile +from typing import Callable, Dict, Optional + +import numpy as np +import torch +import torch.distributed as dist +import torch.nn.functional as F +import yaml +from tensorboardX import SummaryWriter +from torch import nn as nn +from torch import optim as optim +from torch.distributed import ReduceOp +from torch.nn.utils import clip_grad_norm_ +from torch.utils.data import DataLoader + +from modelscope.metainfo import Trainers +from modelscope.models import Model, TorchModel +from modelscope.msdatasets.task_datasets.audio.kws_nearfield_dataset import \ + kws_nearfield_dataset +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint +from modelscope.utils.config import Config +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.data_utils import to_device +from modelscope.utils.device import create_device +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, + init_dist, is_master, + set_random_seed) +from .kws_utils.batch_utils import executor_cv, executor_test, executor_train +from .kws_utils.det_utils import compute_det +from .kws_utils.file_utils import query_tokens_id, read_lexicon, read_token +from .kws_utils.model_utils import (average_model, convert_to_kaldi, + count_parameters) + +logger = get_logger() + + +@TRAINERS.register_module( + module_name=Trainers.speech_kws_fsmn_char_ctc_nearfield) +class KWSNearfieldTrainer(BaseTrainer): + + def __init__(self, + model: str, + work_dir: str, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + **kwargs): + ''' + Args: + work_dir (str): main directory for training + kwargs: + checkpoint (str): basemodel checkpoint, if None, default to use base.pt in model path + train_data (int): wave list with kaldi style for training + cv_data (int): wave list with kaldi style for cross validation + trans_data (str): transcription list with kaldi style, merge train and cv + tensorboard_dir (str): path to save tensorboard results, + create 'tensorboard_dir' in work_dir by default + ''' + if isinstance(model, str): + self.model_dir = self.get_or_download_model_dir( + model, model_revision) + if cfg_file is None: + cfg_file = os.path.join(self.model_dir, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' + self.model_dir = os.path.dirname(cfg_file) + + super().__init__(cfg_file, arg_parse_fn) + configs = Config.from_file(cfg_file) + + print(kwargs) + self.launcher = 'pytorch' + self.dist_backend = configs.train.get('dist_backend', 'nccl') + self.tensorboard_dir = kwargs.get('tensorboard_dir', 'tensorboard') + self.checkpoint = kwargs.get( + 'checkpoint', os.path.join(self.model_dir, 'train/base.pt')) + self.avg_checkpoint = None + + # 1. 
get rank info + set_random_seed(kwargs.get('seed', 666)) + self.get_dist_info() + logger.info('RANK {}/{}/{}, Master addr:{}, Master port:{}'.format( + self.world_size, self.rank, self.local_rank, self.master_addr, + self.master_port)) + + self.work_dir = work_dir + if self.rank == 0: + if not os.path.exists(self.work_dir): + os.makedirs(self.work_dir) + logger.info(f'Current working dir is {work_dir}') + + # 2. prepare dataset and dataloader + token_file = os.path.join(self.model_dir, 'train/tokens.txt') + assert os.path.exists(token_file), f'{token_file} is missing' + self.token_table = read_token(token_file) + + lexicon_file = os.path.join(self.model_dir, 'train/lexicon.txt') + assert os.path.exists(lexicon_file), f'{lexicon_file} is missing' + self.lexicon_table = read_lexicon(lexicon_file) + + assert kwargs['train_data'], 'please config train data in dict kwargs' + assert kwargs['cv_data'], 'please config cv data in dict kwargs' + assert kwargs[ + 'trans_data'], 'please config transcription data in dict kwargs' + self.train_data = kwargs['train_data'] + self.cv_data = kwargs['cv_data'] + self.trans_data = kwargs['trans_data'] + + train_conf = configs['preprocessor'] + cv_conf = copy.deepcopy(train_conf) + cv_conf['speed_perturb'] = False + cv_conf['spec_aug'] = False + cv_conf['shuffle'] = False + self.train_dataset = kws_nearfield_dataset(self.train_data, + self.trans_data, train_conf, + self.token_table, + self.lexicon_table, True) + self.cv_dataset = kws_nearfield_dataset(self.cv_data, self.trans_data, + cv_conf, self.token_table, + self.lexicon_table, True) + + self.train_dataloader = DataLoader( + self.train_dataset, + batch_size=None, + pin_memory=kwargs.get('pin_memory', False), + persistent_workers=True, + num_workers=configs.train.dataloader.workers_per_gpu, + prefetch_factor=configs.train.dataloader.get('prefetch', 2)) + self.cv_dataloader = DataLoader( + self.cv_dataset, + batch_size=None, + pin_memory=kwargs.get('pin_memory', False), + persistent_workers=True, + num_workers=configs.evaluation.dataloader.workers_per_gpu, + prefetch_factor=configs.evaluation.dataloader.get('prefetch', 2)) + + # 3. 
build model, and load checkpoint + feature_transform_file = os.path.join( + self.model_dir, 'train/feature_transform.txt.80dim-l2r2') + assert os.path.exists(feature_transform_file), \ + f'{feature_transform_file} is missing' + configs.model['cmvn_file'] = feature_transform_file + + # 3.1 Init kws model from configs + self.model = self.build_model(configs) + num_params = count_parameters(self.model) + if self.rank == 0: + # print(model) + logger.warning('the number of model params: {}'.format(num_params)) + + # 3.2 if specify checkpoint, load infos and params + if self.checkpoint is not None and os.path.exists(self.checkpoint): + load_checkpoint(self.checkpoint, self.model) + info_path = re.sub('.pt$', '.yaml', self.checkpoint) + infos = {} + if os.path.exists(info_path): + with open(info_path, 'r') as fin: + infos = yaml.load(fin, Loader=yaml.FullLoader) + else: + logger.warning('Training with random initialized params') + infos = {} + self.start_epoch = infos.get('epoch', -1) + 1 + configs['train']['start_epoch'] = self.start_epoch + + lr_last_epoch = infos.get('lr', configs['train']['optimizer']['lr']) + configs['train']['optimizer']['lr'] = lr_last_epoch + + # 3.3 model placement + self.device_name = kwargs.get('device', 'gpu') + if self.world_size > 1: + self.device_name = f'cuda:{self.local_rank}' + self.device = create_device(self.device_name) + + if self.world_size > 1: + assert (torch.cuda.is_available()) + # cuda model is required for nn.parallel.DistributedDataParallel + self.model.cuda() + self.model = torch.nn.parallel.DistributedDataParallel(self.model) + else: + self.model = self.model.to(self.device) + + # 4. write config.yaml for inference and export + self.configs = configs + if self.rank == 0: + if not os.path.exists(self.work_dir): + os.makedirs(self.work_dir) + saved_config_path = os.path.join(self.work_dir, 'config.yaml') + with open(saved_config_path, 'w') as fout: + data = yaml.dump(configs.to_dict()) + fout.write(data) + + def train(self, *args, **kwargs): + logger.info('Start training...') + + writer = None + if self.rank == 0: + os.makedirs(self.work_dir, exist_ok=True) + writer = SummaryWriter( + os.path.join(self.work_dir, self.tensorboard_dir)) + + log_interval = self.configs['train'].get('log_interval', 10) + + optim_conf = self.configs['train']['optimizer'] + optimizer = optim.Adam( + self.model.parameters(), + lr=optim_conf['lr'], + weight_decay=optim_conf['weight_decay']) + lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau( + optimizer, + mode='min', + factor=0.5, + patience=3, + min_lr=1e-6, + threshold=0.01, + ) + + final_epoch = None + if self.start_epoch == 0 and self.rank == 0: + save_model_path = os.path.join(self.work_dir, 'init.pt') + save_checkpoint(self.model, save_model_path, None, None, None, + False) + + # Start training loop + logger.info('Start training...') + training_config = {} + training_config['grad_clip'] = optim_conf['grad_clip'] + training_config['log_interval'] = log_interval + training_config['world_size'] = self.world_size + training_config['rank'] = self.rank + training_config['local_rank'] = self.local_rank + + max_epoch = self.configs['train']['max_epochs'] + totaltime = datetime.datetime.now() + for epoch in range(self.start_epoch, max_epoch): + self.train_dataset.set_epoch(epoch) + training_config['epoch'] = epoch + + lr = optimizer.param_groups[0]['lr'] + logger.info('Epoch {} TRAIN info lr {}'.format(epoch, lr)) + executor_train(self.model, optimizer, self.train_dataloader, + self.device, writer, training_config) + 
cv_loss = executor_cv(self.model, self.cv_dataloader, self.device, + training_config) + logger.info('Epoch {} EVAL info cv_loss {:.6f}'.format( + epoch, cv_loss)) + + if self.rank == 0: + save_model_path = os.path.join(self.work_dir, + '{}.pt'.format(epoch)) + save_checkpoint(self.model, save_model_path, None, None, None, + False) + + info_path = re.sub('.pt$', '.yaml', save_model_path) + info_dict = dict( + epoch=epoch, + lr=lr, + cv_loss=cv_loss, + ) + with open(info_path, 'w') as fout: + data = yaml.dump(info_dict) + fout.write(data) + + writer.add_scalar('epoch/cv_loss', cv_loss, epoch) + writer.add_scalar('epoch/lr', lr, epoch) + final_epoch = epoch + lr_scheduler.step(cv_loss) + + if final_epoch is not None and self.rank == 0: + writer.close() + + totaltime = datetime.datetime.now() - totaltime + logger.info('Total time spent: {:.2f} hours'.format( + totaltime.total_seconds() / 3600.0)) + + def evaluate(self, checkpoint_path: str, *args, + **kwargs) -> Dict[str, float]: + ''' + Args: + checkpoint_path (str): evaluating with ckpt or default average ckpt + kwargs: + test_dir (str): local path for saving test results + test_data (str): wave list with kaldi style + trans_data (str): transcription list with kaldi style + average_num (int): the NO. to do model averaging(checkpoint_path==None) + batch_size (int): batch size during evaluating + keywords (str): keyword string, split with ',' + gpu (int): evaluating with cpu/gpu: -1 for cpu; >=0 for gpu, + os.environ['CUDA_VISIBLE_DEVICES'] will be setted + ''' + # 1. get checkpoint + if checkpoint_path is not None and checkpoint_path != '': + logger.warning( + f'evaluating with specific model: {checkpoint_path}') + eval_checkpoint = checkpoint_path + else: + if self.avg_checkpoint is None: + avg_num = kwargs.get('average_num', 5) + self.avg_checkpoint = os.path.join(self.work_dir, + f'avg_{avg_num}.pt') + logger.warning( + f'default average model not exist: {self.avg_checkpoint}') + avg_kwargs = dict( + dst_model=self.avg_checkpoint, + src_path=self.work_dir, + val_best=True, + avg_num=avg_num, + ) + self.avg_checkpoint = average_model(**avg_kwargs) + + model_cvt = self.build_model(self.configs) + kaldi_cvt = convert_to_kaldi( + model_cvt, + self.avg_checkpoint, + self.work_dir, + ) + logger.warning(f'average convert to kaldi: {kaldi_cvt}') + + eval_checkpoint = self.avg_checkpoint + logger.warning( + f'evaluating with average model: {self.avg_checkpoint}') + + # 2. get test data and trans + if kwargs.get('test_data', None) is not None and \ + kwargs.get('trans_data', None) is not None: + logger.warning('evaluating with specific data and transcription') + test_data = kwargs['test_data'] + trans_data = kwargs['trans_data'] + else: + logger.warning( + 'evaluating with cross validation data during training') + test_data = self.cv_data + trans_data = self.trans_data + logger.warning(f'test data: {test_data}') + logger.warning(f'trans data: {trans_data}') + + # 3. 
prepare dataset and dataloader + test_conf = copy.deepcopy(self.configs['preprocessor']) + test_conf['filter_conf']['max_length'] = 102400 + test_conf['filter_conf']['min_length'] = 0 + test_conf['speed_perturb'] = False + test_conf['spec_aug'] = False + test_conf['shuffle'] = False + test_conf['feature_extraction_conf']['dither'] = 0.0 + if kwargs.get('batch_size', None) is not None: + test_conf['batch_conf']['batch_size'] = kwargs['batch_size'] + + test_dataset = kws_nearfield_dataset(test_data, trans_data, test_conf, + self.token_table, + self.lexicon_table, False) + test_dataloader = DataLoader( + test_dataset, + batch_size=None, + pin_memory=kwargs.get('pin_memory', False), + persistent_workers=True, + num_workers=self.configs.evaluation.dataloader.workers_per_gpu, + prefetch_factor=self.configs.evaluation.dataloader.get( + 'prefetch', 2)) + + # 4. parse keywords tokens + assert kwargs.get('keywords', + None) is not None, 'at least one keyword is needed' + keywords_str = kwargs['keywords'] + keywords_list = keywords_str.strip().replace(' ', '').split(',') + keywords_token = {} + keywords_tokenset = {0} + for keyword in keywords_list: + ids = query_tokens_id(keyword, self.token_table, + self.lexicon_table) + keywords_token[keyword] = {} + keywords_token[keyword]['token_id'] = ids + keywords_token[keyword]['token_str'] = ''.join('%s ' % str(i) + for i in ids) + [keywords_tokenset.add(i) for i in ids] + logger.warning(f'Token set is: {keywords_tokenset}') + + # 5. build model and load checkpoint + # support assign specific gpu device + os.environ['CUDA_VISIBLE_DEVICES'] = str(kwargs.get('gpu', -1)) + use_cuda = kwargs.get('gpu', -1) >= 0 and torch.cuda.is_available() + + if kwargs.get('jit_model', None): + model = torch.jit.load(eval_checkpoint) + # For script model, only cpu is supported. + device = torch.device('cpu') + else: + # Init kws model from configs + model = self.build_model(self.configs) + load_checkpoint(eval_checkpoint, model) + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + model.eval() + + testing_config = {} + if kwargs.get('test_dir', None) is not None: + testing_config['test_dir'] = kwargs['test_dir'] + else: + base_name = os.path.basename(eval_checkpoint) + testing_config['test_dir'] = os.path.join(self.work_dir, + 'test_' + base_name) + self.test_dir = testing_config['test_dir'] + if not os.path.exists(self.test_dir): + os.makedirs(self.test_dir) + + # 6. executing evaluation and get score file + logger.info('Start evaluating...') + totaltime = datetime.datetime.now() + score_file = executor_test(model, test_dataloader, device, + keywords_token, keywords_tokenset, + testing_config) + totaltime = datetime.datetime.now() - totaltime + logger.info('Total time spent: {:.2f} hours'.format( + totaltime.total_seconds() / 3600.0)) + + # 7. compute det statistic file with score file + det_kwargs = dict( + keywords=keywords_str, + test_data=test_data, + trans_data=trans_data, + score_file=score_file, + ) + det_results = compute_det(**det_kwargs) + print(det_results) + + def build_model(self, configs) -> nn.Module: + """ Instantiate a pytorch model and return. + + By default, we will create a model using config from configuration file. You can + override this method in a subclass. 
+ + """ + model = Model.from_pretrained( + self.model_dir, cfg_dict=configs, training=True) + if isinstance(model, TorchModel) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + def get_dist_info(self): + if os.getenv('RANK', None) is None: + os.environ['RANK'] = '0' + if os.getenv('LOCAL_RANK', None) is None: + os.environ['LOCAL_RANK'] = '0' + if os.getenv('WORLD_SIZE', None) is None: + os.environ['WORLD_SIZE'] = '1' + if os.getenv('MASTER_ADDR', None) is None: + os.environ['MASTER_ADDR'] = 'localhost' + if os.getenv('MASTER_PORT', None) is None: + os.environ['MASTER_PORT'] = '29500' + + self.rank = int(os.environ['RANK']) + self.local_rank = int(os.environ['LOCAL_RANK']) + self.world_size = int(os.environ['WORLD_SIZE']) + self.master_addr = os.environ['MASTER_ADDR'] + self.master_port = os.environ['MASTER_PORT'] + + init_dist(self.launcher, self.dist_backend) + self.rank, self.world_size = get_dist_info() + self.local_rank = get_local_rank() diff --git a/modelscope/trainers/audio/kws_utils/__init__.py b/modelscope/trainers/audio/kws_utils/__init__.py new file mode 100644 index 00000000..5e3e009f --- /dev/null +++ b/modelscope/trainers/audio/kws_utils/__init__.py @@ -0,0 +1,48 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + print('TYPE_CHECKING...') + from .batch_utils import (executor_train, executor_cv, executor_test, + token_score_filter, is_sublist, ctc_loss, + ctc_prefix_beam_search) + from .det_utils import (load_data_and_score, load_stats_file, compute_det, + plot_det) + from .model_utils import (count_parameters, load_checkpoint, + save_checkpoint, average_model, convert_to_kaldi, + convert_to_pytorch) + from .file_utils import (read_lists, make_pair, read_token, read_lexicon, + query_tokens_id) + from .runtime_utils import make_runtime_res + +else: + _import_structure = { + 'batch_utils': [ + 'executor_train', 'executor_cv', 'executor_test', + 'token_score_filter', 'is_sublist', 'ctc_loss', + 'ctc_prefix_beam_search' + ], + 'det_utils': + ['load_data_and_score', 'load_stats_file', 'compute_det', 'plot_det'], + 'model_utils': [ + 'count_parameters', 'load_checkpoint', 'save_checkpoint', + 'average_model', 'convert_to_kaldi', 'convert_to_pytorch' + ], + 'file_utils': [ + 'read_lists', 'make_pair', 'read_token', 'read_lexicon', + 'query_tokens_id' + ], + 'runtime_utils': ['make_runtime_res'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/trainers/audio/kws_utils/batch_utils.py b/modelscope/trainers/audio/kws_utils/batch_utils.py new file mode 100644 index 00000000..8dc866e8 --- /dev/null +++ b/modelscope/trainers/audio/kws_utils/batch_utils.py @@ -0,0 +1,365 @@ +# Copyright (c) 2021 Binbin Zhang +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import datetime +import math +import os +import sys +from collections import defaultdict +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch.distributed import ReduceOp +from torch.nn.utils import clip_grad_norm_ + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +# torch.set_printoptions(threshold=np.inf) + + +def executor_train(model, optimizer, data_loader, device, writer, args): + ''' Train one epoch + ''' + model.train() + clip = args.get('grad_clip', 50.0) + log_interval = args.get('log_interval', 10) + epoch = args.get('epoch', 0) + + rank = args.get('rank', 0) + local_rank = args.get('local_rank', 0) + world_size = args.get('world_size', 1) + + # [For distributed] Because iteration counts are not always equals between + # processes, send stop-flag to the other processes if iterator is finished + iterator_stop = torch.tensor(0).to(device) + + for batch_idx, batch in enumerate(data_loader): + if world_size > 1: + dist.all_reduce(iterator_stop, ReduceOp.SUM) + if iterator_stop > 0: + break + + key, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + if target_lengths is not None: + target_lengths = target_lengths.to(device) + num_utts = feats_lengths.size(0) + if num_utts == 0: + continue + logits, _ = model(feats) + loss = ctc_loss(logits, target, feats_lengths, target_lengths) + optimizer.zero_grad() + loss.backward() + grad_norm = clip_grad_norm_(model.parameters(), clip) + if torch.isfinite(grad_norm): + optimizer.step() + if batch_idx % log_interval == 0: + logger.info( + 'RANK {}/{}/{} TRAIN Batch {}/{} size {} loss {:.6f}'.format( + world_size, rank, local_rank, epoch, batch_idx, num_utts, + loss.item())) + else: + iterator_stop.fill_(1) + if world_size > 1: + dist.all_reduce(iterator_stop, ReduceOp.SUM) + + +def executor_cv(model, data_loader, device, args): + ''' Cross validation on + ''' + model.eval() + log_interval = args.get('log_interval', 10) + epoch = args.get('epoch', 0) + # in order to avoid division by 0 + num_seen_utts = 1 + total_loss = 0.0 + # [For distributed] Because iteration counts are not always equals between + # processes, send stop-flag to the other processes if iterator is finished + iterator_stop = torch.tensor(0).to(device) + counter = torch.zeros((3, ), device=device) + + rank = args.get('rank', 0) + local_rank = args.get('local_rank', 0) + world_size = args.get('world_size', 1) + + with torch.no_grad(): + for batch_idx, batch in enumerate(data_loader): + if world_size > 1: + dist.all_reduce(iterator_stop, ReduceOp.SUM) + if iterator_stop > 0: + break + + key, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + if target_lengths is not None: + target_lengths = target_lengths.to(device) + num_utts = feats_lengths.size(0) + if num_utts == 0: + continue + logits, _ = model(feats) + loss = ctc_loss(logits, target, feats_lengths, target_lengths) + if torch.isfinite(loss): + num_seen_utts += num_utts + total_loss += loss.item() * num_utts + counter[0] += loss.item() * num_utts + counter[1] += num_utts + + if batch_idx % log_interval == 0: + logger.info( + 'RANK {}/{}/{} CV Batch {}/{} size {} loss {:.6f} history loss {:.6f}' + .format(world_size, rank, local_rank, epoch, batch_idx, + num_utts, loss.item(), total_loss / 
num_seen_utts)) + else: + iterator_stop.fill_(1) + if world_size > 1: + dist.all_reduce(iterator_stop, ReduceOp.SUM) + + if world_size > 1: + dist.all_reduce(counter, ReduceOp.SUM) + logger.info('Total utts number is {}'.format(counter[1])) + counter = counter.to('cpu') + + return counter[0].item() / counter[1].item() + + +def executor_test(model, data_loader, device, keywords_token, + keywords_tokenset, args): + ''' Test model with decoder + ''' + assert args.get('test_dir', None) is not None, \ + 'Please config param: test_dir, to store score file' + score_abs_path = os.path.join(args['test_dir'], 'score.txt') + log_interval = args.get('log_interval', 10) + + infer_seconds = 0.0 + decode_seconds = 0.0 + with torch.no_grad(), open(score_abs_path, 'w', encoding='utf8') as fout: + for batch_idx, batch in enumerate(data_loader): + batch_start_time = datetime.datetime.now() + + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + feats_lengths = feats_lengths.to(device) + if target_lengths is not None: + target_lengths = target_lengths.to(device) + num_utts = feats_lengths.size(0) + if num_utts == 0: + continue + + logits, _ = model(feats) + logits = logits.softmax(2) # (1, maxlen, vocab_size) + logits = logits.cpu() + + infer_end_time = datetime.datetime.now() + for i in range(len(keys)): + key = keys[i] + score = logits[i][:feats_lengths[i]] + hyps = ctc_prefix_beam_search(score, feats_lengths[i], + keywords_tokenset) + + hit_keyword = None + hit_score = 1.0 + # start = 0; end = 0 + for one_hyp in hyps: + prefix_ids = one_hyp[0] + # path_score = one_hyp[1] + prefix_nodes = one_hyp[2] + assert len(prefix_ids) == len(prefix_nodes) + for word in keywords_token.keys(): + lab = keywords_token[word]['token_id'] + offset = is_sublist(prefix_ids, lab) + if offset != -1: + hit_keyword = word + # start = prefix_nodes[offset]['frame'] + # end = prefix_nodes[offset+len(lab)-1]['frame'] + for idx in range(offset, offset + len(lab)): + hit_score *= prefix_nodes[idx]['prob'] + break + if hit_keyword is not None: + hit_score = math.sqrt(hit_score) + break + + if hit_keyword is not None: + # fout.write('{} detected [{:.2f} {:.2f}] {} {:.3f}\n'\ + # .format(key, start*0.03, end*0.03, hit_keyword, hit_score)) + fout.write('{} detected {} {:.3f}\n'.format( + key, hit_keyword, hit_score)) + else: + fout.write('{} rejected\n'.format(key)) + + decode_end_time = datetime.datetime.now() + infer_seconds += (infer_end_time + - batch_start_time).total_seconds() + decode_seconds += (decode_end_time + - infer_end_time).total_seconds() + + if batch_idx % log_interval == 0: + logger.info('Progress batch {}'.format(batch_idx)) + sys.stdout.flush() + logger.info( + 'Total infer cost {:.2f} mins, decode cost {:.2f} mins'.format( + infer_seconds / 60.0, + decode_seconds / 60.0, + )) + + return score_abs_path + + +def is_sublist(main_list, check_list): + if len(main_list) < len(check_list): + return -1 + + if len(main_list) == len(check_list): + return 0 if main_list == check_list else -1 + + for i in range(len(main_list) - len(check_list)): + if main_list[i] == check_list[0]: + for j in range(len(check_list)): + if main_list[i + j] != check_list[j]: + break + else: + return i + else: + return -1 + + +def ctc_loss(logits: torch.Tensor, target: torch.Tensor, + logits_lengths: torch.Tensor, target_lengths: torch.Tensor): + """ CTC Loss + Args: + logits: (B, D), D is the number of keywords plus 1 (non-keyword) + target: (B) + logits_lengths: (B) + target_lengths: (B) + Returns: + (float): loss 
of current batch + """ + + # logits: (B, L, D) -> (L, B, D) + logits = logits.transpose(0, 1) + logits = logits.log_softmax(2) + loss = F.ctc_loss( + logits, target, logits_lengths, target_lengths, reduction='sum') + loss = loss / logits.size(1) + + return loss + + +def ctc_prefix_beam_search( + logits: torch.Tensor, + logits_lengths: torch.Tensor, + keywords_tokenset: set = None, + score_beam_size: int = 3, + path_beam_size: int = 20, +) -> Tuple[List[List[int]], torch.Tensor]: + """ CTC prefix beam search inner implementation + + Args: + logits (torch.Tensor): (1, max_len, vocab_size) + logits_lengths (torch.Tensor): (1, ) + keywords_tokenset (set): token set for filtering score + score_beam_size (int): beam size for score + path_beam_size (int): beam size for path + + Returns: + List[List[int]]: nbest results + """ + maxlen = logits.size(0) + # ctc_probs = logits.softmax(1) # (1, maxlen, vocab_size) + ctc_probs = logits + + cur_hyps = [(tuple(), (1.0, 0.0, []))] + + # 2. CTC beam search step by step + for t in range(0, maxlen): + probs = ctc_probs[t] # (vocab_size,) + # key: prefix, value (pb, pnb), default value(-inf, -inf) + next_hyps = defaultdict(lambda: (0.0, 0.0, [])) + + # 2.1 First beam prune: select topk best + top_k_probs, top_k_index = probs.topk( + score_beam_size) # (score_beam_size,) + + # filter prob score that is too small + filter_probs = [] + filter_index = [] + for prob, idx in zip(top_k_probs.tolist(), top_k_index.tolist()): + if prob > 0.05 and idx in keywords_tokenset: + filter_probs.append(prob) + filter_index.append(idx) + + if len(filter_index) == 0: + continue + + for s in filter_index: + ps = probs[s].item() + + for prefix, (pb, pnb, cur_nodes) in cur_hyps: + last = prefix[-1] if len(prefix) > 0 else None + if s == 0: # blank + n_pb, n_pnb, nodes = next_hyps[prefix] + n_pb = n_pb + pb * ps + pnb * ps + nodes = cur_nodes.copy() + next_hyps[prefix] = (n_pb, n_pnb, nodes) + elif s == last: + if not math.isclose(pnb, 0.0, abs_tol=0.000001): + # Update *ss -> *s; + n_pb, n_pnb, nodes = next_hyps[prefix] + n_pnb = n_pnb + pnb * ps + nodes = cur_nodes.copy() + if ps > nodes[-1]['prob']: # update frame and prob + nodes[-1]['prob'] = ps + nodes[-1]['frame'] = t + next_hyps[prefix] = (n_pb, n_pnb, nodes) + + if not math.isclose(pb, 0.0, abs_tol=0.000001): + # Update *s-s -> *ss, - is for blank + n_prefix = prefix + (s, ) + n_pb, n_pnb, nodes = next_hyps[n_prefix] + n_pnb = n_pnb + pb * ps + nodes = cur_nodes.copy() + nodes.append(dict(token=s, frame=t, + prob=ps)) # to record token prob + next_hyps[n_prefix] = (n_pb, n_pnb, nodes) + else: + n_prefix = prefix + (s, ) + n_pb, n_pnb, nodes = next_hyps[n_prefix] + if nodes: + if ps > nodes[-1]['prob']: # update frame and prob + nodes[-1]['prob'] = ps + nodes[-1]['frame'] = t + else: + nodes = cur_nodes.copy() + nodes.append(dict(token=s, frame=t, + prob=ps)) # to record token prob + n_pnb = n_pnb + pb * ps + pnb * ps + next_hyps[n_prefix] = (n_pb, n_pnb, nodes) + + # 2.2 Second beam prune + next_hyps = sorted( + next_hyps.items(), key=lambda x: (x[1][0] + x[1][1]), reverse=True) + + cur_hyps = next_hyps[:path_beam_size] + + hyps = [(y[0], y[1][0] + y[1][1], y[1][2]) for y in cur_hyps] + return hyps diff --git a/modelscope/trainers/audio/kws_utils/det_utils.py b/modelscope/trainers/audio/kws_utils/det_utils.py new file mode 100644 index 00000000..97b0c2de --- /dev/null +++ b/modelscope/trainers/audio/kws_utils/det_utils.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com) +# 2022 Shaoqing 
Yu(954793264@qq.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os + +import json +import matplotlib.font_manager as fm +import matplotlib.pyplot as plt +import numpy as np +import torchaudio + +from modelscope.utils.logger import get_logger +from .file_utils import make_pair, read_lists + +logger = get_logger() + +font = fm.FontProperties(size=15) + + +def load_data_and_score(keywords_list, data_file, trans_file, score_file): + # score_table: {uttid: [keywordlist]} + score_table = {} + with open(score_file, 'r', encoding='utf8') as fin: + # read score file and store in table + for line in fin: + arr = line.strip().split() + key = arr[0] + is_detected = arr[1] + if is_detected == 'detected': + if key not in score_table: + score_table.update( + {key: { + 'kw': arr[2], + 'confi': float(arr[3]) + }}) + else: + if key not in score_table: + score_table.update({key: {'kw': 'unknown', 'confi': -1.0}}) + + wav_lists = read_lists(data_file) + trans_lists = read_lists(trans_file) + data_lists = make_pair(wav_lists, trans_lists) + + # build empty structure for keyword-filler infos + keyword_filler_table = {} + for keyword in keywords_list: + keyword_filler_table[keyword] = {} + keyword_filler_table[keyword]['keyword_table'] = {} + keyword_filler_table[keyword]['keyword_duration'] = 0.0 + keyword_filler_table[keyword]['filler_table'] = {} + keyword_filler_table[keyword]['filler_duration'] = 0.0 + + for obj in data_lists: + assert 'key' in obj + assert 'wav' in obj + assert 'txt' in obj + key = obj['key'] + wav_file = obj['wav'] + txt = obj['txt'] + assert key in score_table + + waveform, rate = torchaudio.load(wav_file) + frames = len(waveform[0]) + duration = frames / float(rate) + + for keyword in keywords_list: + if txt.find(keyword) != -1: + if keyword == score_table[key]['kw']: + keyword_filler_table[keyword]['keyword_table'].update( + {key: score_table[key]['confi']}) + keyword_filler_table[keyword][ + 'keyword_duration'] += duration + else: + # uttrance detected but not match this keyword + keyword_filler_table[keyword]['keyword_table'].update( + {key: -1.0}) + keyword_filler_table[keyword][ + 'keyword_duration'] += duration + else: + keyword_filler_table[keyword]['filler_table'].update( + {key: score_table[key]['confi']}) + keyword_filler_table[keyword]['filler_duration'] += duration + + return keyword_filler_table + + +def load_stats_file(stats_file): + values = [] + with open(stats_file, 'r', encoding='utf8') as fin: + for line in fin: + arr = line.strip().split() + threshold, recall, fa_rate, fa_per_hour = arr + values.append([float(fa_per_hour), (1 - float(recall)) * 100]) + values.reverse() + return np.array(values) + + +def compute_det(**kwargs): + assert kwargs.get('keywords', None) is not None, \ + 'Please config param: keywords, preset keyword str, split with \',\'' + keywords = kwargs['keywords'] + + assert kwargs.get('test_data', None) is not None, \ + 'Please config param: test_data, test waves in list' + test_data = kwargs['test_data'] + + assert 
kwargs.get('trans_data', None) is not None, \ + 'Please config param: trans_data, transcription of test waves' + trans_data = kwargs['trans_data'] + + assert kwargs.get('score_file', None) is not None, \ + 'Please config param: score_file, the output scores of test data' + score_file = kwargs['score_file'] + + if kwargs.get('stats_dir', None) is not None: + stats_dir = kwargs['stats_dir'] + else: + stats_dir = os.path.dirname(score_file) + logger.info(f'store all keyword\'s stats file in {stats_dir}') + if not os.path.exists(stats_dir): + os.makedirs(stats_dir) + + score_step = kwargs.get('score_step', 0.001) + + keywords_list = keywords.replace(' ', '').strip().split(',') + keyword_filler_table = load_data_and_score(keywords_list, test_data, + trans_data, score_file) + + stats_files = {} + for keyword in keywords_list: + keyword_dur = keyword_filler_table[keyword]['keyword_duration'] + keyword_num = len(keyword_filler_table[keyword]['keyword_table']) + filler_dur = keyword_filler_table[keyword]['filler_duration'] + filler_num = len(keyword_filler_table[keyword]['filler_table']) + assert keyword_num > 0, 'Can\'t compute det for {} without positive sample' + assert filler_num > 0, 'Can\'t compute det for {} without negative sample' + + logger.info('Computing det for {}'.format(keyword)) + logger.info(' Keyword duration: {} Hours, wave number: {}'.format( + keyword_dur / 3600.0, keyword_num)) + logger.info(' Filler duration: {} Hours'.format(filler_dur / 3600.0)) + + stats_file = os.path.join(stats_dir, 'stats_' + keyword + '.txt') + with open(stats_file, 'w', encoding='utf8') as fout: + threshold = 0.0 + while threshold <= 1.0: + num_false_reject = 0 + num_true_detect = 0 + # transverse the all keyword_table + for key, confi in keyword_filler_table[keyword][ + 'keyword_table'].items(): + if confi < threshold: + num_false_reject += 1 + else: + num_true_detect += 1 + + num_false_alarm = 0 + # transverse the all filler_table + for key, confi in keyword_filler_table[keyword][ + 'filler_table'].items(): + if confi >= threshold: + num_false_alarm += 1 + # print(f'false alarm: {keyword}, {key}, {confi}') + + # false_reject_rate = num_false_reject / keyword_num + true_detect_rate = num_true_detect / keyword_num + + num_false_alarm = max(num_false_alarm, 1e-6) + false_alarm_per_hour = num_false_alarm / (filler_dur / 3600.0) + false_alarm_rate = num_false_alarm / filler_num + + fout.write('{:.3f} {:.6f} {:.6f} {:.6f}\n'.format( + threshold, true_detect_rate, false_alarm_rate, + false_alarm_per_hour)) + threshold += score_step + + stats_files[keyword] = stats_file + + return stats_files + + +def plot_det(**kwargs): + assert kwargs.get('dets_dir', None) is not None, \ + 'Please config param: dets_dir, to load det files' + dets_dir = kwargs['dets_dir'] + + det_title = kwargs.get('det_title', 'DetCurve') + + assert kwargs.get('figure_file', None) is not None, \ + 'Please config param: figure_file, path to save det curve' + figure_file = kwargs['figure_file'] + + xlim = kwargs.get('xlim', '[0,2]') + # xstep = kwargs.get('xstep', '1') + ylim = kwargs.get('ylim', '[15,30]') + # ystep = kwargs.get('ystep', '5') + + plt.figure(dpi=200) + plt.rcParams['xtick.direction'] = 'in' + plt.rcParams['ytick.direction'] = 'in' + plt.rcParams['font.size'] = 12 + + for file in glob.glob(f'{dets_dir}/*stats*.txt'): + logger.info(f'reading det data from {file}') + label = os.path.basename(file).split('.')[0] + values = load_stats_file(file) + plt.plot(values[:, 0], values[:, 1], label=label) + + xlim_splits = 
xlim.strip().replace('[', '').replace(']', '').split(',') + assert len(xlim_splits) == 2 + ylim_splits = ylim.strip().replace('[', '').replace(']', '').split(',') + assert len(ylim_splits) == 2 + + plt.xlim(float(xlim_splits[0]), float(xlim_splits[1])) + plt.ylim(float(ylim_splits[0]), float(ylim_splits[1])) + + # plt.xticks(range(0, xlim + x_step, x_step)) + # plt.yticks(range(0, ylim + y_step, y_step)) + plt.xlabel('False Alarm Per Hour') + plt.ylabel('False Rejection Rate (\\%)') + plt.title(det_title, fontproperties=font) + plt.grid(linestyle='--') + # plt.legend(loc='best', fontsize=6) + plt.legend(loc='upper right', fontsize=5) + # plt.show() + plt.savefig(figure_file) diff --git a/modelscope/trainers/audio/kws_utils/file_utils.py b/modelscope/trainers/audio/kws_utils/file_utils.py new file mode 100644 index 00000000..95a37153 --- /dev/null +++ b/modelscope/trainers/audio/kws_utils/file_utils.py @@ -0,0 +1,112 @@ +# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +remove_str = ['!sil', '(noise)', '(noise', 'noise)', '·', '’'] + + +def read_lists(list_file): + lists = [] + with open(list_file, 'r', encoding='utf8') as fin: + for line in fin: + lists.append(line.strip()) + return lists + + +def make_pair(wav_lists, trans_lists): + trans_table = {} + for line in trans_lists: + arr = line.strip().replace('\t', ' ').split() + if len(arr) < 2: + logger.debug('invalid line in trans file: {}'.format(line.strip())) + continue + + trans_table[arr[0]] = line.replace(arr[0], '')\ + .replace(' ', '')\ + .replace('(noise)', '')\ + .replace('noise)', '')\ + .replace('(noise', '')\ + .replace('!sil', '')\ + .replace('·', '')\ + .replace('’', '').strip() + + lists = [] + for line in wav_lists: + arr = line.strip().replace('\t', ' ').split() + if len(arr) == 2 and arr[0] in trans_table: + lists.append( + dict( + key=arr[0], + txt=trans_table[arr[0]], + wav=arr[1], + sample_rate=16000)) + else: + logger.debug("can't find corresponding trans for key: {}".format( + arr[0])) + continue + + return lists + + +def read_token(token_file): + tokens_table = {} + with open(token_file, 'r', encoding='utf8') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + tokens_table[arr[0]] = int(arr[1]) - 1 + fin.close() + return tokens_table + + +def read_lexicon(lexicon_file): + lexicon_table = {} + with open(lexicon_file, 'r', encoding='utf8') as fin: + for line in fin: + arr = line.strip().replace('\t', ' ').split() + assert len(arr) >= 2 + lexicon_table[arr[0]] = arr[1:] + fin.close() + return lexicon_table + + +def query_tokens_id(txt, symbol_table, lexicon_table): + label = tuple() + tokens = [] + + parts = [txt.replace(' ', '').strip()] + for part in parts: + for ch in part: + if ch == ' ': + ch = '▁' + tokens.append(ch) + + for ch in tokens: + if ch in symbol_table: + label = label + (symbol_table[ch], ) + elif ch in lexicon_table: + for sub_ch in 
lexicon_table[ch]: + if sub_ch in symbol_table: + label = label + (symbol_table[sub_ch], ) + else: + label = label + (symbol_table[''], ) + else: + label = label + (symbol_table[''], ) + + return label diff --git a/modelscope/trainers/audio/kws_utils/model_utils.py b/modelscope/trainers/audio/kws_utils/model_utils.py new file mode 100644 index 00000000..c2224efe --- /dev/null +++ b/modelscope/trainers/audio/kws_utils/model_utils.py @@ -0,0 +1,137 @@ +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# Author: di.wu@mobvoi.com (DI WU) + +import glob +import os +import re +from shutil import copyfile + +import numpy as np +import torch +import yaml + +from modelscope.utils.checkpoint import load_checkpoint, save_checkpoint +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def average_model(**kwargs): + assert kwargs.get('dst_model', None) is not None, \ + 'Please config param: dst_model, to save averaged model' + dst_model = kwargs['dst_model'] + + assert kwargs.get('src_path', None) is not None, \ + 'Please config param: src_path, path of checkpoints to be averaged' + src_path = kwargs['src_path'] + + val_best = kwargs.get('val_best', + 'True') # average with best loss or final models + + avg_num = kwargs.get('avg_num', 5) # nums for averaging model + + min_epoch = kwargs.get('min_epoch', + 5) # min epoch used for averaging model + max_epoch = kwargs.get('max_epoch', + 65536) # max epoch used for averaging model + + val_scores = [] + if val_best: + yamls = glob.glob('{}/[!config]*.yaml'.format(src_path)) + for y in yamls: + with open(y, 'r') as f: + dic_yaml = yaml.load(f, Loader=yaml.FullLoader) + print(y, dic_yaml) + loss = dic_yaml['cv_loss'] + epoch = dic_yaml['epoch'] + if epoch >= min_epoch and epoch <= max_epoch: + val_scores += [[epoch, loss]] + val_scores = np.array(val_scores) + sort_idx = np.argsort(val_scores[:, -1]) + sorted_val_scores = val_scores[sort_idx][::1] + logger.info('best val scores = ' + str(sorted_val_scores[:avg_num, 1])) + logger.info('selected epochs = ' + + str(sorted_val_scores[:avg_num, 0].astype(np.int64))) + path_list = [ + src_path + '/{}.pt'.format(int(epoch)) + for epoch in sorted_val_scores[:avg_num, 0] + ] + else: + path_list = glob.glob('{}/[!avg][!final]*.pt'.format(src_path)) + path_list = sorted(path_list, key=os.path.getmtime) + path_list = path_list[-avg_num:] + + logger.info(path_list) + avg = None + + # assert num == len(path_list) + if avg_num > len(path_list): + logger.info( + 'insufficient epochs for averaging, exist num:{}, need:{}'.format( + len(path_list), avg_num)) + logger.info('select epoch on best val:{}'.format(path_list[0])) + path_list = [path_list[0]] + + for path in path_list: + logger.info('Processing {}'.format(path)) + states = torch.load(path, map_location=torch.device('cpu')) + if avg is None: + avg = states + else: + for k in avg.keys(): + avg[k] += states[k] + # average + for k in avg.keys(): + if avg[k] is not None: + # pytorch 1.6 use true_divide instead of /= + # avg[k] = torch.true_divide(avg[k], num) + avg[k] = torch.true_divide(avg[k], len(path_list)) + logger.info('Saving to {}'.format(dst_model)) + torch.save(avg, dst_model) + + return dst_model + + +def convert_to_kaldi( + model: torch.nn.Module, + network_file: str, + model_dir: str, +): + copyfile(network_file, os.path.join(model_dir, 'origin.torch.pt')) + load_checkpoint(network_file, model) + + kaldi_text = 
os.path.join(model_dir, 'convert.kaldi.txt') + with open(kaldi_text, 'w', encoding='utf8') as fout: + nnet_desp = model.to_kaldi_net() + fout.write(nnet_desp) + fout.close() + + return kaldi_text + + +def convert_to_pytorch( + model: torch.nn.Module, + network_file: str, + model_dir: str, +): + num_params = count_parameters(model) + logger.info('the number of model params: {}'.format(num_params)) + + copyfile(network_file, os.path.join(model_dir, 'origin.kaldi.txt')) + model.to_pytorch_net(network_file) + + save_model_path = os.path.join(model_dir, 'convert.torch.pt') + save_checkpoint(model, save_model_path, None, None, None, False) + + logger.info('convert torch format back to kaldi for recheck...') + kaldi_text = os.path.join(model_dir, 'convert.kaldi.txt') + with open(kaldi_text, 'w', encoding='utf8') as fout: + nnet_desp = model.to_kaldi_net() + fout.write(nnet_desp) + fout.close() + + return save_model_path diff --git a/modelscope/trainers/audio/kws_utils/runtime_utils.py b/modelscope/trainers/audio/kws_utils/runtime_utils.py new file mode 100644 index 00000000..38f4fdd4 --- /dev/null +++ b/modelscope/trainers/audio/kws_utils/runtime_utils.py @@ -0,0 +1,85 @@ +import codecs +import os +import re +import stat +import sys +from collections import OrderedDict +from shutil import copyfile + +import json + +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def make_runtime_res(model_dir, dest_path, kaldi_text, keywords): + if not os.path.exists(dest_path): + os.makedirs(dest_path) + logger.info(f'making runtime resource in {dest_path} for {keywords}') + + # keywords split with ',', like 'keyword1,keyword2, ...' + keywords_list = keywords.strip().replace(' ', '').split(',') + + kaldi_path = os.path.join(model_dir, 'train') + kaldi_tool = os.path.join(model_dir, 'train/nnet-copy') + kaldi_net = os.path.join(dest_path, 'kwsr.net') + os.environ['PATH'] = f'{kaldi_path}:$PATH' + os.environ['LD_LIBRARY_PATH'] = f'{kaldi_path}:$LD_LIBRARYPATH' + assert os.path.exists(kaldi_tool) + os.chmod(kaldi_tool, stat.S_IRWXU | stat.S_IRGRP | stat.S_IROTH) + os.system(f'{kaldi_tool} --binary=true {kaldi_text} {kaldi_net}') + + copyfile( + os.path.join(model_dir, 'kwsr.ccl'), + os.path.join(dest_path, 'kwsr.ccl')) + copyfile( + os.path.join(model_dir, 'kwsr.cfg'), + os.path.join(dest_path, 'kwsr.cfg')) + copyfile( + os.path.join(model_dir, 'kwsr.gbg'), + os.path.join(dest_path, 'kwsr.gbg')) + copyfile( + os.path.join(model_dir, 'kwsr.lex'), + os.path.join(dest_path, 'kwsr.lex')) + copyfile( + os.path.join(model_dir, 'kwsr.mdl'), + os.path.join(dest_path, 'kwsr.mdl')) + copyfile( + os.path.join(model_dir, 'kwsr.mvn'), + os.path.join(dest_path, 'kwsr.mvn')) + copyfile( + os.path.join(model_dir, 'kwsr.phn'), + os.path.join(dest_path, 'kwsr.phn')) + copyfile( + os.path.join(model_dir, 'kwsr.tree'), + os.path.join(dest_path, 'kwsr.tree')) + copyfile( + os.path.join(model_dir, 'kwsr.prior'), + os.path.join(dest_path, 'kwsr.prior')) + + # build keywords grammar + keywords_grammar = os.path.join(dest_path, 'keywords.json') + + keywords_root = {} + keywords_root['word_list'] = [] + for keyword in keywords_list: + one_dict = OrderedDict() + one_dict['name'] = keyword + one_dict['type'] = 'wakeup' + one_dict['activation'] = True + one_dict['is_main'] = True + one_dict['lm_boost'] = 0.0 + one_dict['am_boost'] = 0.0 + one_dict['threshold1'] = 0.0 + one_dict['threshold2'] = -1.0 + one_dict['subseg_threshold'] = -0.6 + one_dict['high_threshold'] = 90.0 + one_dict['min_dur'] = 0.4 + 
one_dict['max_dur'] = 2.5 + one_dict['cc_name'] = 'commoncc' + keywords_root['word_list'].append(one_dict) + + with codecs.open(keywords_grammar, 'w', encoding='utf-8') as fh: + json.dump(keywords_root, fh, indent=4, ensure_ascii=False) + fh.close() diff --git a/modelscope/trainers/audio/separation_trainer.py b/modelscope/trainers/audio/separation_trainer.py new file mode 100644 index 00000000..c425325c --- /dev/null +++ b/modelscope/trainers/audio/separation_trainer.py @@ -0,0 +1,561 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import csv +import os +from typing import Dict, Optional, Union + +import numpy as np +import speechbrain as sb +import speechbrain.nnet.schedulers as schedulers +import torch +import torch.nn.functional as F +import torchaudio +from torch.cuda.amp import autocast +from torch.utils.data import Dataset +from tqdm import tqdm + +from modelscope.metainfo import Trainers +from modelscope.models import Model, TorchModel +from modelscope.msdatasets import MsDataset +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.device import create_device +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, + init_dist) + +EVAL_KEY = 'si-snr' + +logger = get_logger() + + +@TRAINERS.register_module(module_name=Trainers.speech_separation) +class SeparationTrainer(BaseTrainer): + """A trainer is used for speech separation. + + Args: + model: id or local path of the model + work_dir: local path to store all training outputs + cfg_file: config file of the model + train_dataset: dataset for training + eval_dataset: dataset for evaluation + model_revision: the git version of model on modelhub + """ + + def __init__(self, + model: str, + work_dir: str, + cfg_file: Optional[str] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + **kwargs): + + if isinstance(model, str): + self.model_dir = self.get_or_download_model_dir( + model, model_revision) + if cfg_file is None: + cfg_file = os.path.join(self.model_dir, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' 
+ self.model_dir = os.path.dirname(cfg_file) + + BaseTrainer.__init__(self, cfg_file) + + self.model = self.build_model() + self.work_dir = work_dir + if kwargs.get('launcher', None) is not None: + init_dist(kwargs['launcher']) + _, world_size = get_dist_info() + self._dist = world_size > 1 + + device_name = kwargs.get('device', 'gpu') + if self._dist: + local_rank = get_local_rank() + device_name = f'cuda:{local_rank}' + self.device = create_device(device_name) + + if 'max_epochs' not in kwargs: + assert hasattr( + self.cfg.train, 'max_epochs' + ), 'max_epochs is missing from the configuration file' + self._max_epochs = self.cfg.train.max_epochs + else: + self._max_epochs = kwargs['max_epochs'] + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + + hparams_file = os.path.join(self.model_dir, 'hparams.yaml') + overrides = { + 'output_folder': + self.work_dir, + 'seed': + self.cfg.train.seed, + 'lr': + self.cfg.train.optimizer.lr, + 'weight_decay': + self.cfg.train.optimizer.weight_decay, + 'clip_grad_norm': + self.cfg.train.optimizer.clip_grad_norm, + 'factor': + self.cfg.train.lr_scheduler.factor, + 'patience': + self.cfg.train.lr_scheduler.patience, + 'dont_halve_until_epoch': + self.cfg.train.lr_scheduler.dont_halve_until_epoch, + } + # load hyper params + from hyperpyyaml import load_hyperpyyaml + with open(hparams_file) as fin: + self.hparams = load_hyperpyyaml(fin, overrides=overrides) + # Create experiment directory + sb.create_experiment_directory( + experiment_directory=self.work_dir, + hyperparams_to_save=hparams_file, + overrides=overrides, + ) + + run_opts = { + 'debug': False, + 'device': 'cpu', + 'data_parallel_backend': False, + 'distributed_launch': False, + 'distributed_backend': 'nccl', + 'find_unused_parameters': False + } + if self.device.type == 'cuda': + run_opts['device'] = f'{self.device.type}:{self.device.index}' + self.epoch_counter = sb.utils.epoch_loop.EpochCounter(self._max_epochs) + self.hparams['epoch_counter'] = self.epoch_counter + self.hparams['checkpointer'].add_recoverables( + {'counter': self.epoch_counter}) + modules = self.model.as_dict() + self.hparams['checkpointer'].add_recoverables(modules) + # Brain class initialization + self.separator = Separation( + modules=modules, + opt_class=self.hparams['optimizer'], + hparams=self.hparams, + run_opts=run_opts, + checkpointer=self.hparams['checkpointer'], + ) + + def build_model(self) -> torch.nn.Module: + """ Instantiate a pytorch model and return. 
+ """ + model = Model.from_pretrained( + self.model_dir, cfg_dict=self.cfg, training=True) + if isinstance(model, TorchModel) and hasattr(model, 'model'): + return model.model + elif isinstance(model, torch.nn.Module): + return model + + def train(self, *args, **kwargs): + self.separator.fit( + self.epoch_counter, + self.train_dataset, + self.eval_dataset, + train_loader_kwargs=self.hparams['dataloader_opts'], + valid_loader_kwargs=self.hparams['dataloader_opts'], + ) + + def evaluate(self, checkpoint_path: str, *args, + **kwargs) -> Dict[str, float]: + if checkpoint_path: + self.hparams.checkpointer.checkpoints_dir = checkpoint_path + else: + self.model.load_check_point(device=self.device) + value = self.separator.evaluate( + self.eval_dataset, + test_loader_kwargs=self.hparams['dataloader_opts'], + min_key=EVAL_KEY) + return {EVAL_KEY: value} + + +class Separation(sb.Brain): + """A subclass of speechbrain.Brain implements training steps.""" + + def compute_forward(self, mix, targets, stage, noise=None): + """Forward computations from the mixture to the separated signals.""" + + # Unpack lists and put tensors in the right device + mix, mix_lens = mix + mix, mix_lens = mix.to(self.device), mix_lens.to(self.device) + + # Convert targets to tensor + targets = torch.cat( + [ + targets[i][0].unsqueeze(-1) + for i in range(self.hparams.num_spks) + ], + dim=-1, + ).to(self.device) + + # Add speech distortions + if stage == sb.Stage.TRAIN: + with torch.no_grad(): + if self.hparams.use_speedperturb or self.hparams.use_rand_shift: + mix, targets = self.add_speed_perturb(targets, mix_lens) + + mix = targets.sum(-1) + + if self.hparams.use_wavedrop: + mix = self.hparams.wavedrop(mix, mix_lens) + + if self.hparams.limit_training_signal_len: + mix, targets = self.cut_signals(mix, targets) + + # Separation + mix_w = self.modules['encoder'](mix) + est_mask = self.modules['masknet'](mix_w) + mix_w = torch.stack([mix_w] * self.hparams.num_spks) + sep_h = mix_w * est_mask + + # Decoding + est_source = torch.cat( + [ + self.modules['decoder'](sep_h[i]).unsqueeze(-1) + for i in range(self.hparams.num_spks) + ], + dim=-1, + ) + # T changed after conv1d in encoder, fix it here + T_origin = mix.size(1) + T_est = est_source.size(1) + if T_origin > T_est: + est_source = F.pad(est_source, (0, 0, 0, T_origin - T_est)) + else: + est_source = est_source[:, :T_origin, :] + + return est_source, targets + + def compute_objectives(self, predictions, targets): + """Computes the sinr loss""" + return self.hparams.loss(targets, predictions) + + # yapf: disable + def fit_batch(self, batch): + """Trains one batch""" + # Unpacking batch list + mixture = batch.mix_sig + targets = [batch.s1_sig, batch.s2_sig] + + if self.hparams.num_spks == 3: + targets.append(batch.s3_sig) + + if self.auto_mix_prec: + with autocast(): + predictions, targets = self.compute_forward( + mixture, targets, sb.Stage.TRAIN) + loss = self.compute_objectives(predictions, targets) + # hard threshold the easy dataitems + if self.hparams.threshold_byloss: + th = self.hparams.threshold + loss_to_keep = loss[loss > th] + if loss_to_keep.nelement() > 0: + loss = loss_to_keep.mean() + else: + print('loss has zero elements!!') + else: + loss = loss.mean() + + # the fix for computational problems + if loss < self.hparams.loss_upper_lim and loss.nelement() > 0: + self.scaler.scale(loss).backward() + if self.hparams.clip_grad_norm >= 0: + self.scaler.unscale_(self.optimizer) + torch.nn.utils.clip_grad_norm_( + self.modules.parameters(), + 
self.hparams.clip_grad_norm, + ) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + self.nonfinite_count += 1 + logger.info( + 'infinite loss or empty loss! it happened {} times so far - skipping this batch' + .format(self.nonfinite_count)) + loss.data = torch.tensor(0).to(self.device) + else: + predictions, targets = self.compute_forward( + mixture, targets, sb.Stage.TRAIN) + loss = self.compute_objectives(predictions, targets) + if self.hparams.threshold_byloss: + th = self.hparams.threshold + loss_to_keep = loss[loss > th] + if loss_to_keep.nelement() > 0: + loss = loss_to_keep.mean() + else: + loss = loss.mean() + # the fix for computational problems + if loss < self.hparams.loss_upper_lim and loss.nelement() > 0: + loss.backward() + if self.hparams.clip_grad_norm >= 0: + torch.nn.utils.clip_grad_norm_(self.modules.parameters(), + self.hparams.clip_grad_norm) + self.optimizer.step() + else: + self.nonfinite_count += 1 + logger.info( + 'infinite loss or empty loss! it happened {} times so far - skipping this batch' + .format(self.nonfinite_count)) + loss.data = torch.tensor(0).to(self.device) + self.optimizer.zero_grad() + return loss.detach().cpu() + # yapf: enable + + def evaluate_batch(self, batch, stage): + """Computations needed for validation/test batches""" + snt_id = batch.id + mixture = batch.mix_sig + targets = [batch.s1_sig, batch.s2_sig] + if self.hparams.num_spks == 3: + targets.append(batch.s3_sig) + + with torch.no_grad(): + predictions, targets = self.compute_forward( + mixture, targets, stage) + loss = self.compute_objectives(predictions, targets) + + # Manage audio file saving + if stage == sb.Stage.TEST and self.hparams.save_audio: + if hasattr(self.hparams, 'n_audio_to_save'): + if self.hparams.n_audio_to_save > 0: + self.save_audio(snt_id[0], mixture, targets, predictions) + self.hparams.n_audio_to_save += -1 + else: + self.save_audio(snt_id[0], mixture, targets, predictions) + + return loss.mean().detach() + + def on_stage_end(self, stage, stage_loss, epoch): + """Gets called at the end of a epoch.""" + # Compute/store important stats + stage_stats = {'si-snr': stage_loss} + if stage == sb.Stage.TRAIN: + self.train_stats = stage_stats + + # Perform end-of-iteration things, like annealing, logging, etc. 
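        # On validation: anneal the learning rate (ReduceLROnPlateau on the stage
        # loss), log train/valid stats, and keep only the checkpoint with the
        # lowest si-snr loss.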
+ if stage == sb.Stage.VALID: + # Learning rate annealing + if isinstance(self.hparams.lr_scheduler, + schedulers.ReduceLROnPlateau): + current_lr, next_lr = self.hparams.lr_scheduler( + [self.optimizer], epoch, stage_loss) + schedulers.update_learning_rate(self.optimizer, next_lr) + else: + # if we do not use the reducelronplateau, we do not change the lr + current_lr = self.hparams.optimizer.optim.param_groups[0]['lr'] + + self.hparams.train_logger.log_stats( + stats_meta={ + 'epoch': epoch, + 'lr': current_lr + }, + train_stats=self.train_stats, + valid_stats=stage_stats, + ) + self.checkpointer.save_and_keep_only( + meta={'si-snr': stage_stats['si-snr']}, + min_keys=['si-snr'], + ) + + def add_speed_perturb(self, targets, targ_lens): + """Adds speed perturbation and random_shift to the input signals""" + + min_len = -1 + recombine = False + + if self.hparams.use_speedperturb: + # Performing speed change (independently on each source) + new_targets = [] + recombine = True + + for i in range(targets.shape[-1]): + new_target = self.hparams.speedperturb(targets[:, :, i], + targ_lens) + new_targets.append(new_target) + if i == 0: + min_len = new_target.shape[-1] + else: + if new_target.shape[-1] < min_len: + min_len = new_target.shape[-1] + + if self.hparams.use_rand_shift: + # Performing random_shift (independently on each source) + recombine = True + for i in range(targets.shape[-1]): + rand_shift = torch.randint(self.hparams.min_shift, + self.hparams.max_shift, (1, )) + new_targets[i] = new_targets[i].to(self.device) + new_targets[i] = torch.roll( + new_targets[i], shifts=(rand_shift[0], ), dims=1) + + # Re-combination + if recombine: + if self.hparams.use_speedperturb: + targets = torch.zeros( + targets.shape[0], + min_len, + targets.shape[-1], + device=targets.device, + dtype=torch.float, + ) + for i, new_target in enumerate(new_targets): + targets[:, :, i] = new_targets[i][:, 0:min_len] + + mix = targets.sum(-1) + return mix, targets + + def cut_signals(self, mixture, targets): + """This function selects a random segment of a given length within the mixture. 
+ The corresponding targets are selected accordingly""" + randstart = torch.randint( + 0, + 1 + max(0, mixture.shape[1] - self.hparams.training_signal_len), + (1, ), + ).item() + targets = targets[:, randstart:randstart + + self.hparams.training_signal_len, :] + mixture = mixture[:, randstart:randstart + + self.hparams.training_signal_len] + return mixture, targets + + def reset_layer_recursively(self, layer): + """Reinitializes the parameters of the neural networks""" + if hasattr(layer, 'reset_parameters'): + layer.reset_parameters() + for child_layer in layer.modules(): + if layer != child_layer: + self.reset_layer_recursively(child_layer) + + def save_results(self, test_data): + """This script computes the SDR and SI-SNR metrics and saves + them into a csv file""" + + # This package is required for SDR computation + from mir_eval.separation import bss_eval_sources + + # Create folders where to store audio + save_file = os.path.join(self.hparams.output_folder, + 'test_results.csv') + + # Variable init + all_sdrs = [] + all_sdrs_i = [] + all_sisnrs = [] + all_sisnrs_i = [] + csv_columns = ['snt_id', 'sdr', 'sdr_i', 'si-snr', 'si-snr_i'] + + test_loader = sb.dataio.dataloader.make_dataloader( + test_data, **self.hparams.dataloader_opts) + + with open(save_file, 'w') as results_csv: + writer = csv.DictWriter(results_csv, fieldnames=csv_columns) + writer.writeheader() + + # Loop over all test sentence + with tqdm(test_loader, dynamic_ncols=True) as t: + for i, batch in enumerate(t): + + # Apply Separation + mixture, mix_len = batch.mix_sig + snt_id = batch.id + targets = [batch.s1_sig, batch.s2_sig] + if self.hparams.num_spks == 3: + targets.append(batch.s3_sig) + + with torch.no_grad(): + predictions, targets = self.compute_forward( + batch.mix_sig, targets, sb.Stage.TEST) + + # Compute SI-SNR + sisnr = self.compute_objectives(predictions, targets) + + # Compute SI-SNR improvement + mixture_signal = torch.stack( + [mixture] * self.hparams.num_spks, dim=-1) + mixture_signal = mixture_signal.to(targets.device) + sisnr_baseline = self.compute_objectives( + mixture_signal, targets) + sisnr_i = sisnr.mean() - sisnr_baseline.mean() + + # Compute SDR + sdr, _, _, _ = bss_eval_sources( + targets[0].t().cpu().numpy(), + predictions[0].t().detach().cpu().numpy(), + ) + + sdr_baseline, _, _, _ = bss_eval_sources( + targets[0].t().cpu().numpy(), + mixture_signal[0].t().detach().cpu().numpy(), + ) + + sdr_i = sdr.mean() - sdr_baseline.mean() + + # Saving on a csv file + row = { + 'snt_id': snt_id[0], + 'sdr': sdr.mean(), + 'sdr_i': sdr_i, + 'si-snr': -sisnr.item(), + 'si-snr_i': -sisnr_i.item(), + } + writer.writerow(row) + + # Metric Accumulation + all_sdrs.append(sdr.mean()) + all_sdrs_i.append(sdr_i.mean()) + all_sisnrs.append(-sisnr.item()) + all_sisnrs_i.append(-sisnr_i.item()) + + row = { + 'snt_id': 'avg', + 'sdr': np.array(all_sdrs).mean(), + 'sdr_i': np.array(all_sdrs_i).mean(), + 'si-snr': np.array(all_sisnrs).mean(), + 'si-snr_i': np.array(all_sisnrs_i).mean(), + } + writer.writerow(row) + + logger.info('Mean SISNR is {}'.format(np.array(all_sisnrs).mean())) + logger.info('Mean SISNRi is {}'.format(np.array(all_sisnrs_i).mean())) + logger.info('Mean SDR is {}'.format(np.array(all_sdrs).mean())) + logger.info('Mean SDRi is {}'.format(np.array(all_sdrs_i).mean())) + + def save_audio(self, snt_id, mixture, targets, predictions): + 'saves the test audio (mixture, targets, and estimated sources) on disk' + + # Create outout folder + save_path = os.path.join(self.hparams.save_folder, 
'audio_results') + if not os.path.exists(save_path): + os.mkdir(save_path) + + for ns in range(self.hparams.num_spks): + + # Estimated source + signal = predictions[0, :, ns] + signal = signal / signal.abs().max() * 0.5 + save_file = os.path.join( + save_path, 'item{}_source{}hat.wav'.format(snt_id, ns + 1)) + torchaudio.save(save_file, + signal.unsqueeze(0).cpu(), + self.hparams.sample_rate) + + # Original source + signal = targets[0, :, ns] + signal = signal / signal.abs().max() * 0.5 + save_file = os.path.join( + save_path, 'item{}_source{}.wav'.format(snt_id, ns + 1)) + torchaudio.save(save_file, + signal.unsqueeze(0).cpu(), + self.hparams.sample_rate) + + # Mixture + signal = mixture[0][0, :] + signal = signal / signal.abs().max() * 0.5 + save_file = os.path.join(save_path, 'item{}_mix.wav'.format(snt_id)) + torchaudio.save(save_file, + signal.unsqueeze(0).cpu(), self.hparams.sample_rate) diff --git a/modelscope/trainers/audio/tts_trainer.py b/modelscope/trainers/audio/tts_trainer.py index bd38bc4d..e835f24e 100644 --- a/modelscope/trainers/audio/tts_trainer.py +++ b/modelscope/trainers/audio/tts_trainer.py @@ -106,11 +106,8 @@ class KanttsTrainer(BaseTrainer): version=train_dataset_revision) logger.info(f'train dataset:{train_dataset.config_kwargs}') self.raw_dataset_path = self.load_dataset_raw_path(train_dataset) - model_dir = None - if os.path.exists(model): - model_dir = model - else: - model_dir = self.get_or_download_model_dir(model, model_revision) + + model_dir = self.get_or_download_model_dir(model, model_revision) shutil.copytree(model_dir, self.orig_model_dir) self.model_dir = self.orig_model_dir diff --git a/modelscope/trainers/base.py b/modelscope/trainers/base.py index 98f97859..665d9180 100644 --- a/modelscope/trainers/base.py +++ b/modelscope/trainers/base.py @@ -5,6 +5,7 @@ import time from abc import ABC, abstractmethod from typing import Callable, Dict, List, Optional, Tuple, Union +from modelscope.hub.check_model import check_local_model_is_latest from modelscope.hub.snapshot_download import snapshot_download from modelscope.trainers.builder import TRAINERS from modelscope.utils.config import Config @@ -40,6 +41,8 @@ class BaseTrainer(ABC): if os.path.exists(model): model_cache_dir = model if os.path.isdir( model) else os.path.dirname(model) + check_local_model_is_latest( + model_cache_dir, user_agent={Invoke.KEY: Invoke.LOCAL_TRAINER}) else: model_cache_dir = snapshot_download( model, diff --git a/modelscope/trainers/builder.py b/modelscope/trainers/builder.py index 87e99b30..387024a4 100644 --- a/modelscope/trainers/builder.py +++ b/modelscope/trainers/builder.py @@ -5,7 +5,6 @@ from modelscope.utils.constant import Tasks from modelscope.utils.registry import Registry, build_from_cfg TRAINERS = Registry('trainers') -HOOKS = Registry('hooks') def build_trainer(name: str = Trainers.default, default_args: dict = None): diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py index 32c38de2..2f682b81 100644 --- a/modelscope/trainers/cv/__init__.py +++ b/modelscope/trainers/cv/__init__.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer from .image_inpainting_trainer import ImageInpaintingTrainer from .referring_video_object_segmentation_trainer import ReferringVideoObjectSegmentationTrainer + from .image_defrcn_fewshot_detection_trainer import ImageDefrcnFewshotTrainer else: _import_structure = { @@ -20,7 +21,9 @@ else: 'movie_scene_segmentation_trainer': 
['MovieSceneSegmentationTrainer'], 'image_inpainting_trainer': ['ImageInpaintingTrainer'], 'referring_video_object_segmentation_trainer': - ['ReferringVideoObjectSegmentationTrainer'] + ['ReferringVideoObjectSegmentationTrainer'], + 'image_defrcn_fewshot_detection_trainer': + ['ImageDefrcnFewshotTrainer'] } import sys diff --git a/modelscope/trainers/cv/image_classifition_trainer.py b/modelscope/trainers/cv/image_classifition_trainer.py index 998eb9f1..f15fd5e3 100644 --- a/modelscope/trainers/cv/image_classifition_trainer.py +++ b/modelscope/trainers/cv/image_classifition_trainer.py @@ -192,19 +192,14 @@ class ImageClassifitionTrainer(BaseTrainer): from mmcv.runner import get_dist_info, init_dist from mmcls.apis import set_random_seed from mmcls.utils import collect_env + from mmcv.utils import get_logger as mmcv_get_logger import modelscope.models.cv.image_classification.backbones self._seed = seed set_random_seed(self._seed) if isinstance(model, str): - if os.path.exists(model): - self.model_dir = model if os.path.isdir( - model) else os.path.dirname(model) - else: - self.model_dir = snapshot_download( - model, - revision=model_revision, - user_agent={Invoke.KEY: Invoke.TRAINER}) + self.model_dir = self.get_or_download_model_dir( + model, model_revision=model_revision) if cfg_file is None: cfg_file = os.path.join(self.model_dir, ModelFile.CONFIGURATION) @@ -283,6 +278,7 @@ class ImageClassifitionTrainer(BaseTrainer): distributed = False # init the logger before other steps + mmcv_get_logger('modelscope') # set name of mmcv logger timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(self.work_dir, f'{timestamp}.log') logger = get_logger(log_file=log_file) @@ -315,6 +311,11 @@ class ImageClassifitionTrainer(BaseTrainer): # dataset self.train_dataset = train_dataset self.eval_dataset = eval_dataset + # set img_prefix for image data path in csv files. 
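        # data_prefix is forwarded to MmDataset below as the prefix for image
        # paths in the csv files; it defaults to '' when absent from the configuration.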
+ if cfg.dataset.get('data_prefix', None) is None: + self.data_prefix = '' + else: + self.data_prefix = cfg.dataset.data_prefix # model model = build_classifier(self.cfg.model.mm_model) @@ -356,7 +357,8 @@ class ImageClassifitionTrainer(BaseTrainer): MmDataset( self.train_dataset, pipeline=self.cfg.preprocessor.train, - classes=classes) + classes=classes, + data_prefix=self.data_prefix) ] if len(self.cfg.train.workflow) == 2: @@ -366,7 +368,10 @@ class ImageClassifitionTrainer(BaseTrainer): ) val_data_pipeline = self.cfg.preprocessor.train val_dataset = MmDataset( - self.eval_dataset, pipeline=val_data_pipeline, classes=classes) + self.eval_dataset, + pipeline=val_data_pipeline, + classes=classes, + data_prefix=self.data_prefix) datasets.append(val_dataset) # save mmcls version, config file content and class names in @@ -382,7 +387,8 @@ class ImageClassifitionTrainer(BaseTrainer): val_dataset = MmDataset( self.eval_dataset, pipeline=preprocess_transform(self.cfg.preprocessor.val), - classes=classes) + classes=classes, + data_prefix=self.data_prefix) # add an attribute for visualization convenience train_model( @@ -427,7 +433,8 @@ class ImageClassifitionTrainer(BaseTrainer): dataset = MmDataset( self.eval_dataset, pipeline=preprocess_transform(self.cfg.preprocessor.val), - classes=classes) + classes=classes, + data_prefix=self.data_prefix) # the extra round_up data will be removed during gpu/cpu collect data_loader = build_dataloader( dataset, diff --git a/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py b/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py new file mode 100644 index 00000000..04b2967a --- /dev/null +++ b/modelscope/trainers/cv/image_defrcn_fewshot_detection_trainer.py @@ -0,0 +1,316 @@ +# The implementation is adopted from er-muyue/DeFRCN +# made publicly available under the MIT License at +# https://github.com/er-muyue/DeFRCN/blob/main/defrcn/engine/defaults.py +# https://github.com/er-muyue/DeFRCN/blob/main/tools/model_surgery.py + +import os +from typing import Callable, Optional, Union + +import torch +from detectron2.engine import SimpleTrainer, hooks +from detectron2.evaluation import DatasetEvaluators, verify_results +from detectron2.utils import comm +from torch import nn + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model, TorchModel +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.logger import get_logger + + +class DefaultTrainer(SimpleTrainer): + + def __init__(self, model, cfg): + + from collections import OrderedDict + from fvcore.nn.precise_bn import get_bn_modules + from torch.nn.parallel import DistributedDataParallel + + from detectron2.data.build import build_detection_train_loader, build_detection_test_loader + from detectron2.solver.build import build_optimizer, build_lr_scheduler + from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer + from detectron2.utils.logger import setup_logger + + setup_logger() + + optimizer = build_optimizer(cfg, model) + data_loader = build_detection_train_loader(cfg) + + if comm.get_world_size() > 1: + model = DistributedDataParallel( + model, + device_ids=[comm.get_local_rank()], + broadcast_buffers=False, + find_unused_parameters=True) + super().__init__(model, data_loader, optimizer) + + self.scheduler = build_lr_scheduler(cfg, optimizer) + + self.checkpointer = DetectionCheckpointer( + 
model, + cfg.OUTPUT_DIR, + optimizer=optimizer, + scheduler=self.scheduler, + ) + self.start_iter = 0 + self.max_iter = cfg.SOLVER.MAX_ITER + self.cfg = cfg + + self.register_hooks(self.build_hooks()) + + def resume_or_load(self, resume=True): + # The checkpoint stores the training iteration that just finished, thus we start + # at the next iteration (or iter zero if there's no checkpoint). + self.start_iter = ( + self.checkpointer.resume_or_load( + self.cfg.MODEL.WEIGHTS, resume=resume).get('iteration', -1) + + 1) + + def build_hooks(self): + """ + Build a list of default hooks, including timing, evaluation, + checkpointing, lr scheduling, precise BN, writing events. + + Returns: + list[HookBase]: + """ + cfg = self.cfg.clone() + cfg.defrost() + cfg.DATALOADER.NUM_WORKERS = 0 + + ret = [ + hooks.IterationTimer(), + hooks.LRScheduler(self.optimizer, self.scheduler), + hooks.PreciseBN( + cfg.TEST.EVAL_PERIOD, + self.model, + build_detection_train_loader(cfg), + cfg.TEST.PRECISE_BN.NUM_ITER, + ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) + else None, + ] + + if comm.is_main_process(): + ret.append( + hooks.PeriodicCheckpointer(self.checkpointer, + cfg.SOLVER.CHECKPOINT_PERIOD)) + + def test_and_save_results(): + self._last_eval_results = self.test(self.cfg, self.model) + return self._last_eval_results + + ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) + + if comm.is_main_process(): + ret.append(hooks.PeriodicWriter(self.build_writers(), period=20)) + return ret + + def build_writers(self): + from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter + + return [ + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, 'metrics.json')), + TensorboardXWriter(self.cfg.OUTPUT_DIR), + ] + + def train(self): + """ + Run training. + + Returns: + OrderedDict of results, if evaluation is enabled. Otherwise None. 
+ """ + super().train(self.start_iter, self.max_iter) + if hasattr(self, '_last_eval_results') and comm.is_main_process(): + verify_results(self.cfg, self._last_eval_results) + return self._last_eval_results + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + from detectron2.data import MetadataCatalog + + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, 'inference') + evaluator_list = [] + evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type + if evaluator_type == 'coco': + from detectron2.evaluation import COCOEvaluator + evaluator_list.append( + COCOEvaluator(dataset_name, True, output_folder)) + if evaluator_type == 'pascal_voc': + from detectron2.evaluation import PascalVOCDetectionEvaluator + return PascalVOCDetectionEvaluator(dataset_name) + if len(evaluator_list) == 0: + raise NotImplementedError( + 'no Evaluator for the dataset {} with the type {}'.format( + dataset_name, evaluator_type)) + if len(evaluator_list) == 1: + return evaluator_list[0] + return DatasetEvaluators(evaluator_list) + + @classmethod + def test(cls, cfg, model, evaluators=None): + from detectron2.engine.defaults import DefaultTrainer as _DefaultTrainer + _DefaultTrainer.build_evaluator = cls.build_evaluator + + return _DefaultTrainer.test(cfg, model, evaluators) + + +@TRAINERS.register_module(module_name=Trainers.image_fewshot_detection) +class ImageDefrcnFewshotTrainer(BaseTrainer): + + def __init__(self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 0, + cfg_modify_fn: Optional[Callable] = None, + **kwargs): + + if isinstance(model, str): + self.model_dir = self.get_or_download_model_dir( + model, model_revision) + if cfg_file is None: + cfg_file = os.path.join(self.model_dir, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' 
+ self.model_dir = os.path.dirname(cfg_file) + + super().__init__(cfg_file, arg_parse_fn) + + if cfg_modify_fn is not None: + self.cfg = cfg_modify_fn(self.cfg) + + self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) + + if isinstance(model, (TorchModel, nn.Module)): + self.model = model + else: + self.model = self.build_model(**kwargs) + + self.model_cfg = self.model.get_model_cfg() + + if 'datasets_train' in kwargs: + self.model_cfg.merge_from_list( + ['DATASETS.TRAIN', kwargs['datasets_train']]) + if 'datasets_test' in kwargs: + self.model_cfg.merge_from_list( + ['DATASETS.TEST', kwargs['datasets_test']]) + if 'work_dir' in kwargs: + self.model_cfg.merge_from_list(['OUTPUT_DIR', kwargs['work_dir']]) + + if not os.path.exists(self.model_cfg.OUTPUT_DIR): + os.makedirs(self.model_cfg.OUTPUT_DIR) + + self.model_cfg.freeze() + + self.data_dir = kwargs.get('data_dir', None) + self.data_type = kwargs.get('data_type', 'pascal_voc') + + self.register_data(self.data_type, self.data_dir) + + self.trainer = DefaultTrainer(self.model, self.model_cfg) + + def train(self, *args, **kwargs): + self.trainer.resume_or_load() + self.trainer.train() + + def evaluate(self, checkpoint_path: str, *args, **kwargs): + from detectron2.checkpoint.detection_checkpoint import DetectionCheckpointer + + DetectionCheckpointer( + self.model, + save_dir=self.model_cfg.OUTPUT_DIR).resume_or_load(checkpoint_path) + metric_values = DefaultTrainer.test(self.model_cfg, self.model) + return metric_values + + def build_model(self, *args, **kwargs) -> Union[nn.Module, TorchModel]: + model = Model.from_pretrained(self.model_dir, **kwargs) + if not isinstance(model, nn.Module) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + @classmethod + def register_data(cls, data_type='pascal_voc', data_dir=None): + + if data_type == 'pascal_voc': + from modelscope.models.cv.image_defrcn_fewshot.utils.voc_register import register_all_voc + if data_dir: + register_all_voc(data_dir) + else: + register_all_voc() + else: + raise NotImplementedError( + 'no {} dataset was registered'.format(data_type)) + + @classmethod + def model_surgery(cls, + src_path, + save_dir, + data_type='pascal_voc', + method='remove'): + + assert method in ['remove', + 'randinit'], '{} not implemented'.format(method) + + def _surgery(param_name, is_weight, tar_size, ckpt): + weight_name = param_name + ('.weight' if is_weight else '.bias') + pretrained_weight = ckpt['model'][weight_name] + prev_cls = pretrained_weight.size(0) + if 'cls_score' in param_name: + prev_cls -= 1 + if is_weight: + feat_size = pretrained_weight.size(1) + new_weight = torch.rand((tar_size, feat_size)) + torch.nn.init.normal_(new_weight, 0, 0.01) + else: + new_weight = torch.zeros(tar_size) + + new_weight[:prev_cls] = pretrained_weight[:prev_cls] + if 'cls_score' in param_name: + new_weight[-1] = pretrained_weight[-1] # bg class + ckpt['model'][weight_name] = new_weight + + if data_type == 'pascal_voc': + TAR_SIZE = 20 + params_name = [ + 'model.roi_heads.box_predictor.cls_score', + 'model.roi_heads.box_predictor.bbox_pred' + ] + + save_name = 'model_reset_' + ('remove' if method == 'remove' else + 'surgery') + '.pth' + save_path = os.path.join(save_dir, save_name) + os.makedirs(save_dir, exist_ok=True) + + ckpt = torch.load(src_path) + + if 'scheduler' in ckpt: + del ckpt['scheduler'] + if 'optimizer' in ckpt: + del ckpt['optimizer'] + if 'iteration' in ckpt: + ckpt['iteration'] = 0 + + if method == 'remove': + for param_name 
in params_name: + del ckpt['model'][param_name + '.weight'] + if param_name + '.bias' in ckpt['model']: + del ckpt['model'][param_name + '.bias'] + else: + tar_sizes = [TAR_SIZE + 1, TAR_SIZE * 4] + for idx, (param_name, + tar_size) in enumerate(zip(params_name, tar_sizes)): + _surgery(param_name, True, tar_size, ckpt) + _surgery(param_name, False, tar_size, ckpt) + + torch.save(ckpt, save_path) + else: + NotImplementedError( + '{} dataset does not supported'.format(data_type)) diff --git a/modelscope/trainers/cv/image_detection_damoyolo_trainer.py b/modelscope/trainers/cv/image_detection_damoyolo_trainer.py new file mode 100644 index 00000000..e9c4cc20 --- /dev/null +++ b/modelscope/trainers/cv/image_detection_damoyolo_trainer.py @@ -0,0 +1,547 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import datetime +import math +import os +import os.path as osp +import time +from typing import Callable, Dict, Optional + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +from easydict import EasyDict as easydict +from torch.nn.parallel import DistributedDataParallel as DDP + +from modelscope.metainfo import Trainers +from modelscope.models.cv.tinynas_detection.damo.apis.detector_evaluater import \ + Evaluater +from modelscope.models.cv.tinynas_detection.damo.apis.detector_inference import \ + inference +from modelscope.models.cv.tinynas_detection.damo.base_models.losses.distill_loss import \ + FeatureLoss +from modelscope.models.cv.tinynas_detection.damo.detectors.detector import ( + build_ddp_model, build_local_model) +from modelscope.models.cv.tinynas_detection.damo.utils import ( + cosine_scheduler, ema_model) +from modelscope.msdatasets.task_datasets.damoyolo import (build_dataloader, + build_dataset) +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.metric import MeterBuffer +from modelscope.utils.torch_utils import get_rank, synchronize + + +@TRAINERS.register_module(module_name=Trainers.tinynas_damoyolo) +class ImageDetectionDamoyoloTrainer(BaseTrainer): + + def __init__(self, + model: str = None, + cfg_file: str = None, + load_pretrain: bool = True, + cache_path: str = None, + *args, + **kwargs): + """ High-level finetune api for Damoyolo. + + Args: + model: Model id of modelscope models. + cfg_file: Path to configuration file. + load_pretrain: Whether load pretrain model for finetune. + if False, means training from scratch. + cache_path: cache path of model files. 
+ """ + if model is not None: + self.cache_path = self.get_or_download_model_dir(model) + if cfg_file is None: + self.cfg_file = os.path.join(self.cache_path, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None and cache_path is not None, \ + 'cfg_file and cache_path is needed, if model is not provided' + + if cfg_file is not None: + self.cfg_file = cfg_file + if cache_path is not None: + self.cache_path = cache_path + super().__init__(self.cfg_file) + cfg = self.cfg + cfg.model.backbone.structure_file = os.path.join( + self.cache_path, cfg.model.backbone.structure_file) + if load_pretrain: + if 'pretrain_model' in kwargs: + cfg.train.finetune_path = kwargs['pretrain_model'] + else: + cfg.train.finetune_path = os.path.join(self.cache_path, + self.cfg.model.weights) + + if 'framework' in self.cfg: + cfg = self._config_transform(cfg) + + if 'gpu_ids' in kwargs: + cfg.train.gpu_ids = kwargs['gpu_ids'] + if 'batch_size' in kwargs: + cfg.train.batch_size = kwargs['batch_size'] + if 'max_epochs' in kwargs: + cfg.train.total_epochs = kwargs['max_epochs'] + if 'train_image_dir' in kwargs: + cfg.dataset.train_image_dir = kwargs['train_image_dir'] + if 'val_image_dir' in kwargs: + cfg.dataset.val_image_dir = kwargs['val_image_dir'] + if 'train_ann' in kwargs: + cfg.dataset.train_ann = kwargs['train_ann'] + if 'val_ann' in kwargs: + cfg.dataset.val_ann = kwargs['val_ann'] + if 'num_classes' in kwargs: + cfg.model.head.num_classes = kwargs['num_classes'] + if 'base_lr_per_img' in kwargs: + cfg.train.base_lr_per_img = kwargs['base_lr_per_img'] + + self.gpu_ids = cfg.train.gpu_ids + self.world_size = len(self.gpu_ids) + + self.cfg = cfg + + def _train(self, local_rank, world_size, cfg): + torch.cuda.set_device(local_rank) + dist.init_process_group( + 'nccl', + init_method='tcp://127.0.0.1:12344', + rank=local_rank, + world_size=world_size) + trainer = DamoyoloTrainer(cfg, None, None) + trainer.train(local_rank) + + def train(self): + if len(self.cfg.train.gpu_ids) > 1: + mp.spawn( + self._train, + nprocs=self.world_size, + args=(self.world_size, self.cfg), + join=True) + else: + trainer = DamoyoloTrainer(self.cfg, None, None) + trainer.train(local_rank=0) + + def evaluate(self, + checkpoint_path: str = None, + *args, + **kwargs) -> Dict[str, float]: + if checkpoint_path is not None: + self.cfg.test.checkpoint_path = checkpoint_path + evaluater = Evaluater(self.cfg) + evaluater.evaluate() + + def _config_transform(self, config): + new_config = easydict({}) + new_config.miscs = config.train.miscs + new_config.miscs.num_workers = config.train.dataloader.workers_per_gpu + new_config.miscs.output_dir = config.train.work_dir + new_config.model = config.model + new_config.dataset = config.dataset + new_config.train = config.train + new_config.test = config.evaluation + + new_config.train.augment = config.preprocessor.train + new_config.test.augment = config.preprocessor.evaluation + + new_config.train.warmup_start_lr = config.train.lr_scheduler.warmup_start_lr + new_config.train.min_lr_ratio = config.train.lr_scheduler.min_lr_ratio + new_config.train.warmup_epochs = config.train.lr_scheduler.warmup_epochs + + new_config.train.batch_size = len( + config.train.gpu_ids) * config.train.dataloader.batch_size_per_gpu + new_config.train.base_lr_per_img = config.train.optimizer.lr / new_config.train.batch_size + new_config.train.momentum = config.train.optimizer.momentum + new_config.train.weight_decay = config.train.optimizer.weight_decay + new_config.train.total_epochs = config.train.max_epochs + + del 
new_config['train']['miscs'] + del new_config['train']['lr_scheduler'] + del new_config['train']['optimizer'] + del new_config['train']['dataloader'] + + return new_config + + +class DamoyoloTrainer: + + def __init__(self, cfg, args, tea_cfg=None): + self.cfg = cfg + self.tea_cfg = tea_cfg + self.args = args + self.output_dir = cfg.miscs.output_dir + self.exp_name = cfg.miscs.exp_name + self.device = 'cuda' + + if len(self.cfg.train.gpu_ids) > 1: + self.distributed = True + else: + self.distributed = False + # metric record + self.meter = MeterBuffer(window_size=cfg.miscs.print_interval_iters) + self.file_name = os.path.join(cfg.miscs.output_dir, cfg.miscs.exp_name) + + # setup logger + if get_rank() == 0: + os.makedirs(self.file_name, exist_ok=True) + self.logger = get_logger(os.path.join(self.file_name, 'train_log.txt')) + + # logger + self.logger.info('args info: {}'.format(self.args)) + self.logger.info('cfg value:\n{}'.format(self.cfg)) + + def get_data_loader(self, cfg, distributed=False): + + train_dataset = build_dataset( + cfg, + cfg.dataset.train_image_dir, + cfg.dataset.train_ann, + is_train=True, + mosaic_mixup=cfg.train.augment.mosaic_mixup) + val_dataset = build_dataset( + cfg, + cfg.dataset.val_image_dir, + cfg.dataset.val_ann, + is_train=False) + + iters_per_epoch = math.ceil( + len(train_dataset[0]) + / cfg.train.batch_size) # train_dataset is a list, however, + + train_loader = build_dataloader( + train_dataset, + cfg.train.augment, + batch_size=cfg.train.batch_size, + start_epoch=self.start_epoch, + total_epochs=cfg.train.total_epochs, + num_workers=cfg.miscs.num_workers, + is_train=True, + size_div=32, + distributed=distributed) + + val_loader = build_dataloader( + val_dataset, + cfg.test.augment, + batch_size=cfg.test.batch_size, + num_workers=cfg.miscs.num_workers, + is_train=False, + size_div=32, + distributed=distributed) + + return train_loader, val_loader, iters_per_epoch + + def setup_iters(self, iters_per_epoch, start_epoch, total_epochs, + warmup_epochs, no_aug_epochs, eval_interval_epochs, + ckpt_interval_epochs, print_interval_iters): + self.iters_per_epoch = iters_per_epoch + self.total_epochs = total_epochs + self.iters_per_epoch = iters_per_epoch + self.start_iter = start_epoch * iters_per_epoch + self.total_iters = total_epochs * iters_per_epoch + self.warmup_iters = warmup_epochs * iters_per_epoch + self.no_aug_iters = no_aug_epochs * iters_per_epoch + self.no_aug = self.start_iter >= self.total_iters - self.no_aug_iters + self.eval_interval_iters = eval_interval_epochs * iters_per_epoch + self.ckpt_interval_iters = ckpt_interval_epochs * iters_per_epoch + self.print_interval_iters = print_interval_iters + + def build_optimizer(self, momentum, weight_decay): + + bn_group, weight_group, bias_group = [], [], [] + + for k, v in self.model.named_modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + bias_group.append(v.bias) + if isinstance(v, nn.BatchNorm2d) or 'bn' in k: + bn_group.append(v.weight) + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + weight_group.append(v.weight) + + if self.distill: + for k, v in self.feature_loss.named_modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + bias_group.append(v.bias) + if isinstance(v, nn.BatchNorm2d) or 'bn' in k: + bn_group.append(v.weight) + elif hasattr(v, 'weight') and isinstance( + v.weight, nn.Parameter): + weight_group.append(v.weight) + + optimizer = torch.optim.SGD( + bn_group, + lr=1e-3, # only used to init optimizer, + # and will be 
overwrited + momentum=momentum, + nesterov=True) + optimizer.add_param_group({ + 'params': weight_group, + 'weight_decay': weight_decay + }) + optimizer.add_param_group({'params': bias_group}) + self.optimizer = optimizer + + return self.optimizer + + def train(self, local_rank): + # build model + self.model = build_local_model(self.cfg, self.device) + if self.distributed: + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + + if self.tea_cfg is not None: + self.distill = True + self.grad_clip = 30 + self.tea_model = build_local_model(self.tea_cfg, self.device) + self.tea_model.eval() + tea_ckpt = torch.load(args.tea_ckpt, map_location=self.device) + if 'model' in tea_ckpt: + self.tea_model.load_state_dict(tea_ckpt['model'], strict=True) + elif 'state_dict' in tea_ckpt: + self.tea_model.load_state_dict(tea_ckpt['model'], strict=True) + self.feature_loss = FeatureLoss( + self.model.neck.out_channels, + self.tea_model.neck.out_channels, + distiller='cwd').to(self.device) + else: + self.distill = False + self.grad_clip = None + + self.optimizer = self.build_optimizer(self.cfg.train.momentum, + self.cfg.train.weight_decay) + # resume model + if self.cfg.train.finetune_path is not None: + self.logger.info(f'finetune from {self.cfg.train.finetune_path}') + self.model.load_pretrain_detector(self.cfg.train.finetune_path) + self.epoch = 0 + self.start_epoch = 0 + elif self.cfg.train.resume_path is not None: + resume_epoch = self.resume_model( + self.cfg.train.resume_path, need_optimizer=True) + self.epoch = resume_epoch + self.start_epoch = resume_epoch + self.logger.info('Resume Training from Epoch: {}'.format( + self.epoch)) + else: + self.epoch = 0 + self.start_epoch = 0 + self.logger.info('Start Training...') + + if self.cfg.train.ema: + self.logger.info( + 'Enable ema model! Ema model will be evaluated and saved.') + self.ema_model = ema_model(self.model, self.cfg.train.ema_momentum) + else: + self.ema_model = None + + # dataloader + self.train_loader, self.val_loader, iters = self.get_data_loader( + self.cfg, self.distributed) + + # setup iters according epochs and iters_per_epoch + self.setup_iters(iters, self.start_epoch, self.cfg.train.total_epochs, + self.cfg.train.warmup_epochs, + self.cfg.train.no_aug_epochs, + self.cfg.miscs.eval_interval_epochs, + self.cfg.miscs.ckpt_interval_epochs, + self.cfg.miscs.print_interval_iters) + + self.lr_scheduler = cosine_scheduler( + self.cfg.train.base_lr_per_img, self.cfg.train.batch_size, + self.cfg.train.min_lr_ratio, self.total_iters, self.no_aug_iters, + self.warmup_iters, self.cfg.train.warmup_start_lr) + + self.mosaic_mixup = 'mosaic_mixup' in self.cfg.train.augment + + # distributed model init + if self.distributed: + self.model = build_ddp_model(self.model, local_rank) + else: + self.model = self.model.to('cuda') + + self.logger.info('Training start...') + + # ----------- start training ------------------------- # + self.model.train() + iter_start_time = time.time() + iter_end_time = time.time() + for data_iter, (inps, targets, ids) in enumerate(self.train_loader): + cur_iter = self.start_iter + data_iter + + lr = self.lr_scheduler.get_lr(cur_iter) + for param_group in self.optimizer.param_groups: + param_group['lr'] = lr + + inps = inps.to(self.device) # ImageList: tensors, img_size + targets = [target.to(self.device) + for target in targets] # BoxList: bbox, num_boxes ... 
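+            # Editor's annotation (not part of the original patch): each iteration
+            # below runs a forward pass, adds an optional distillation loss when a
+            # teacher config is provided, then backpropagates, clips gradients when
+            # grad_clip is set, steps SGD and updates the EMA model when enabled.
+            # The distillation weight follows the cosine ramp used in the code,
+            #     w = ((1 - cos(pi * cur_iter / len(train_loader))) / 2) * (0.1 - 1) + 1,
+            # which decays from 1.0 towards 0.1 over the length of the train loader.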
+ + model_start_time = time.time() + + if self.distill: + outputs, fpn_outs = self.model(inps, targets, stu=True) + loss = outputs['total_loss'] + with torch.no_grad(): + fpn_outs_tea = self.tea_model(inps, targets, tea=True) + distill_weight = ( + (1 - math.cos(cur_iter * math.pi / len(self.train_loader))) + / 2) * (0.1 - 1) + 1 + + distill_loss = distill_weight * self.feature_loss( + fpn_outs, fpn_outs_tea) + loss += distill_loss + outputs['distill_loss'] = distill_loss + + else: + + outputs = self.model(inps, targets) + loss = outputs['total_loss'] + + self.optimizer.zero_grad() + loss.backward() + if self.grad_clip is not None: + nn.utils.clip_grad_norm_( + self.model.parameters(), + max_norm=self.grad_clip, + norm_type=2) # for stable training + + self.optimizer.step() + + if self.ema_model is not None: + self.ema_model.update(cur_iter, self.model) + + iter_start_time = iter_end_time + iter_end_time = time.time() + + outputs_array = {_name: _v.item() for _name, _v in outputs.items()} + self.meter.update( + iter_time=iter_end_time - iter_start_time, + model_time=iter_end_time - model_start_time, + lr=lr, + **outputs_array, + ) + + if cur_iter + 1 > self.total_iters - self.no_aug_iters: + if self.mosaic_mixup: + self.logger.info('--->turn OFF mosaic aug now!') + self.train_loader.batch_sampler.set_mosaic(False) + self.eval_interval_iters = self.iters_per_epoch + self.ckpt_interval_iters = self.iters_per_epoch + self.mosaic_mixup = False + + # log needed information + if (cur_iter + 1) % self.print_interval_iters == 0: + left_iters = self.total_iters - (cur_iter + 1) + eta_seconds = self.meter['iter_time'].global_avg * left_iters + eta_str = 'ETA: {}'.format( + datetime.timedelta(seconds=int(eta_seconds))) + + progress_str = 'epoch: {}/{}, iter: {}/{}'.format( + self.epoch + 1, self.total_epochs, + (cur_iter + 1) % self.iters_per_epoch, + self.iters_per_epoch) + loss_meter = self.meter.get_filtered_meter('loss') + loss_str = ', '.join([ + '{}: {:.1f}'.format(k, v.avg) + for k, v in loss_meter.items() + ]) + + time_meter = self.meter.get_filtered_meter('time') + time_str = ', '.join([ + '{}: {:.3f}s'.format(k, v.avg) + for k, v in time_meter.items() + ]) + + self.logger.info('{}, {}, {}, lr: {:.3e}'.format( + progress_str, + time_str, + loss_str, + self.meter['lr'].latest, + ) + (', size: ({:d}, {:d}), {}'.format( + inps.tensors.shape[2], inps.tensors.shape[3], eta_str))) + self.meter.clear_meters() + + if (cur_iter + 1) % self.ckpt_interval_iters == 0: + self.save_ckpt( + 'epoch_%d_ckpt.pth' % (self.epoch + 1), + local_rank=local_rank) + + if (cur_iter + 1) % self.eval_interval_iters == 0: + time.sleep(0.003) + self.evaluate(local_rank, self.cfg.dataset.val_ann) + self.model.train() + synchronize() + + if (cur_iter + 1) % self.iters_per_epoch == 0: + self.epoch = self.epoch + 1 + + self.save_ckpt(ckpt_name='latest_ckpt.pth', local_rank=local_rank) + + def save_ckpt(self, ckpt_name, local_rank, update_best_ckpt=False): + if local_rank == 0: + if self.ema_model is not None: + save_model = self.ema_model.model + else: + if isinstance(self.model, DDP): + save_model = self.model.module + else: + save_model = self.model + ckpt_name = os.path.join(self.file_name, ckpt_name) + self.logger.info('Save weights to {}'.format(ckpt_name)) + meta = {'epoch': self.epoch + 1} + if self.distill: + meta.update(feature_loss=self.feature_loss.state_dict()) + save_checkpoint( + model=save_model, + filename=ckpt_name, + optimizer=self.optimizer, + meta=meta, + with_meta=True) + + def resume_model(self, 
resume_path, load_optimizer=False): + ckpt_file_path = resume_path + ckpt = torch.load(ckpt_file_path, map_location=self.device) + if 'state_dict' in ckpt: + self.model.load_state_dict(ckpt['state_dict']) + elif 'model' in ckpt: + self.model.load_state_dict(ckpt['model']) + + if load_optimizer: + if 'optimizer' in ckpt: + self.optimizer.load_state_dict(ckpt['optimizer']) + if self.distill: + if 'meta' in ckpt: + self.feature_loss.load_state_dict( + ckpt['meta']['feature_loss']) + elif 'feature_loss' in ckpt: + self.feature_loss.load_state_dict(ckpt['feature_loss']) + if 'meta' in ckpt: + resume_epoch = ckpt['meta']['epoch'] + elif 'epoch' in ckpt: + resume_epoch = ckpt['epoch'] + return resume_epoch + + def evaluate(self, local_rank, val_ann): + if self.ema_model is not None: + evalmodel = self.ema_model.model + else: + evalmodel = self.model + if isinstance(evalmodel, DDP): + evalmodel = evalmodel.module + + output_folder = os.path.join(self.output_dir, self.exp_name, + 'inference') + if local_rank == 0: + os.makedirs(output_folder, exist_ok=True) + + for data_loader_val in self.val_loader: + inference( + evalmodel, + data_loader_val, + device=self.device, + output_folder=output_folder, + ) diff --git a/modelscope/trainers/easycv/utils/metric.py b/modelscope/trainers/easycv/utils/metric.py index 53937b67..d952ec3e 100644 --- a/modelscope/trainers/easycv/utils/metric.py +++ b/modelscope/trainers/easycv/utils/metric.py @@ -50,3 +50,13 @@ class EasyCVMetric(Metric): metric_values = self.trainer.eval_dataset.evaluate( results, self.evaluators) return metric_values + + def merge(self, other: 'EasyCVMetric'): + self.preds.extend(other.preds) + + def __getstate__(self): + return self.preds + + def __setstate__(self, state): + self.__init__() + self.preds = state diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index 4d70cf70..e76f46e4 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -394,7 +394,8 @@ class BestCkptSaverHook(CheckpointHook): def remove_obsolete_checkpoints(self): def extract_metric_from_filename(name1): - metric1 = float(name1.split(self.metric_key)[1].split('.')[0]) + metric1 = float('.'.join( + name1.split(self.metric_key)[1].split('.')[:-1])) if self.rule == 'max': return -metric1 else: diff --git a/modelscope/trainers/hooks/compression/utils.py b/modelscope/trainers/hooks/compression/utils.py index 59418201..cb1f148d 100644 --- a/modelscope/trainers/hooks/compression/utils.py +++ b/modelscope/trainers/hooks/compression/utils.py @@ -132,7 +132,7 @@ def convert_sparse_network( ): compress_module = [nn.Linear] try: - from megatron import mpu + from megatron_util import mpu compress_module.extend( [mpu.RowParallelLinear, mpu.ColumnParallelLinear]) except ImportError: diff --git a/modelscope/trainers/hooks/deepspeed_hook.py b/modelscope/trainers/hooks/deepspeed_hook.py index 60f03066..d27f01ca 100644 --- a/modelscope/trainers/hooks/deepspeed_hook.py +++ b/modelscope/trainers/hooks/deepspeed_hook.py @@ -3,7 +3,7 @@ import os from types import MethodType import deepspeed -from megatron import mpu +from megatron_util import mpu from modelscope.metainfo import Hooks from modelscope.trainers.hooks import (BestCkptSaverHook, CheckpointHook, diff --git a/modelscope/trainers/multi_modal/mgeo_ranking_trainer.py b/modelscope/trainers/multi_modal/mgeo_ranking_trainer.py new file mode 100644 index 00000000..6079a8a8 --- /dev/null +++ 
b/modelscope/trainers/multi_modal/mgeo_ranking_trainer.py @@ -0,0 +1,323 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import time +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model, TorchModel +from modelscope.models.nlp import BertForTextRanking +from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors.base import Preprocessor +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer +from modelscope.utils.constant import DEFAULT_MODEL_REVISION +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@dataclass +class GroupCollator(): + """ + Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to + List[qry], List[psg] and pass batch separately to the actual collator. + Abstract out data detail for the model. + """ + + def get_gis(self, gis, inps): + gis_input_ids, gis_token_type_ids, gis_rel_type_ids = ([], [], []) + gis_absolute_position_ids, gis_relative_position_ids = ([], []) + gis_prov_ids, gis_city_ids, gis_dist_ids = ([], [], []) + china_version = False + for doc in inps: + if len(doc) == 0: + continue + if len(doc[0]) == 6: + for geom_id, geom_type, rel_type, absolute_position, relative_position, lxly in doc: + gis_input_ids.append(geom_id) + gis_token_type_ids.append(geom_type) + gis_rel_type_ids.append(rel_type) + gis_absolute_position_ids.append(absolute_position) + gis_relative_position_ids.append(relative_position) + elif len(doc[0]) == 9: + china_version = True + for geom_id, geom_type, rel_type, absolute_position, relative_position, \ + prov_id, city_id, dist_id, lxly in doc: + gis_input_ids.append(geom_id) + gis_token_type_ids.append(geom_type) + gis_rel_type_ids.append(rel_type) + gis_absolute_position_ids.append(absolute_position) + gis_relative_position_ids.append(relative_position) + gis_prov_ids.append(prov_id) + gis_city_ids.append(city_id) + gis_dist_ids.append(dist_id) + + gis.update(gis_input_ids, gis_token_type_ids, gis_rel_type_ids, + gis_absolute_position_ids, gis_relative_position_ids, + gis_prov_ids, gis_city_ids, gis_dist_ids, china_version) + return gis + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + if isinstance(features[0], list): + features = sum(features, []) + keys = features[0].keys() + batch = {k: list() for k in keys} + for ele in features: + for k, v in ele.items(): + batch[k].append(v) + merged_batch = {} + gis_list = [] + gis_tp = [] + for k in batch: + if 'sentence1_gis' == k: + gis = batch['gis1'][0] + gis = self.get_gis(gis, batch['sentence1_gis']) + if gis.prov_ids is not None: + gis_list.append({ + 'input_ids': gis.input_ids, + 'attention_mask': gis.attention_mask, + 'token_type_ids': gis.token_type_ids, + 'rel_type_ids': gis.rel_type_ids, + 'absolute_position_ids': gis.absolute_position_ids, + 'relative_position_ids': gis.relative_position_ids, + 'prov_ids': gis.prov_ids, + 'city_ids': gis.city_ids, + 'dist_ids': gis.dist_ids + }) + else: + gis_list.append({ + 'input_ids': + gis.input_ids, + 'attention_mask': + gis.attention_mask, + 'token_type_ids': + gis.token_type_ids, + 'rel_type_ids': + gis.rel_type_ids, + 'absolute_position_ids': + gis.absolute_position_ids, + 'relative_position_ids': + 
gis.relative_position_ids + }) + gis_tp.append(torch.LongTensor([1]).to(gis.input_ids.device)) + elif 'sentence2_gis' == k: + gis = batch['gis2'][0] + gis = self.get_gis(gis, batch['sentence2_gis']) + if gis.prov_ids is not None: + gis_list.append({ + 'input_ids': gis.input_ids, + 'attention_mask': gis.attention_mask, + 'token_type_ids': gis.token_type_ids, + 'rel_type_ids': gis.rel_type_ids, + 'absolute_position_ids': gis.absolute_position_ids, + 'relative_position_ids': gis.relative_position_ids, + 'prov_ids': gis.prov_ids, + 'city_ids': gis.city_ids, + 'dist_ids': gis.dist_ids + }) + else: + gis_list.append({ + 'input_ids': + gis.input_ids, + 'attention_mask': + gis.attention_mask, + 'token_type_ids': + gis.token_type_ids, + 'rel_type_ids': + gis.rel_type_ids, + 'absolute_position_ids': + gis.absolute_position_ids, + 'relative_position_ids': + gis.relative_position_ids + }) + gis_tp.append(torch.LongTensor([0]).to(gis.input_ids.device)) + elif 'qid' in k or 'labels' in k: + merged_batch[k] = torch.cat(batch[k], dim=0) + elif not k.startswith('gis'): + k_t = [it.t() for it in batch[k]] + pad = torch.nn.utils.rnn.pad_sequence(k_t) + if len(pad.size()) <= 2: + merged_batch[k] = pad.t() + else: + l, b1, b2 = pad.size() + merged_batch[k] = pad.view(l, b1 * b2).t() + if len(gis_list) > 0: + merged_batch['gis_list'] = gis_list + if len(gis_tp) > 0: + merged_batch['gis_tp'] = gis_tp + return merged_batch + + +@TRAINERS.register_module(module_name=Trainers.mgeo_ranking_trainer) +class MGeoRankingTrainer(NlpEpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Callable] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Preprocessor] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + **kwargs): + + if data_collator is None: + data_collator = GroupCollator() + + super().__init__( + model=model, + cfg_file=cfg_file, + cfg_modify_fn=cfg_modify_fn, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + preprocessor=preprocessor, + optimizers=optimizers, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + model_revision=model_revision, + **kwargs) + + def compute_mrr(self, result, k=10): + mrr = 0 + for res in result.values(): + sorted_res = sorted(res, key=lambda x: x[0], reverse=True) + ar = 0 + for index, ele in enumerate(sorted_res[:k]): + if str(ele[1]) == '1': + ar = 1.0 / (index + 1) + break + mrr += ar + return mrr / len(result) + + def compute_ndcg(self, result, k=10): + ndcg = 0 + from sklearn.metrics import ndcg_score + for res in result.values(): + sorted_res = sorted(res, key=lambda x: x[0], reverse=True) + labels = np.array([[ele[1] for ele in sorted_res]]) + scores = np.array([[ele[0] for ele in sorted_res]]) + ndcg += float(ndcg_score(labels, scores, k=k)) + ndcg = ndcg / len(result) + return ndcg + + def to_device(self, val, device): + if isinstance(val, torch.Tensor): + return val.to(device) + elif isinstance(val, list): + return [self.to_device(item, device) for item in val] + elif isinstance(val, dict): + new_val = {} + for key in val: + new_val[key] = self.to_device(val[key], device) + return new_val + print('can not convert to device') + raise Exception('can
not convert to device') + + def evaluate(self, + checkpoint_path: Optional[str] = None, + *args, + **kwargs) -> Dict[str, float]: + """evaluate a dataset + + evaluate a dataset via a specific model from the `checkpoint_path` path, + if the `checkpoint_path` does not exist, read from the config file. + + Args: + checkpoint_path (Optional[str], optional): the model path. Defaults + to None. + + Returns: + Dict[str, float]: the results about the evaluation Example: + {"accuracy": 0.5091743119266054, "f1": 0.673780487804878} + """ + # get the raw online dataset + self.eval_dataloader = self._build_dataloader_with_dataset( + self.eval_dataset, + **self.cfg.evaluation.get('dataloader', {}), + collate_fn=self.eval_data_collator) + # generate a standard dataloader + # generate a model + if checkpoint_path is not None: + model = BertForTextRanking.from_pretrained(checkpoint_path) + else: + model = self.model + + # copy from easynlp (start) + model.eval() + total_samples = 0 + + logits_list = list() + label_list = list() + qid_list = list() + + total_spent_time = 0.0 + device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + model.to(device) + for _step, batch in enumerate(tqdm(self.eval_dataloader)): + try: + batch = self.to_device(batch, device) + except RuntimeError: + batch = {key: val for key, val in batch.items()} + + infer_start_time = time.time() + with torch.no_grad(): + label_ids = batch.pop('labels').detach().cpu().numpy() + qids = batch.pop('qid').detach().cpu().numpy() + outputs = model(**batch) + infer_end_time = time.time() + total_spent_time += infer_end_time - infer_start_time + total_samples += self.eval_dataloader.batch_size + + def sigmoid(logits): + return np.exp(logits) / (1 + np.exp(logits)) + + logits = outputs['logits'].squeeze(-1).detach().cpu().numpy() + logits = sigmoid(logits).tolist() + + label_list.extend(label_ids) + logits_list.extend(logits) + qid_list.extend(qids) + + logger.info('Inference time = {:.2f}s, [{:.4f} ms / sample] '.format( + total_spent_time, total_spent_time * 1000 / total_samples)) + + rank_result = {} + for qid, score, label in zip(qid_list, logits_list, label_list): + if qid not in rank_result: + rank_result[qid] = [] + rank_result[qid].append((score, label)) + + for qid in rank_result: + rank_result[qid] = sorted(rank_result[qid], key=lambda x: x[0]) + + eval_outputs = list() + for metric in self.metrics: + if metric.startswith('mrr'): + k = metric.split('@')[-1] + k = int(k) + mrr = self.compute_mrr(rank_result, k=k) + logger.info('{}: {}'.format(metric, mrr)) + eval_outputs.append((metric, mrr)) + elif metric.startswith('ndcg'): + k = metric.split('@')[-1] + k = int(k) + ndcg = self.compute_ndcg(rank_result, k=k) + logger.info('{}: {}'.format(metric, ndcg)) + eval_outputs.append(('ndcg', ndcg)) + else: + raise NotImplementedError('Metric %s not implemented' % metric) + + return dict(eval_outputs) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index f7801f09..885ca118 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -2,15 +2,19 @@ import math import os +import shutil +import tempfile from functools import partial from shutil import ignore_patterns from typing import Callable, Dict, Optional, Tuple, Union +import json import torch from torch import distributed as dist from torch import nn from torch.utils.data import Dataset +from modelscope.hub.file_download import model_file_download from 
modelscope.metainfo import Trainers from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset @@ -30,6 +34,36 @@ from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, @TRAINERS.register_module(module_name=Trainers.ofa) class OFATrainer(EpochBasedTrainer): + r""" + OFA trainer for MaaS. + + Args: + model (`str`): A model dir or a model id to be loaded + cfg_file (`str`, **optional**, default to `None`): + A config dir + cfg_modify_fn (`Callable`, **optional**, default to `None`): + A function which can rebuild the config file. + arg_parse_fn (`Callable`, **optional**, default to `None`): + Same as ``parse_fn`` in :obj:`Config.to_args`. + data_collator (`Callable`, **optional**, default to `None`): + The function to use to form a batch from a list of elements + of `train_dataset` or `eval_dataset`. + train_dataset (:obj:`MsDataset` or :obj:`Dataset`, **optional**, default to `None`): + Dataset for training. + eval_dataset (:obj:`MsDataset` or :obj:`Dataset`, **optional**, default to `None`): + Dataset for evaluation. + preprocessor (:obj:`Preprocessor`, **optional**, default to `None`): + The optional preprocessor. + NOTE: If the preprocessor has been called before the dataset fed into this trainer by user's custom code, + this parameter should be None, meanwhile remove the 'preprocessor' key from the cfg_file. + Else the preprocessor will be instantiated from the cfg_file or assigned from this parameter and + this preprocessing action will be executed every time the dataset's __getitem__ is called. + model_revision (`str`, **optional**, default to `None`): + The revision used when the model_name_or_path is + a model id of the remote hub. default `None`. + seed (`int`, **optional**, default to `42`): + The optional random seed for torch, cuda, numpy and random. + """ def __init__( self, @@ -53,18 +87,26 @@ class OFATrainer(EpochBasedTrainer): model, revision=model_revision, invoked_by=Invoke.TRAINER) model_dir = model.model_dir self.cfg_modify_fn = cfg_modify_fn - cfg = self.rebuild_config(Config.from_file(cfg_file)) - if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: - work_dir = cfg.train.work_dir - else: - work_dir = kwargs['work_dir'] + work_dir = kwargs.get('work_dir', 'workspace') os.makedirs(work_dir, exist_ok=True) ignore_file_set = set() - ignore_file_set.add(ModelFile.CONFIGURATION) + if cfg_file is not None: + cfg_file = self.get_config_file(cfg_file) + dst = os.path.abspath( + os.path.join(work_dir, ModelFile.CONFIGURATION)) + src = os.path.abspath(cfg_file) + if src != dst: + shutil.copy(src, work_dir) + ignore_file_set.add(ModelFile.CONFIGURATION) recursive_overwrite( model_dir, work_dir, ignore=ignore_patterns(*ignore_file_set)) - + cfg_file = os.path.join(work_dir, ModelFile.CONFIGURATION) + cfg = self.rebuild_config(Config.from_file(cfg_file)) + if cfg_modify_fn is not None: + cfg = self.cfg_modify_fn(cfg) + with open(cfg_file, 'w') as writer: + json.dump(dict(cfg), fp=writer, indent=4) if preprocessor is None: preprocessor = { ConfigKeys.train: @@ -113,6 +155,7 @@ class OFATrainer(EpochBasedTrainer): model=model, cfg_file=cfg_file, arg_parse_fn=arg_parse_fn, + cfg_modify_fn=cfg_modify_fn, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, @@ -123,11 +166,47 @@ class OFATrainer(EpochBasedTrainer): ) def rebuild_config(self, cfg: Config): + r""" + rebuild config if `cfg_modify_fn` is not `None`. 
+ """ if self.cfg_modify_fn is not None: cfg = self.cfg_modify_fn(cfg) return cfg + def get_config_file(self, config_file: str): + r""" + support local file/ url or model_id with revision + """ + if os.path.exists(config_file): + return config_file + else: + temp_name = tempfile.TemporaryDirectory().name + if len(config_file.split('#')) == 2: + model_id = config_file.split('#')[0] + revision = config_file.split('#')[-1].split('=')[-1] + else: + model_id = config_file + revision = DEFAULT_MODEL_REVISION + file_name = model_file_download( + model_id, + file_path=ModelFile.CONFIGURATION, + revision=revision, + cache_dir=temp_name) + return file_name + def train_step(self, model, inputs): + r""" + A single training step. + + step 1. Let the model in a trainable state. + step 2. Execute the criterion function. + step 3. Update the logging variable's value. + step 4. Update the training result. + + Args: + model (:obj:`torch.nn.Module` or :obj:`TorchModel`): The model to be run. + inputs (`dict`): model inputs. + """ model = model.module if self._dist or is_parallel(model) else model model.train() loss, sample_size, logging_output = self.criterion(model, inputs) diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py index ffd4cf78..3f1a3784 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer_utils.py @@ -31,6 +31,13 @@ def recursive_overwrite(src, dst, ignore=None): def construct_rdrop_sample(x): + r""" + Construct a new sample which doubles each value. + + .. note:: + This function seems to only work when the type if `x` is `Tensor`, + other types should check the correctness. + """ if isinstance(x, dict): for key in x: x[key] = construct_rdrop_sample(x[key]) @@ -46,6 +53,23 @@ def construct_rdrop_sample(x): def kl_loss(p, q): + r""" + The Kullback-Leibler divergence loss using in OFA + + step 1. Calculate the Kullback-leibler divergence for each setting, see + more from :class:`~torch.nn.functional.kl_div` for details: + - `p` as input, `q` as target + - `q` as input, `p` as target + step 2. Average the two kl divergences as final loss. + + Args: + p (Tensor): Tensor with arbitrary shape. + q (Tensor): Tensor with the same shape as p. + + .. note:: + :attr:`p` and :attr:`q` should be in the log space of observation and model + prediction values. + """ p_loss = F.kl_div(p, torch.exp(q), reduction='sum') q_loss = F.kl_div(q, torch.exp(p), reduction='sum') loss = (p_loss + q_loss) / 2 @@ -64,6 +88,51 @@ def label_smoothed_nll_loss(lprobs, constraint_masks=None, constraint_start=None, constraint_end=None): + r""" + Computing label smoothed negative log likelihood loss. + + step 1. Calculating the negative log likelihood loss as `nll_loss`. + step 2. Calculating the smooth loss which is the sum of last dimension of + `nll_loss` as `smooth_loss` + step 3. Calculating the `esp_i`, which is the scale factor of `nll_loss` + and `smooth_loss` while calculating the `loss`. + step 4. Calculating the `loss` using :attr:`epsilon`, `eps_i`, `nll_loss` + and `smooth_loss`. + step 5. If `use_rdrop` is True, computing the Kullback-Leilber divergence + loss, making the doubled samples keep close after dropout. Add the kl + loss to the final `loss`. + + Args: + lprobs (`Tensor` with shape `[bsz*seq_len, embed_dim]`): + log probabilities of the model. 
+ target (`Tensor` with shape `[bsz*seq_len]`): + the target tokens + epsilon (`float`): scale factor of combine `nll_loss` and `smooth_loss`. + update_num (`int`): the number of updating parameters. + drop_worst_ratio (`float`, **optional**, default to `0.0`): + the ratio of dropped tokens whose score is worse then others. + drop_worst_after (`int`, **optional**, default to `0`): + the number of tokens after dropped by score. + use_rdrop (`bool`, **optional**, default to `False`): + whether or not to add Kullback-leilber divergence loss. if true, the + sample should be doubled in the preprocessing. + reg_alpha (`float`, **optional**, default to `1.0`): + the regular factor to add kl divergence loss to total loss. + constraint_masks (`tensor`, **optional**, default to `None`): + bool tensor with arbitrary shape which can be broadcast to the + shape of `lporbs` + constraint_start(`int`, **optional**, default to `None`): + the start of the token index. + constraint_start(`int`, **optional**, default to `None`): + the end of the token index. + + Returns: + A tuple of: + - loss, scalar tensor with average total loss of total tokens. + - nll_loss, scalar tensor with average negative log likelihood loss + of total tokens. + - ntokens, the number of total tokens, should be `bsz * seq_len`. + """ if target.dim() == lprobs.dim() - 1: target = target.unsqueeze(-1) nll_loss = -lprobs.gather(dim=-1, index=target).squeeze(-1) @@ -176,6 +245,38 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): return loss, sample_size, logging_output def get_lprobs_and_target(self, logits, sample): + r""" + Calculating the log probabilities from model's output `logits`, and processing the + target from `sample`. + + step 1. Get the log probabilities from model's output logits. + - Get the scale factor `conf`, default is `1`. + - If some constrains are available, let the logits values out of + constraints be :obj:`-math.inf` + - Calculate the log softmax result and multiply scale factor `conf`, + see :class:`~torch.nn.functional.log_softmax` for details. + - If some ignore configs are available, remove the ignore token's + log probabilities. + step 2. Processing the target + - If some ignore configs are available, remove the ignore tokens + in the target. + step 3. Get the constraint mask + - If some ignore configs are available, remove the ignore tokens + in the constraint mask. + + Args: + logits (:obj:`Tensor` with shape `[bsz, seq_len, embed_dim]`): + Model's output logits. + sample (`Dict[str, Tensor]`): + A sample for model's input, the key`target` must be in the + sample for training. + + Returns: + A tuple of: + - log probabilities with shape `[bsz * (seq_len - 1), embed_dim]` + - target token index with shape `[bsz * (seq_len - 1),]` + - constraint mask with shape `[bsz * (seq_len - 1),]` + """ conf = sample['conf'][:, None, None] if 'conf' in sample and sample[ 'conf'] is not None else 1 constraint_masks = None @@ -208,6 +309,32 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): lprobs.size(-1)), target.view(-1), constraint_masks def compute_loss(self, logits, sample, update_num, reduce=True): + r""" + Computing loss for adjust label smoothed cross entropy. + + step 1. Getting log probabilities and target and constraints mask. + step 2. Remove the padding token result. + step 3. Computing the label smoothed negative log likelihood loss + as the final result. + + Args: + logits (:obj:`Tensor` with shape `[bsz, seq_len, embed_dim]`): + Model's output logits. 
+ sample (`Dict[str, Tensor]`): + A sample for model's input, the key`target` must be in the + sample for training. + update_num (`int`): The number of updating parameters. + + .. note:: + The parameter `reduce` is never used in this function, should be + removed. + + Returns: + A tuple of: + - loss, a scalar tensor, the final loss. + - nll_loss, a scalar tensor, the negative log likelihood loss + - ntokens, int, the number of tokens in calculating the loss. + """ lprobs, target, constraint_masks = self.get_lprobs_and_target( logits, sample) if constraint_masks is not None: @@ -257,6 +384,14 @@ class AdjustLabelSmoothedCrossEntropyCriterion(_Loss): def get_schedule(scheduler): + r""" + Get the relative scheduler class and args by different input scheduler. + So far, we support for types of input scheduler: + - `const` + - `linear` + - `cosine` + - `polynomial_decay` + """ if scheduler.name == 'const': scheduler_class = transformers.get_constant_schedule_with_warmup diff --git a/modelscope/trainers/nlp/faq_question_answering_trainer.py b/modelscope/trainers/nlp/faq_question_answering_trainer.py new file mode 100644 index 00000000..a4a78cf7 --- /dev/null +++ b/modelscope/trainers/nlp/faq_question_answering_trainer.py @@ -0,0 +1,316 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import contextlib +from collections import defaultdict +from dataclasses import dataclass +from distutils.version import LooseVersion +from functools import partial +from typing import Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.distributed import DistributedSampler + +from modelscope.metainfo import Trainers +from modelscope.models.base import TorchModel +from modelscope.msdatasets import MsDataset +from modelscope.preprocessors import Preprocessor +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.nlp_trainer import EpochBasedTrainer +from modelscope.trainers.trainer import worker_init_fn +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModeKeys +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import get_dist_info + +logger = get_logger() + + +@contextlib.contextmanager +def numpy_seed(seed, *addl_seeds): + """Context manager which seeds the NumPy PRNG with the specified seed and + restores the state afterward""" + if seed is None: + yield + return + if len(addl_seeds) > 0: + seed = int(hash((seed, *addl_seeds)) % 1e6) + state = np.random.get_state() + np.random.seed(seed) + try: + yield + finally: + np.random.set_state(state) + + +class EpisodeSampler(torch.utils.data.BatchSampler): + + def __init__(self, dataset, k_shot, n_way, r_query, min_labels, seed, + n_iter, rank, world_size): + self.dataset = dataset + self.k_shot = k_shot + self.n_way = n_way + self.r_query = r_query + self.min_labels = min_labels + self.seed = seed + self.rank = rank + self.world_size = world_size + self.step = 0 + self.label_field = 'label' + self.text_field = 'text' + self.domain_field = 'domain' + self.default_domain = 'default_domain' + self.episode = n_iter + domain_label_sampleid = {} + bad_sample_ids = self.get_bad_sampleids(dataset) + for sample_index, sample in enumerate(dataset): + if sample_index in bad_sample_ids: + continue + label = self._get_field(sample, self.label_field) + text = self._get_field(sample, self.text_field) + if label is None or text is None: + continue + domain = self._get_field(sample, 
self.domain_field, + self.default_domain) + label_tokens = domain_label_sampleid.get(domain, {}) + domain_label_sampleid[domain] = label_tokens + sample_list = label_tokens.get(label, []) + label_tokens[label] = sample_list + sample_list.append(sample_index) + self.domain_label_tokens = self.remove_invalid_labels( + domain_label_sampleid) + self.domains = sorted(list(self.domain_label_tokens.keys())) + domain_label_cnt = [ + len(self.domain_label_tokens[domain]) for domain in self.domains + ] + total = float(sum(domain_label_cnt)) + self.domain_to_prob = [ + domain_label_cnt[i] / total + for i, domain in enumerate(self.domains) + ] + data_size = 0 + for domain, label_tokens in self.domain_label_tokens.items(): + for label, tokens in label_tokens.items(): + data_size += len(tokens) + if dataset.mode == 'train': + logger.info( + f'{dataset.mode}: label size:{total}, data size:{data_size}') + + def __iter__(self): + for i in range(self.episode): + seed = self.step * self.world_size + self.rank + with numpy_seed(*(seed, self.seed)): + self.step += 1 + domain = np.random.choice( + self.domains, p=self.domain_to_prob, size=1, + replace=False)[0] + all_labels = sorted( + list(self.domain_label_tokens[domain].keys())) + N = min(self.n_way, len(all_labels)) + labels = np.random.choice( + all_labels, size=min(N, len(all_labels)), replace=False) + batch = [] + for label in labels[:N]: + candidates = self.domain_label_tokens[domain][label] + K = min(len(candidates), int((self.k_shot + self.r_query))) + tmp = np.random.choice(candidates, size=K, replace=False) + batch.extend(tmp) + batch = [int(n) for n in batch] + yield batch + + def _get_field(self, obj, key, default=None): + value = getattr(obj, key, default) or obj.get(key, default) + if value is not None: + return str(value) + return None + + def remove_invalid_labels(self, domain_label_sampleid): + removed_labels = set() + removed_domains = set() + result = {} + for domain, label_to_samples in domain_label_sampleid.items(): + result[domain] = {} + for label, samples in label_to_samples.items(): + if len(samples) < self.k_shot: + removed_labels.add(label) + else: + result[domain][label] = samples + if len(result[domain]) < self.min_labels: + del result[domain] + removed_domains.add(domain) + return result + + def get_bad_sampleids(self, dataset): + domain_text_to_samples = defaultdict(lambda: defaultdict(list)) + for local_index, sample in enumerate(dataset): + domain = self._get_field( + sample, self.domain_field, default=self.default_domain) + idx = self._get_field(sample, self.text_field, default='') + domain_text_to_samples[domain][idx].append( + (local_index, self._get_field(sample, self.label_field))) + + overall_conflict_result = [] + overall_duplicate_result = [] + for domain, text_to_samples in domain_text_to_samples.items(): + conflict_result = [] + duplicate_result = [] + for text, samples in text_to_samples.items(): + label_cnt = set([item[1] for item in samples]) + if len(label_cnt) >= 2: + conflict_result.extend([item[0] for item in samples]) + else: + duplicate_result.extend([item[0] for item in samples[1:]]) + overall_conflict_result.extend(conflict_result) + overall_duplicate_result.extend(duplicate_result) + + result = set(list(overall_duplicate_result)) + # remove conflict data which the same query has different label + result.update(set(list(overall_conflict_result))) + return result + + def __len__(self): + return self.episode + + +@dataclass +class FewShotCollator(): + + def __init__(self, preprocessor: Preprocessor, k_shot): 
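+        # Editor's annotation (not part of the original patch): this collator
+        # receives one episode drawn by EpisodeSampler, groups the samples by label,
+        # keeps the first `k_shot` texts of each label as the support set and the
+        # remaining texts as the query set, then lets the preprocessor turn the
+        # episode into model inputs.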
+ self.preprocessor = preprocessor + self.k_shot = k_shot + self.label_field = 'label' + self.text_field = 'text' + self.domain_field = 'domain' + + def _get_field(self, obj, key, default=None): + return getattr(obj, key, default) or obj.get(key, default) + + def __call__(self, samples): + label_to_texts = defaultdict(list) + for sample in samples: + text = self._get_field(sample, self.text_field) + label = self._get_field(sample, self.label_field) + label_to_texts[label].append(text) + query_set = [] + query_labels = [] + support_set = [] + for label, texts in label_to_texts.items(): + s = texts[:self.k_shot] + q = texts[self.k_shot:] + query_set.extend(q) + support_set.extend([{ + self.text_field: t, + self.label_field: label + } for t in s]) + query_labels.extend([label] * len(q)) + sample = { + 'query_set': query_set, + 'support_set': support_set, + 'query_label': query_labels + } + result = self.preprocessor(sample, mode=ModeKeys.INFERENCE) + return result + + +class FaqDataset(Dataset): + + def __init__(self, data): + self.data = data + + def __getitem__(self, i): + return self.data[i] + + def __setitem__(self, key, value): + self.data[key] = value + + def __len__(self): + return len(self.data) + + +@TRAINERS.register_module(module_name=Trainers.faq_question_answering_trainer) +class FaqQuestionAnsweringTrainer(EpochBasedTrainer): + + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + cfg_modify_fn: Optional[Callable] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Union[Callable, Dict[str, + Callable]]] = None, + train_dataset: Optional[Union[MsDataset, Dataset, List]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset, List]] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 42, + **kwargs): + if isinstance(train_dataset, list): + train_dataset = FaqDataset(train_dataset) + if isinstance(eval_dataset, list): + eval_dataset = FaqDataset(eval_dataset) + super(FaqQuestionAnsweringTrainer, + self).__init__(model, cfg_file, cfg_modify_fn, arg_parse_fn, + data_collator, train_dataset, eval_dataset, + preprocessor, optimizers, model_revision, seed, + **kwargs) + k_shot = self.cfg.safe_get('train.sampler.k_shot') + self.train_data_collator = FewShotCollator(self.train_preprocessor, + k_shot) + self.eval_data_collator = FewShotCollator(self.eval_preprocessor, + k_shot) + + @property + def max_iters(self): + return self._train_iters_per_epoch * self.max_epochs + + @property + def inner_iter(self) -> int: + return 0 + + def _build_dataloader_with_dataset(self, + dataset: Dataset, + workers_per_gpu: int, + dist: bool = False, + shuffle: bool = True, + seed: int = 0, + persistent_workers=False, + **kwargs) -> DataLoader: + rank, world_size = get_dist_info() + sampler = None + sampler_cfg = self.cfg.safe_get('train.sampler', {}) + sampler_cfg['seed'] = seed + if dataset.mode == ModeKeys.TRAIN: + sampler_cfg['n_iter'] = self.cfg.safe_get( + 'train.train_iters_per_epoch') + else: + sampler_cfg['n_iter'] = self.cfg.safe_get( + 'evaluation.val_iters_per_epoch') + sampler_cfg['rank'] = rank + sampler_cfg['world_size'] = world_size + batch_sampler = EpisodeSampler(dataset, **sampler_cfg) + + init_fn = partial( + worker_init_fn, num_workers=workers_per_gpu, rank=rank, + seed=seed) if seed is 
not None else None + + if LooseVersion(torch.__version__) >= LooseVersion('1.7.0'): + kwargs['persistent_workers'] = persistent_workers + elif persistent_workers is True: + self.logger.warning( + 'persistent_workers is invalid because your pytorch ' + 'version is lower than 1.7.0') + data_loader = DataLoader( + dataset, + sampler=sampler, + num_workers=workers_per_gpu, + batch_sampler=batch_sampler, + pin_memory=kwargs.pop('pin_memory', False), + worker_init_fn=init_fn, + **kwargs) + + return data_loader diff --git a/modelscope/trainers/nlp/gpt3_trainer.py b/modelscope/trainers/nlp/gpt3_trainer.py index 51e7ba1e..afda6424 100644 --- a/modelscope/trainers/nlp/gpt3_trainer.py +++ b/modelscope/trainers/nlp/gpt3_trainer.py @@ -2,27 +2,25 @@ import os from collections.abc import Mapping -from typing import List +from typing import Any, Dict, List import torch -from megatron import mpu +from megatron_util import mpu from modelscope.metainfo import Trainers from modelscope.models import TorchModel +from modelscope.models.nlp import GPT3ForTextGeneration from modelscope.trainers.builder import TRAINERS from modelscope.trainers.nlp_trainer import NlpEpochBasedTrainer from modelscope.utils.config import Config -from modelscope.utils.file_utils import func_receive_dict_inputs @TRAINERS.register_module(module_name=Trainers.gpt3_trainer) class GPT3Trainer(NlpEpochBasedTrainer): def rebuild_config(self, cfg: Config): - super().rebuild_config(cfg) - cfg.model.rank = int(os.environ.get('LOCAL_RANK', -1)) - cfg.model.master_ip = os.environ.get('MASTER_ADDR', '127.0.0.1') - cfg.model.master_port = os.environ.get('MASTER_PORT', '29500') + cfg = super().rebuild_config(cfg) + cfg.model.rank = int(os.environ.get('RANK', 0)) return cfg def train_step(self, model: TorchModel, inputs: Mapping): @@ -39,13 +37,19 @@ class GPT3Trainer(NlpEpochBasedTrainer): model = self.model.module if self._dist else self.model model.eval() - with torch.no_grad(): - if isinstance( - data, - Mapping) and not func_receive_dict_inputs(model.generate): - result = model.generate(**data) - else: - result = model.generate(data) + if self._is_pair(data): + return self._generate_eval(model, data) + else: + return self._forward_eval(model, data) + + @staticmethod + def _is_pair(data: Dict[str, Any]) -> bool: + return 'is_pair' in data and bool(data['is_pair'][0]) + + def _generate_eval(self, model: GPT3ForTextGeneration, + data: Dict[str, Any]) -> Dict[str, Any]: + data['do_sample'] = False + result = model.generate(data) prompt_length: List[int] = data['prompt_length'] result['preds'] = [ @@ -56,6 +60,8 @@ class GPT3Trainer(NlpEpochBasedTrainer): self._decode(seq[skip_len - 1:]) for seq, skip_len in zip(data['labels'], prompt_length) ] - assert len(result['preds']) == len(data['tgts']) - return result + + def _forward_eval(self, model: GPT3ForTextGeneration, + data: Dict[str, Any]) -> Dict[str, Any]: + return model.forward(data) diff --git a/modelscope/trainers/nlp/gpt_moe_trainer.py b/modelscope/trainers/nlp/gpt_moe_trainer.py index 8d431881..6aa06f73 100644 --- a/modelscope/trainers/nlp/gpt_moe_trainer.py +++ b/modelscope/trainers/nlp/gpt_moe_trainer.py @@ -5,7 +5,7 @@ from collections.abc import Mapping from typing import List import torch -from megatron import mpu +from megatron_util import mpu from modelscope.metainfo import Trainers from modelscope.models import TorchModel diff --git a/modelscope/trainers/nlp/plug_trainer.py b/modelscope/trainers/nlp/plug_trainer.py index 7d7d830c..d7243822 100644 --- 
a/modelscope/trainers/nlp/plug_trainer.py +++ b/modelscope/trainers/nlp/plug_trainer.py @@ -2,7 +2,7 @@ import os from typing import Callable, Dict, List, Optional, Tuple, Union import torch -from megatron import mpu +from megatron_util import mpu from torch import nn from modelscope.metainfo import Trainers diff --git a/modelscope/trainers/nlp/table_question_answering_trainer.py b/modelscope/trainers/nlp/table_question_answering_trainer.py new file mode 100644 index 00000000..49d88874 --- /dev/null +++ b/modelscope/trainers/nlp/table_question_answering_trainer.py @@ -0,0 +1,550 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import os.path as osp +import time +from typing import Dict, Optional + +import json +import numpy +import torch +import tqdm +from torch.optim.lr_scheduler import LambdaLR +from torch.utils.data import DataLoader + +from modelscope.metainfo import Trainers +from modelscope.models import Model +from modelscope.models.nlp.space_T_cn.table_question_answering import \ + TableQuestionAnswering +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@TRAINERS.register_module(module_name=Trainers.table_question_answering_trainer + ) +class TableQuestionAnsweringTrainer(BaseTrainer): + + def __init__(self, model: str, cfg_file: str = None, *args, **kwargs): + self.model = Model.from_pretrained(model) + self.train_dataset = kwargs['train_dataset'] + self.eval_dataset = kwargs['eval_dataset'] + + def get_linear_schedule_with_warmup(self, + optimizer, + num_warmup_steps, + num_training_steps, + last_epoch=-1): + """ + set scheduler + """ + + def lr_lambda(current_step: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max( + 0.0, + float(num_training_steps - current_step) + / float(max(1, num_training_steps - num_warmup_steps))) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + + def get_wc1(self, conds): + """ + [ [wc, wo, wv], + [wc, wo, wv], ... + ] + """ + wc1 = [] + for cond in conds: + wc1.append(int(cond[0])) + return wc1 + + def get_wo1(self, conds): + """ + [ [wc, wo, wv], + [wc, wo, wv], ... + ] + """ + wo1 = [] + for cond in conds: + wo1.append(int(cond[1])) + return wo1 + + def get_wv1(self, conds): + """ + [ [wc, wo, wv], + [wc, wo, wv], ... 
+ ] + """ + wv1 = [] + for cond in conds: + wv1.append(str(cond[2])) + return wv1 + + def set_from_to(self, data, start, end, value): + for i in range(start, end + 1): + data[i] = value + return data + + def get_g(self, sql_i, l_hs, action): + """ + for backward compatibility, separated with get_g + """ + g_sc = [] + g_sa = [] + g_wn = [] + g_wc = [] + g_wo = [] + g_wv = [] + g_slen = [] + g_action = [] + g_cond_conn_op = [] + idxs = [] + for b, psql_i1 in enumerate(sql_i): + # g_sc.append(psql_i1["sel"][0]) + # g_sa.append(psql_i1["agg"][0]) + psql_i1['sel'] = numpy.asarray(psql_i1['sel']) + idx = numpy.argsort(psql_i1['sel']) + # put back one + slen = len(psql_i1['sel']) + sid_list = list(psql_i1['sel'][idx] + 1) + said_list = list(numpy.asarray(psql_i1['agg'])[idx]) + for i, sid in enumerate(sid_list): + if sid >= l_hs[b]: + sid_list[i] = 0 + if said_list[i] == 0: + slen -= 1 + sid_list += [ + 0 for _ in range(self.model.max_select_num - len(sid_list)) + ] + # put back one + said_list += [ + 0 for _ in range(self.model.max_select_num - len(said_list)) + ] + g_sc.append(sid_list) + g_sa.append(said_list) + g_slen.append(0 if slen <= 0 else slen) + + psql_i1['sel'] = numpy.sort(psql_i1['sel']) + psql_i1['agg'] = numpy.sort(psql_i1['agg']) + assert len(psql_i1['sel']) == len(psql_i1['agg']) + + g_action.append(action[b][0]) + g_cond_conn_op.append(psql_i1['cond_conn_op']) + + conds = numpy.asarray(psql_i1['conds']) + conds_num = [int(x) for x in conds[:, 0]] + idx = numpy.argsort(conds_num) + idxs.append(idx) + psql_i1['conds'] = conds[idx] + if not len(psql_i1['agg']) < 0: + # put back one + wlen = len(conds) + wcd_list = list( + numpy.array(self.get_wc1(list(conds[idx]))) + 1) + wod_list = list(numpy.array(self.get_wo1(list(conds[idx])))) + for i, wcd in enumerate(wcd_list): + if wcd >= l_hs[b]: + wcd_list[i] = 0 + wlen -= 1 + wcd_list += [ + 0 for _ in range(self.model.max_where_num - len(wcd_list)) + ] + wod_list += [ + 0 for _ in range(self.model.max_where_num - len(wod_list)) + ] + g_wc.append(wcd_list) + g_wn.append(0 if wlen <= 0 else wlen) + g_wo.append(wod_list) + g_wv.append(self.get_wv1(list(conds[idx]))) + else: + raise EnvironmentError + + return g_sc, g_sa, g_wn, g_wc, g_wo, g_wv, g_cond_conn_op, g_slen, g_action, idxs + + def get_g_wvi_bert_from_g_wvi_corenlp(self, g_wvi_corenlp, l_n, idxs): + """ + Generate SQuAD style start and end index of wv in nlu. Index is for of after WordPiece tokenization. + + Assumption: where_str always presents in the nlu. 
+ """ + max_l = 0 + for elem in l_n: + if elem > max_l: + max_l = elem + + # for first [CLS] and end [SEP] + max_l += 2 + g_wvi = [] + g_wv_ps = [] + g_wv_pe = [] + for b, t_obj in enumerate(g_wvi_corenlp): + g_wvi1 = [0] * max_l + g_wvss1 = [0] * self.model.max_where_num + g_wvse1 = [0] * self.model.max_where_num + for i_wn, g_wvi_corenlp11 in enumerate( + list(numpy.asarray(t_obj['wvi_corenlp'])[idxs[b]])): + st_idx, ed_idx = g_wvi_corenlp11 + + if st_idx == -100 and ed_idx == -100: + continue + else: + # put back one + self.set_from_to(g_wvi1, st_idx + 1, ed_idx + 1, i_wn + 1) + g_wvss1[i_wn] = st_idx + 1 + g_wvse1[i_wn] = ed_idx + 1 + + g_wvi.append(g_wvi1) + g_wv_ps.append(g_wvss1) + g_wv_pe.append(g_wvse1) + + return g_wvi, (g_wv_ps, g_wv_pe) + + def loss_scco(self, s_cco, g_cond_conn_op): + loss = torch.nn.functional.cross_entropy( + s_cco, + torch.tensor(g_cond_conn_op).to(self.model.device)) + return loss + + def loss_sw_se(self, s_action, s_sc, s_sa, s_cco, s_wc, s_wo, s_wvs, g_sc, + g_sa, g_wn, g_wc, g_wo, g_wvi, g_cond_conn_op, g_slen, + g_wvp, max_h_len, s_len, g_action): + loss = 0 + + loss += torch.nn.functional.cross_entropy( + s_sc.reshape(-1, max_h_len), + torch.tensor(g_sc).reshape(-1).to(self.model.device)) + loss += torch.nn.functional.cross_entropy( + s_sa.reshape(-1, self.model.n_agg_ops), + torch.tensor(g_sa).reshape(-1).to(self.model.device)) + + s_slen, s_wlen = s_len + loss += self.loss_scco(s_cco, g_cond_conn_op) + loss += self.loss_scco(s_slen, g_slen) + loss += self.loss_scco(s_wlen, g_wn) + + loss += self.loss_scco(s_action, g_action) + + loss += torch.nn.functional.cross_entropy( + s_wc.reshape(-1, max_h_len), + torch.tensor(g_wc).reshape(-1).to(self.model.device)) + loss += torch.nn.functional.cross_entropy( + s_wo.reshape(-1, self.model.n_cond_ops), + torch.tensor(g_wo).reshape(-1).to(self.model.device)) + + s_wvs_s, s_wvs_e = s_wvs + loss += torch.nn.functional.cross_entropy( + s_wvs_s.reshape(-1, s_wvs_s.shape[-1]), + torch.tensor(g_wvp[0]).reshape(-1).to(self.model.device)) + loss += torch.nn.functional.cross_entropy( + s_wvs_e.reshape(-1, s_wvs_e.shape[-1]), + torch.tensor(g_wvp[1]).reshape(-1).to(self.model.device)) + + return loss + + def sort_agg_sel(self, aggs, sels): + if len(aggs) != len(sels): + return aggs, sels + seldic = {} + for i, sel in enumerate(sels): + seldic[sel] = aggs[i] + aps = sorted(seldic.items(), key=lambda d: d[0]) + new_aggs = [] + new_sels = [] + for ap in aps: + new_sels.append(ap[0]) + new_aggs.append(ap[1]) + return new_aggs, new_sels + + def sort_conds(self, nlu, conds): + newconds = [] + for cond in conds: + if len(newconds) == 0: + newconds.append(cond) + continue + idx = len(newconds) + for i, newcond in enumerate(newconds): + if cond[0] < newcond[0]: + idx = i + break + elif cond[0] == newcond[0]: + val = cond[2] + newval = newcond[2] + validx = nlu.find(val) + newvalidx = nlu.find(newval) + if validx != -1 and newvalidx != -1 and validx < newvalidx: + idx = i + break + if idx == len(newconds): + newconds.append(cond) + else: + newconds.insert(idx, cond) + return newconds + + def calculate_scores(self, answers, results, epoch=0): + if len(answers) != len(results) or len(results) == 0: + return + + all_sum, all_right, sc_len, cco, wc_len = 0, 0, 0, 0, 0 + act, s_agg, all_col, s_col = 0, 0, 0, 0 + all_w, w_col, w_op, w_val = 0, 0, 0, 0 + for idx, item in enumerate(tqdm.tqdm(answers, desc='evaluate')): + nlu = item['question'] + qaSQL = item['sql'] + result = results[idx] + sql = result['sql'] + question = 
result['question'] + questionToken = result['question_tok'] + rights, errors = {}, {} + if nlu != question: + continue + all_sum += 1 + right = True + if len(sql['sel']) == len(qaSQL['sel']) and len(sql['agg']) == len( + qaSQL['agg']): + sc_len += 1 + rights['select number'] = None + else: + right = False + errors['select number'] = None + + if item['action'][0] == result['action']: + act += 1 + rights['action'] = None + else: + right = False + errors['action'] = None + + if sql['cond_conn_op'] == qaSQL['cond_conn_op']: + cco += 1 + rights['condition operator'] = None + else: + right = False + errors['condition operator'] = None + + if len(sql['conds']) == len(qaSQL['conds']): + wc_len += 1 + rights['where number'] = None + else: + right = False + errors['where number'] = None + + all_col += max(len(sql['agg']), len(qaSQL['agg'])) + aaggs, asels = self.sort_agg_sel(qaSQL['agg'], qaSQL['sel']) + raggs, rsels = self.sort_agg_sel(sql['agg'], sql['sel']) + for j, agg in enumerate(aaggs): + if j < len(raggs) and raggs[j] == agg: + s_agg += 1 + rights['select aggregation'] = None + else: + right = False + errors['select aggregation'] = None + if j < len(rsels) and j < len(asels) and rsels[j] == asels[j]: + s_col += 1 + rights['select column'] = None + else: + right = False + errors['select column'] = None + + all_w += max(len(sql['conds']), len(qaSQL['conds'])) + aconds = self.sort_conds(nlu, qaSQL['conds']) + rconds = self.sort_conds(nlu, sql['conds']) + + for j, cond in enumerate(aconds): + if j >= len(rconds): + break + + pcond = rconds[j] + if cond[0] == pcond[0]: + w_col += 1 + rights['where column'] = None + else: + right = False + errors['where column'] = None + if cond[1] == pcond[1]: + w_op += 1 + rights['where operator'] = None + else: + right = False + errors['where operator'] = None + value = '' + try: + for k in range(pcond['startId'], pcond['endId'] + 1, 1): + value += questionToken[k].strip() + except Exception: + value = '' + valuelow = value.strip().lower() + normal = cond[2].strip().lower() + valuenormal = pcond[2].strip().lower() + if (normal in valuenormal) or (normal in valuelow) or ( + valuelow in normal) or (valuenormal in normal): + w_val += 1 + rights['where value'] = None + else: + right = False + errors['where value'] = None + + if right: + all_right += 1 + + all_ratio = all_right / (all_sum + 0.01) + act_ratio = act / (all_sum + 0.01) + sc_len_ratio = sc_len / (all_sum + 0.01) + cco_ratio = cco / (all_sum + 0.01) + wc_len_ratio = wc_len / (all_sum + 0.01) + s_agg_ratio = s_agg / (all_col + 0.01) + s_col_ratio = s_col / (all_col + 0.01) + w_col_ratio = w_col / (all_w + 0.01) + w_op_ratio = w_op / (all_w + 0.01) + w_val_ratio = w_val / (all_w + 0.01) + logger.info( + '{STATIS} [epoch=%d] all_ratio: %.3f, act_ratio: %.3f, sc_len_ratio: %.3f, ' + 'cco_ratio: %.3f, wc_len_ratio: %.3f, s_agg_ratio: %.3f, s_col_ratio: %.3f, ' + 'w_col_ratio: %.3f, w_op_ratio: %.3f, w_val_ratio: %.3f' % + (epoch, all_ratio, act_ratio, sc_len_ratio, cco_ratio, + wc_len_ratio, s_agg_ratio, s_col_ratio, w_col_ratio, w_op_ratio, + w_val_ratio)) + + metrics = { + 'accuracy': all_ratio, + 'action_accuracy': act_ratio, + 'select_length_accuracy': sc_len_ratio, + 'connector_accuracy': cco_ratio, + 'where_length_accuracy': wc_len_ratio, + 'select_aggregation_accuracy': s_agg_ratio, + 'select_column_accuracy': s_col_ratio, + 'where_column_accuracy': w_col_ratio, + 'where_operator_accuracy': w_op_ratio, + 'where_value_accuracy': w_val_ratio + } + + return metrics + + def evaluate(self, 
checkpoint_path=None): + """ + Evaluate testsets + """ + metrics = {'all_ratio': 0.0} + if checkpoint_path is not None: + # load model + state_dict = torch.load(checkpoint_path) + self.model.backbone_model.load_state_dict( + state_dict['backbone_model']) + self.model.head_model.load_state_dict( + state_dict['head_model'], strict=False) + + # predict + results = [] + for data in tqdm.tqdm(self.eval_dataset, desc='predict'): + result = self.model.predict([data])[0] + results.append(result) + + metrics = self.calculate_scores(self.eval_dataset, results) + + return metrics + + def train( + self, + batch_size=16, + total_epoches=20, + backbone_learning_rate=1e-5, + head_learning_rate=5e-4, + backbone_weight_decay=0.01, + head_weight_decay=0.01, + warmup_ratio=0.1, + ): + """ + Fine-tuning trainsets + """ + # obtain train loader + train_loader = DataLoader( + batch_size=batch_size, + dataset=self.train_dataset, + shuffle=True, + num_workers=4, + collate_fn=lambda x: x) + + # some params + total_train_steps = len(train_loader) * total_epoches + warmup_steps = int(warmup_ratio * total_train_steps) + opt = torch.optim.AdamW( + filter(lambda p: p.requires_grad, + self.model.head_model.parameters()), + lr=head_learning_rate, + weight_decay=head_weight_decay) + opt_bert = torch.optim.AdamW( + filter(lambda p: p.requires_grad, + self.model.backbone_model.parameters()), + lr=backbone_learning_rate, + weight_decay=backbone_weight_decay) + lr_scheduler = self.get_linear_schedule_with_warmup( + opt, warmup_steps, total_train_steps) + lr_scheduler_bert = self.get_linear_schedule_with_warmup( + opt_bert, warmup_steps, total_train_steps) + + # start training + max_accuracy = 0.0 + for epoch in range(1, total_epoches + 1): + + # train model + self.model.head_model.train() + self.model.backbone_model.train() + for iB, item in enumerate(train_loader): + nlu, nlu_t, sql_i, q_know, t_know, action, hs_t, types, units, his_sql, schema_link = \ + self.model.get_fields_info(item, None, train=True) + + # forward process + all_encoder_layer, _, tokens, i_nlu, i_hds, l_n, l_hpu, l_hs, start_index, column_index, ids = \ + self.model.get_bert_output( + self.model.backbone_model, self.model.tokenizer, nlu_t, hs_t, + types, units, his_sql, q_know, t_know, schema_link) + g_sc, g_sa, g_wn, g_wc, g_wo, g_wv, g_cond_conn_op, g_slen, g_action, idxs = \ + self.get_g(sql_i, l_hs, action) + g_wvi, g_wvp = self.get_g_wvi_bert_from_g_wvi_corenlp( + item, l_n, idxs) + s_action, s_sc, s_sa, s_cco, s_wc, s_wo, s_wvs, s_len = self.model.head_model( + all_encoder_layer, l_n, l_hs, start_index, column_index, + tokens, ids) + + # calculate loss + max_h_len = max(l_hs) + loss_all = self.loss_sw_se(s_action, s_sc, s_sa, s_cco, s_wc, + s_wo, s_wvs, g_sc, g_sa, g_wn, g_wc, + g_wo, g_wvi, g_cond_conn_op, g_slen, + g_wvp, max_h_len, s_len, g_action) + + logger.info('{train} [epoch=%d/%d] [batch=%d/%d] loss: %.4f' % + (epoch, total_epoches, iB, len(train_loader), + loss_all.item())) + + # backward process + opt.zero_grad() + opt_bert.zero_grad() + loss_all.backward() + opt.step() + lr_scheduler.step() + opt_bert.step() + lr_scheduler_bert.step() + + # evaluate model + results = [] + for data in tqdm.tqdm(self.eval_dataset, desc='predict'): + result = self.model.predict([data])[0] + results.append(result) + metrics = self.calculate_scores( + self.eval_dataset, results, epoch=epoch) + if metrics['accuracy'] >= max_accuracy: + max_accuracy = metrics['accuracy'] + model_path = os.path.join(self.model.model_dir, + 'finetuned_model.bin') + state_dict = { 
+ 'head_model': self.model.head_model.state_dict(), + 'backbone_model': self.model.backbone_model.state_dict(), + } + torch.save(state_dict, model_path) + logger.info( + 'epoch %d obtain max score: %.4f, saving model to %s' % + (epoch, metrics['accuracy'], model_path)) diff --git a/modelscope/trainers/nlp_trainer.py b/modelscope/trainers/nlp_trainer.py index 5ce7c2f5..87d175a2 100644 --- a/modelscope/trainers/nlp_trainer.py +++ b/modelscope/trainers/nlp_trainer.py @@ -470,11 +470,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.id2label = {idx: label for idx, label in enumerate(labels)} self.num_labels = len(labels) except AttributeError: - label2id = parse_label_mapping(self.model_dir) - if label2id is not None: - self.label2id = label2id - self.id2label = {id: label for label, id in label2id.items()} - self.num_labels = len(label2id) + pass def build_dataset_keys(cfg): if cfg is not None: @@ -532,7 +528,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): """ # Compatible with old logic - model_args = {} if self.label2id is None else { + extra_args = {} if self.label2id is None else { 'label2id': self.label2id } @@ -540,7 +536,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.TRAIN, - **model_args, + **extra_args, **self.train_keys, mode=ModeKeys.TRAIN, use_fast=True) @@ -548,7 +544,7 @@ class NlpEpochBasedTrainer(EpochBasedTrainer): self.model_dir, cfg_dict=self.cfg, preprocessor_mode=ModeKeys.EVAL, - **model_args, + **extra_args, **self.eval_keys, mode=ModeKeys.EVAL, use_fast=True) diff --git a/modelscope/trainers/optimizer/builder.py b/modelscope/trainers/optimizer/builder.py index f43768d6..6b549f84 100644 --- a/modelscope/trainers/optimizer/builder.py +++ b/modelscope/trainers/optimizer/builder.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import inspect +from typing import Iterable, Union import torch @@ -9,21 +10,29 @@ from modelscope.utils.registry import Registry, build_from_cfg, default_group OPTIMIZERS = Registry('optimizer') -def build_optimizer(model: torch.nn.Module, +def build_optimizer(model: Union[torch.nn.Module, + Iterable[torch.nn.parameter.Parameter]], cfg: ConfigDict, default_args: dict = None): """ build optimizer from optimizer config dict Args: + model: A torch.nn.Module or an iterable of parameters. cfg (:obj:`ConfigDict`): config dict for optimizer object. default_args (dict, optional): Default initialization arguments. """ - if hasattr(model, 'module'): - model = model.module - if default_args is None: default_args = {} - default_args['params'] = model.parameters() + + if isinstance(model, torch.nn.Module) or (hasattr( + model, 'module') and isinstance(model.module, torch.nn.Module)): + if hasattr(model, 'module'): + model = model.module + + default_args['params'] = model.parameters() + else: + # Input is a iterable of parameters, this case fits for the scenario of user-defined parameter groups. 
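        # A minimal sketch of such user-defined groups (submodule names are
        # illustrative, not from this patch); ConfigDict is the config type already
        # used by this module:
        #   param_groups = [
        #       {'params': net.backbone.parameters(), 'lr': 1e-5},
        #       {'params': net.head.parameters(), 'lr': 5e-4},
        #   ]
        #   optimizer = build_optimizer(param_groups, cfg=ConfigDict({'type': 'AdamW'}))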
+ default_args['params'] = model return build_from_cfg( cfg, OPTIMIZERS, group_key=default_group, default_args=default_args) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index aa4818d9..bbe9d006 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -129,7 +129,6 @@ class EpochBasedTrainer(BaseTrainer): # add default config merge_cfg(self.cfg) self.cfg = self.rebuild_config(self.cfg) - self.logger = get_logger(log_level=self.cfg.get('log_level', 'INFO')) if 'cfg_options' in kwargs: self.cfg.merge_from_dict(kwargs['cfg_options']) @@ -147,8 +146,17 @@ class EpochBasedTrainer(BaseTrainer): preprocessor) self._dist = self.init_dist(kwargs.get('launcher')) + + if is_master() and not os.path.exists(self.work_dir): + os.makedirs(self.work_dir) + self.device = self.get_device(kwargs.get('device')) + # init logger after distribution init + log_file = os.path.join(self.work_dir, '{}.log'.format(self.timestamp)) + self.logger = get_logger( + log_file=log_file, log_level=self.cfg.get('log_level', 'INFO')) + self.train_dataset = self.to_task_dataset( train_dataset, mode=ModeKeys.TRAIN, @@ -280,7 +288,7 @@ class EpochBasedTrainer(BaseTrainer): Returns: The rebuilt config """ - if self.cfg_modify_fn is not None: + if hasattr(self, 'cfg_modify_fn') and self.cfg_modify_fn is not None: cfg = self.cfg_modify_fn(cfg) return cfg @@ -477,18 +485,8 @@ class EpochBasedTrainer(BaseTrainer): def train(self, checkpoint_path=None, *args, **kwargs): self._mode = ModeKeys.TRAIN - - if self.train_dataset is None: - self.train_dataloader = self.get_train_dataloader() - else: - self.train_dataloader = self._build_dataloader_with_dataset( - self.train_dataset, - dist=self._dist, - seed=self._seed, - collate_fn=self.train_data_collator, - **self.cfg.train.get('dataloader', {})) + self.train_dataloader = self.get_train_dataloader() self.data_loader = self.train_dataloader - self.register_optimizers_hook() self.register_hook_from_cfg(self.cfg.train.hooks) self.set_checkpoint_file_to_hook(checkpoint_path) @@ -502,15 +500,7 @@ class EpochBasedTrainer(BaseTrainer): CheckpointHook.load_checkpoint(checkpoint_path, self) self.model.eval() self._mode = ModeKeys.EVAL - if self.eval_dataset is None: - self.eval_dataloader = self.get_eval_data_loader() - else: - self.eval_dataloader = self._build_dataloader_with_dataset( - self.eval_dataset, - dist=self._dist, - seed=self._seed, - collate_fn=self.eval_data_collator, - **self.cfg.evaluation.get('dataloader', {})) + self.eval_dataloader = self.get_eval_data_loader() self.data_loader = self.eval_dataloader metric_classes = [build_metric(metric) for metric in self.metrics] for m in metric_classes: @@ -672,19 +662,14 @@ class EpochBasedTrainer(BaseTrainer): mode=ModeKeys.EVAL, preprocessor=self.eval_preprocessor) - batch_size = self.cfg.evaluation.dataloader.batch_size_per_gpu - workers = self.cfg.evaluation.dataloader.workers_per_gpu - shuffle = self.cfg.evaluation.dataloader.get('shuffle', False) + default_config = {'shuffle': False} + default_config.update(self.cfg.evaluation.get('dataloader', {})) data_loader = self._build_dataloader_with_dataset( self.eval_dataset, - batch_size_per_gpu=batch_size, - workers_per_gpu=workers, - shuffle=shuffle, dist=self._dist, seed=self._seed, - persistent_workers=True, collate_fn=self.eval_data_collator, - ) + **default_config) return data_loader def build_dataset(self, data_cfg, mode, preprocessor=None): @@ -942,73 +927,55 @@ class EpochBasedTrainer(BaseTrainer): """ Evaluation loop used by 
`EpochBasedTrainer.evaluate()`. """ + vis_closure = None + if hasattr(self.cfg.evaluation, 'visualization'): + vis_cfg = self.cfg.evaluation.visualization + vis_closure = partial( + self.visualization, dataset=self.eval_dataset, **vis_cfg) + if self._dist and self.cfg.model.get('model_parallel_size', 1) == 1: from modelscope.trainers.utils.inference import multi_gpu_test # list of batched result and data samples - results, data_list = multi_gpu_test( + metric_values = multi_gpu_test( self, data_loader, device=self.device, - tmpdir=None, - gpu_collect=False, + metric_classes=metric_classes, + vis_closure=vis_closure, + tmpdir=self.cfg.evaluation.get('cache_dir', None), + gpu_collect=self.cfg.evaluation.get('gpu_collect', False), data_loader_iters_per_gpu=self._eval_iters_per_epoch) else: from modelscope.trainers.utils.inference import single_gpu_test - results, data_list = single_gpu_test( + metric_values = single_gpu_test( self, data_loader, device=self.device, + metric_classes=metric_classes, + vis_closure=vis_closure, data_loader_iters=self._eval_iters_per_epoch) self._inner_iter = self.iters_per_epoch - 1 # start from index 0 - # evaluation result processing - if hasattr(self.cfg.evaluation, 'visualization'): - flatten_results = [] - for r in results: - flatten_results.extend(r) - vis_cfg = self.cfg.evaluation.visualization - self.visualization(results, self.eval_dataset, **vis_cfg) - - # do evaluation on rank0 - metric_values = {} - if not self._dist or is_master(): - assert len(data_list) == len( - results), f'size mismatch {len(data_list)} and {len(results)}' - for metric_cls in metric_classes: - for idx in range(len(data_list)): - metric_cls.add(results[idx], data_list[idx]) - - for metric_cls in metric_classes: - metric_values.update(metric_cls.evaluate()) - - _, world_size = get_dist_info() - if world_size > 1: - metric_values = broadcast(metric_values, 0) return metric_values - def visualization(self, results, dataset, **kwargs): + def visualization(self, batch_result, dataset, **kwargs): """ visualization function for evaluation results. + Examples: + # draw list of images as numpy array + images = draw_images(num_of_visualization) + + # set displayed name for each image + filenames = get_image_display_names() + vis_results = {'images': images, 'filenames' : filenames} + + # visualization results will be displayed in group named eva_vis + self.visualization_buffer.output['eval_vis'] = vis_results + Args: results (list(dict)): a list of result dict. - dataset (:obj:`Dataset`): torch dataset object to access original data. - - Implementation Examples: - ```python - # draw list of images as numpy array - images = draw_images(num_of_visualization) - - # set displayed name for each image - filenames = get_image_display_names() - vis_results = { - 'images': images, - 'filenames' : filenames - } - - # visualization results will be displayed in group named eva_vis - self.visualization_buffer.output['eval_vis'] = vis_results - ``` + dataset (Dataset): torch dataset object to access original data. """ # TODO @wenmeng.zwm add visualization support for cv evaluation raise NotImplementedError( diff --git a/modelscope/trainers/training_args.py b/modelscope/trainers/training_args.py new file mode 100644 index 00000000..c387e7b8 --- /dev/null +++ b/modelscope/trainers/training_args.py @@ -0,0 +1,270 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
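# Usage sketch for the command-line helpers defined below in this file (the script
# name and the `cfg` Config object are hypothetical):
#
#   parser = CliArgumentParser(training_args)
#   cfg_dict = parser.get_cfg_dict()        # e.g. {'train.max_epochs': 20, ...}
#   cfg.merge_from_dict(cfg_dict)           # push CLI values into the Config
#
# invoked as, for example:  python finetune.py --max_epochs 20 --lr 0.01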
+ +import dataclasses +from argparse import Action, ArgumentDefaultsHelpFormatter, ArgumentParser +from typing import Any, Dict, List, Union + +from addict import Dict as Adict + + +@dataclasses.dataclass +class ArgAttr(): + """ Attributes for each arg + + Args: + cfg_node_name (str or list[str]): if set empty, it means a normal arg for argparse, otherwise it means + this arg value correspond to those nodes in configuration file, and will replace them for training. + default: default value for current argument. + type: type for current argument. + choices (list of str): choices of value for this argument. + help (str): help str for this argument. + + Examples: + ```python + # define argument train_batch_size which corresponds to train.dataloader.batch_size_per_gpu + training_args = Adict( + train_batch_size=ArgAttr( + 'train.dataloader.batch_size_per_gpu', + default=16, + type=int, + help='training batch size') + ) + + # num_classes which will modify three places in configuration + training_args = Adict( + num_classes = ArgAttr( + ['model.mm_model.head.num_classes', + 'model.mm_model.train_cfg.augments.0.num_classes', + 'model.mm_model.train_cfg.augments.1.num_classes'], + type=int, + help='number of classes') + ) + ``` + # a normal argument which has no relation with configuration + training_args = Adict( + local_rank = ArgAttr( + '', + default=1, + type=int, + help='local rank for current training process') + ) + + """ + cfg_node_name: Union[str, List[str]] = '' + default: Any = None + type: type = None + choices: List[str] = None + help: str = '' + + +training_args = Adict( + train_batch_size=ArgAttr( + 'train.dataloader.batch_size_per_gpu', + default=16, + type=int, + help='training batch size'), + train_data_worker=ArgAttr( + 'train.dataloader.workers_per_gpu', + default=8, + type=int, + help='number of data worker used for training'), + eval_batch_size=ArgAttr( + 'evaluation.dataloader.batch_size_per_gpu', + default=16, + type=int, + help='training batch size'), + max_epochs=ArgAttr( + 'train.max_epochs', + default=10, + type=int, + help='max number of training epoch'), + work_dir=ArgAttr( + 'train.work_dir', + default='./work_dir', + type=str, + help='training directory to save models and training logs'), + lr=ArgAttr( + 'train.optimizer.lr', + default=0.001, + type=float, + help='initial learning rate'), + optimizer=ArgAttr( + 'train.optimizer.type', + default='SGD', + type=str, + choices=[ + 'Adadelta', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'ASGD', + 'RMSprop', 'Rprop' + 'SGD' + ], + help='optimizer type'), + local_rank=ArgAttr( + '', default=0, type=int, help='local rank for this process')) + + +class CliArgumentParser(ArgumentParser): + """ Argument Parser to define and parse command-line args for training. + + Args: + arg_dict (dict of `ArgAttr` or list of them): dict or list of dict which defines different + paramters for training. 
+ """ + + def __init__(self, arg_dict: Union[Dict[str, ArgAttr], + List[Dict[str, ArgAttr]]], **kwargs): + if 'formatter_class' not in kwargs: + kwargs['formatter_class'] = ArgumentDefaultsHelpFormatter + super().__init__(**kwargs) + self.arg_dict = arg_dict if isinstance( + arg_dict, Dict) else self._join_args(arg_dict) + self.define_args() + + def _join_args(self, arg_dict_list: List[Dict[str, ArgAttr]]): + total_args = arg_dict_list[0].copy() + for args in arg_dict_list[1:]: + total_args.update(args) + return total_args + + def define_args(self): + for arg_name, arg_attr in self.arg_dict.items(): + name = f'--{arg_name}' + kwargs = dict(type=arg_attr.type, help=arg_attr.help) + if arg_attr.default is not None: + kwargs['default'] = arg_attr.default + else: + kwargs['required'] = True + + if arg_attr.choices is not None: + kwargs['choices'] = arg_attr.choices + + kwargs['action'] = SingleAction + self.add_argument(name, **kwargs) + + def get_cfg_dict(self, args=None): + """ + Args: + args (default None): + List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) + + Returns: + cfg_dict (dict of config): each key is a config node name such as 'train.max_epochs', this cfg_dict + should be used with function `cfg.merge_from_dict` to update config object. + """ + self.args, remainning = self.parse_known_args(args) + args_dict = vars(self.args) + cfg_dict = {} + for k, v in args_dict.items(): + if k not in self.arg_dict or self.arg_dict[k].cfg_node_name == '': + continue + cfg_node = self.arg_dict[k].cfg_node_name + if isinstance(cfg_node, list): + for node in cfg_node: + cfg_dict[node] = v + else: + cfg_dict[cfg_node] = v + + return cfg_dict + + +class DictAction(Action): + """ + argparse action to split an argument into KEY=VALUE form + on the first = and append to a dictionary. List options can + be passed as comma separated values, i.e 'KEY=V1,V2,V3', or with explicit + brackets, i.e. 'KEY=[V1,V2,V3]'. It also support nested brackets to build + list/tuple values. e.g. 'KEY=[(V1,V2),(V3,V4)]' + """ + + @staticmethod + def parse_int_float_bool_str(val): + try: + return int(val) + except ValueError: + pass + try: + return float(val) + except ValueError: + pass + if val.lower() in ['true', 'false']: + return val.lower() == 'true' + if val == 'None': + return None + return val + + @staticmethod + def parse_iterable(val): + """Parse iterable values in the string. + All elements inside '()' or '[]' are treated as iterable values. + Args: + val (str): Value string. + Returns: + list | tuple: The expanded list or tuple from the string. + Examples: + >>> DictAction._parse_iterable('1,2,3') + [1, 2, 3] + >>> DictAction._parse_iterable('[a, b, c]') + ['a', 'b', 'c'] + >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]') + [(1, 2, 3), ['a', 'b'], 'c'] + """ + + def find_next_comma(string): + """Find the position of next comma in the string. + If no ',' is found in the string, return the string length. All + chars inside '()' and '[]' are treated as one element and thus ',' + inside these brackets are ignored. 
+ """ + assert (string.count('(') == string.count(')')) and ( + string.count('[') == string.count(']')), \ + f'Imbalanced brackets exist in {string}' + end = len(string) + for idx, char in enumerate(string): + pre = string[:idx] + # The string before this ',' is balanced + if ((char == ',') and (pre.count('(') == pre.count(')')) + and (pre.count('[') == pre.count(']'))): + end = idx + break + return end + + # Strip ' and " characters and replace whitespace. + val = val.strip('\'\"').replace(' ', '') + is_tuple = False + if val.startswith('(') and val.endswith(')'): + is_tuple = True + val = val[1:-1] + elif val.startswith('[') and val.endswith(']'): + val = val[1:-1] + elif ',' not in val: + # val is a single value + return DictAction.parse_int_float_bool_str(val) + + values = [] + while len(val) > 0: + comma_idx = find_next_comma(val) + element = DictAction.parse_iterable(val[:comma_idx]) + values.append(element) + val = val[comma_idx + 1:] + if is_tuple: + values = tuple(values) + return values + + def __call__(self, parser, namespace, values, option_string): + options = {} + for kv in values: + key, val = kv.split('=', maxsplit=1) + options[key] = self.parse_iterable(val) + setattr(namespace, self.dest, options) + + +class SingleAction(DictAction): + """ Argparse action to convert value to tuple or list or nested structure of + list and tuple, i.e 'V1,V2,V3', or with explicit brackets, i.e. '[V1,V2,V3]'. + It also support nested brackets to build list/tuple values. e.g. '[(V1,V2),(V3,V4)]' + """ + + def __call__(self, parser, namespace, value, option_string): + if isinstance(value, str): + setattr(namespace, self.dest, self.parse_iterable(value)) + else: + setattr(namespace, self.dest, value) diff --git a/modelscope/trainers/utils/inference.py b/modelscope/trainers/utils/inference.py index 631d011e..90201d72 100644 --- a/modelscope/trainers/utils/inference.py +++ b/modelscope/trainers/utils/inference.py @@ -15,13 +15,20 @@ from modelscope.utils.torch_utils import (broadcast, get_dist_info, is_master, make_tmp_dir) -def single_gpu_test(trainer, data_loader, device, data_loader_iters=None): +def single_gpu_test(trainer, + data_loader, + device, + metric_classes=None, + vis_closure=None, + data_loader_iters=None): """Test model in EpochBasedTrainer with a single gpu. Args: trainer (modelscope.trainers.EpochBasedTrainer): Trainer to be tested. data_loader (nn.Dataloader): Pytorch data loader. device (str | torch.device): The target device for the data. + metric_classes (List): List of Metric class that uses to collect metrics. + vis_closure (Callable): Collect data for TensorboardHook. data_loader_iters (int): Used when dataset has no attribute __len__ or only load part of dataset. 
Returns: @@ -43,14 +50,10 @@ def single_gpu_test(trainer, data_loader, device, data_loader_iters=None): data_len = data_loader_iters desc = 'Test iterations' - results = [] - data_lists = [] with tqdm(total=data_len, desc=desc) as pbar: for i, data in enumerate(data_loader): data = to_device(data, device) - result = trainer.evaluation_step(data) - results.append(result) - data_lists.append(data) + evaluate_batch(trainer, data, metric_classes, vis_closure) if progress_with_iters: batch_size = 1 # iteration count @@ -71,12 +74,14 @@ def single_gpu_test(trainer, data_loader, device, data_loader_iters=None): if progress_with_iters and (i + 1) >= data_len: break - return results, data_lists + return get_metric_values(metric_classes) def multi_gpu_test(trainer, data_loader, device, + metric_classes=None, + vis_closure=None, tmpdir=None, gpu_collect=False, data_loader_iters_per_gpu=None): @@ -99,8 +104,6 @@ def multi_gpu_test(trainer, Returns: list: The prediction results. """ - results = [] - data_list = [] dataset = data_loader.dataset rank, world_size = get_dist_info() @@ -125,9 +128,8 @@ def multi_gpu_test(trainer, with tqdm(total=data_len, desc=desc) as pbar: for i, data in enumerate(data_loader): data = to_device(data, device) - data_list.append(data) - result = trainer.evaluation_step(data) - results.append(result) + + evaluate_batch(trainer, data, metric_classes, vis_closure) if isinstance(data, Mapping): if 'nsentences' in data: @@ -158,23 +160,44 @@ def multi_gpu_test(trainer, if progress_with_iters and (i + 1) >= data_len: break - # TODO: allgather data list may cost a lot of memory and needs to be redesigned # collect results and data from all ranks if gpu_collect: - results = collect_results_gpu(results, total_samples) - data_list = collect_results_gpu(data_list, total_samples) + metric_classes_list = collect_results_gpu(metric_classes) else: if tmpdir is None: tmpdir = make_tmp_dir() - results = collect_results_cpu(results, total_samples, - os.path.join(tmpdir, 'predict')) - data_list = collect_results_cpu(data_list, total_samples, - os.path.join(tmpdir, 'groundtruth')) + metric_classes_list = collect_results_cpu( + metric_classes, os.path.join(tmpdir, 'metrics')) - return results, data_list + metric_classes = merge_metrics(metric_classes_list) + + return get_metric_values(metric_classes) -def collect_results_cpu(result_part, size, tmpdir=None): +def evaluate_batch(trainer, data, metric_classes, vis_closure): + batch_result = trainer.evaluation_step(data) + + if metric_classes is not None: + for metric_cls in metric_classes: + metric_cls.add(batch_result, data) + + if vis_closure is not None: + # trainer.visualization + vis_closure(batch_result) + + +def get_metric_values(metric_classes): + rank, world_size = get_dist_info() + metric_values = {} + if rank == 0: + for metric_cls in metric_classes: + metric_values.update(metric_cls.evaluate()) + if world_size > 1: + metric_values = broadcast(metric_values, 0) + return metric_values + + +def collect_results_cpu(result_part, tmpdir=None): """Collect results under cpu mode. On cpu mode, this function will save the results on different gpus to @@ -217,18 +240,13 @@ def collect_results_cpu(result_part, size, tmpdir=None): # on a certain gpu could makes the overall outputs empty. 
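        # The parts gathered here are per-rank lists of Metric objects rather than raw
        # predictions; downstream they are combined roughly as follows (sketch, based on
        # the helpers defined in this file):
        #   metric_classes_list = collect_results_cpu(metric_classes, tmpdir)  # one entry per rank
        #   merged = merge_metrics(metric_classes_list)                        # cls_0.merge(cls_i)
        #   metric_values = get_metric_values(merged)                          # evaluate on rank 0, then broadcast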
if part_result: part_list.append(part_result) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] + # remove tmp dir shutil.rmtree(tmpdir) - return ordered_results + return part_list -def collect_results_gpu(result_part, size): +def collect_results_gpu(result_part): """Collect results under gpu mode. On gpu mode, this function will encode results to gpu tensors and use gpu @@ -269,10 +287,16 @@ def collect_results_gpu(result_part, size): # on a certain gpu could makes the overall outputs empty. if part_result: part_list.append(part_result) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - return ordered_results + + return part_list + + +def merge_metrics(metric_classes_list): + if metric_classes_list is None: + return None + + metric_classes_0 = metric_classes_list[0] + for metric_classes_i in metric_classes_list[1:]: + for cls_0, cls_i in zip(metric_classes_0, metric_classes_i): + cls_0.merge(cls_i) + return metric_classes_0 diff --git a/modelscope/utils/ast_utils.py b/modelscope/utils/ast_utils.py index 65218a1c..bf2fb854 100644 --- a/modelscope/utils/ast_utils.py +++ b/modelscope/utils/ast_utils.py @@ -54,6 +54,8 @@ CLASS_NAME = 'class_name' GROUP_KEY = 'group_key' MODULE_NAME = 'module_name' MODULE_CLS = 'module_cls' +TEMPLATE_PATH = 'TEMPLATE_PATH' +TEMPLATE_FILE = 'ast_index_file.py' class AstScaning(object): @@ -286,6 +288,8 @@ class AstScaning(object): for node in nodes: if type(node).__name__ == 'Str': result.append((node.s, None)) + elif type(node).__name__ == 'Constant': + result.append((node.value, None)) else: result.append(_get_attribute_item(node)) return result @@ -611,7 +615,7 @@ class FilesAstScaning(object): file_scanner = FilesAstScaning() -def _save_index(index, file_path, file_list=None): +def _save_index(index, file_path, file_list=None, with_template=False): # convert tuple key to str key index[INDEX_KEY] = {str(k): v for k, v in index[INDEX_KEY].items()} index[VERSION_KEY] = __version__ @@ -619,6 +623,9 @@ def _save_index(index, file_path, file_list=None): file_list=file_list) index[MODELSCOPE_PATH_KEY] = MODELSCOPE_PATH.as_posix() json_index = json.dumps(index) + if with_template: + json_index = json_index.replace(MODELSCOPE_PATH.as_posix(), + TEMPLATE_PATH) storage.write(json_index.encode(), file_path) index[INDEX_KEY] = { ast.literal_eval(k): v @@ -626,8 +633,11 @@ def _save_index(index, file_path, file_list=None): } -def _load_index(file_path): +def _load_index(file_path, with_template=False): bytes_index = storage.read(file_path) + if with_template: + bytes_index = bytes_index.decode().replace(TEMPLATE_PATH, + MODELSCOPE_PATH.as_posix()) wrapped_index = json.loads(bytes_index) # convert str key to tuple key wrapped_index[INDEX_KEY] = { @@ -733,14 +743,21 @@ def load_index( if full_index_flag: if force_rebuild: - logger.info('Force rebuilding ast index') + logger.info('Force rebuilding ast index from scanning every file!') + index = file_scanner.get_files_scan_results(file_list) else: logger.info( - f'No valid ast index found from {file_path}, rebuilding ast index!' + f'No valid ast index found from {file_path}, generating ast index from prebuilt!' 
) - index = file_scanner.get_files_scan_results(file_list) + index = load_from_prebuilt() + if index is None: + index = file_scanner.get_files_scan_results(file_list) _save_index(index, file_path, file_list) elif local_changed and not full_index_flag: + logger.info( + 'Updating the files for the changes of local files, ' + 'first time updating will take longer time! Please wait till updating done!' + ) _update_index(index, files_mtime) _save_index(index, file_path, file_list) @@ -760,5 +777,28 @@ def check_import_module_avaliable(module_dicts: dict) -> list: return missed_module +def load_from_prebuilt(file_path=None): + if file_path is None: + local_path = p.resolve().parents[0] + file_path = os.path.join(local_path, TEMPLATE_FILE) + if os.path.exists(file_path): + index = _load_index(file_path, with_template=True) + else: + index = None + return index + + +def generate_ast_template(file_path=None, force_rebuild=True): + index = load_index(force_rebuild=force_rebuild) + if file_path is None: + local_path = p.resolve().parents[0] + file_path = os.path.join(local_path, TEMPLATE_FILE) + _save_index(index, file_path, with_template=True) + if not os.path.exists(file_path): + raise Exception( + 'The index file is not create correctly, please double check') + return index + + if __name__ == '__main__': index = load_index() diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py index 34f68d96..c5fbe8c5 100644 --- a/modelscope/utils/audio/audio_utils.py +++ b/modelscope/utils/audio/audio_utils.py @@ -1,7 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os import re import struct import sys +import tempfile from typing import Union from urllib.parse import urlparse @@ -164,3 +166,101 @@ def load_bytes_from_url(url: str) -> Union[bytes, str]: data = url return data, sample_rate + + +def generate_scp_from_url(url: str, key: str = None): + wav_scp_path = None + raw_inputs = None + # for local wav.scp inputs + if os.path.exists(url) and url.lower().endswith('.scp'): + wav_scp_path = url + return wav_scp_path, raw_inputs + # for local wav file inputs + if os.path.exists(url) and (url.lower().endswith('.wav')): + wav_scp_path = url + return wav_scp_path, raw_inputs + # for wav url, download and generate wav.scp + result = urlparse(url) + if result.scheme is not None and len(result.scheme) > 0: + storage = HTTPStorage() + # bytes + wav_scp_path = storage.read(url) + + return wav_scp_path, raw_inputs + + return wav_scp_path, raw_inputs + + +def generate_text_from_url(url: str): + text_file_path = None + raw_inputs = None + # for text str input + if not os.path.exists(url) and not url.startswith('http'): + raw_inputs = url + return text_file_path, raw_inputs + + # for local txt inputs + if os.path.exists(url) and (url.lower().endswith('.txt') + or url.lower().endswith('.scp')): + text_file_path = url + return text_file_path, raw_inputs + # for url, download and generate txt + result = urlparse(url) + if result.scheme is not None and len(result.scheme) > 0: + storage = HTTPStorage() + data = storage.read(url) + work_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(work_dir): + os.makedirs(work_dir) + text_file_path = os.path.join(work_dir, os.path.basename(url)) + with open(text_file_path, 'wb') as fp: + fp.write(data) + return text_file_path, raw_inputs + + return text_file_path, raw_inputs + + +def generate_scp_for_sv(url: str, key: str = None): + wav_scp_path = None + wav_name = key if key is not None else os.path.basename(url) + # for 
local wav.scp inputs + if os.path.exists(url) and url.lower().endswith('.scp'): + wav_scp_path = url + return wav_scp_path + # for local wav file inputs + if os.path.exists(url) and (url.lower().endswith('.wav') + or url.lower().endswith('.pcm')): + wav_path = url + work_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(work_dir): + os.makedirs(work_dir) + wav_scp_path = os.path.join(work_dir, 'wav.scp') + with open(wav_scp_path, 'w') as ft: + scp_content = '\t'.join([wav_name, wav_path]) + '\n' + ft.writelines(scp_content) + return wav_scp_path + # for wav url, download and generate wav.scp + result = urlparse(url) + if result.scheme is not None and len(result.scheme) > 0: + storage = HTTPStorage() + data = storage.read(url) + work_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(work_dir): + os.makedirs(work_dir) + wav_path = os.path.join(work_dir, os.path.basename(url)) + with open(wav_path, 'wb') as fb: + fb.write(data) + wav_scp_path = os.path.join(work_dir, 'wav.scp') + with open(wav_scp_path, 'w') as ft: + scp_content = '\t'.join([wav_name, wav_path]) + '\n' + ft.writelines(scp_content) + + return wav_scp_path + + +def generate_sv_scp_from_url(url: tuple): + if len(url) != 2: + raise Exception('Speaker Verification needs 2 input wav file!') + audio_scp1 = generate_scp_for_sv(url[0], key='test1') + audio_scp2 = generate_scp_for_sv(url[1], key='test1') + return audio_scp1, audio_scp2 diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index d500348f..4a83b278 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -22,8 +22,10 @@ class CVTasks(object): # human face body related animal_recognition = 'animal-recognition' face_detection = 'face-detection' + face_liveness = 'face-liveness' card_detection = 'card-detection' face_recognition = 'face-recognition' + face_recognition_ood = 'face-recognition-ood' facial_expression_recognition = 'facial-expression-recognition' facial_landmark_confidence = 'facial-landmark-confidence' face_processing_base = 'face-processing-base' @@ -45,10 +47,14 @@ class CVTasks(object): image_object_detection = 'image-object-detection' video_object_detection = 'video-object-detection' + image_fewshot_detection = 'image-fewshot-detection' image_segmentation = 'image-segmentation' semantic_segmentation = 'semantic-segmentation' image_depth_estimation = 'image-depth-estimation' + indoor_layout_estimation = 'indoor-layout-estimation' + video_depth_estimation = 'video-depth-estimation' + panorama_depth_estimation = 'panorama-depth-estimation' portrait_matting = 'portrait-matting' text_driven_segmentation = 'text-driven-segmentation' shop_segmentation = 'shop-segmentation' @@ -56,6 +62,7 @@ class CVTasks(object): face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + image_matching = 'image-matching' crowd_counting = 'crowd-counting' @@ -65,6 +72,7 @@ class CVTasks(object): image_colorization = 'image-colorization' image_color_enhancement = 'image-color-enhancement' image_denoising = 'image-denoising' + image_deblurring = 'image-deblurring' image_portrait_enhancement = 'image-portrait-enhancement' image_inpainting = 'image-inpainting' image_skychange = 'image-skychange' @@ -76,7 +84,7 @@ class CVTasks(object): image_portrait_stylization = 'image-portrait-stylization' image_body_reshaping = 'image-body-reshaping' image_embedding = 'image-embedding' - + image_face_fusion = 'image-face-fusion' 
product_retrieval_embedding = 'product-retrieval-embedding' # video recognition @@ -88,6 +96,7 @@ class CVTasks(object): virtual_try_on = 'virtual-try-on' movie_scene_segmentation = 'movie-scene-segmentation' language_guided_video_summarization = 'language-guided-video-summarization' + vop_retrieval = 'video-text-retrieval' # video segmentation video_object_segmentation = 'video-object-segmentation' @@ -96,12 +105,24 @@ class CVTasks(object): # video editing video_inpainting = 'video-inpainting' + video_frame_interpolation = 'video-frame-interpolation' + video_stabilization = 'video-stabilization' + video_super_resolution = 'video-super-resolution' # reid and tracking video_single_object_tracking = 'video-single-object-tracking' + video_multi_object_tracking = 'video-multi-object-tracking' video_summarization = 'video-summarization' image_reid_person = 'image-reid-person' + # pointcloud task + pointcloud_sceneflow_estimation = 'pointcloud-sceneflow-estimation' + # image multi-view depth estimation + image_multi_view_depth_estimation = 'image-multi-view-depth-estimation' + + # domain specific object detection + domain_specific_object_detection = 'domain-specific-object-detection' + class NLPTasks(object): # nlp tasks @@ -140,6 +161,8 @@ class NLPTasks(object): extractive_summarization = 'extractive-summarization' feature_extraction = 'feature-extraction' translation_evaluation = 'translation-evaluation' + sudoku = 'sudoku' + text2sql = 'text2sql' class AudioTasks(object): @@ -147,9 +170,13 @@ class AudioTasks(object): auto_speech_recognition = 'auto-speech-recognition' text_to_speech = 'text-to-speech' speech_signal_process = 'speech-signal-process' + speech_separation = 'speech-separation' acoustic_echo_cancellation = 'acoustic-echo-cancellation' acoustic_noise_suppression = 'acoustic-noise-suppression' keyword_spotting = 'keyword-spotting' + inverse_text_processing = 'inverse-text-processing' + punctuation = 'punctuation' + speaker_verification = 'speaker-verification' class MultiModalTasks(object): @@ -164,6 +191,9 @@ class MultiModalTasks(object): visual_entailment = 'visual-entailment' video_multi_modal_embedding = 'video-multi-modal-embedding' image_text_retrieval = 'image-text-retrieval' + document_vl_embedding = 'document-vl-embedding' + video_captioning = 'video-captioning' + video_question_answering = 'video-question-answering' class ScienceTasks(object): @@ -278,6 +308,8 @@ class DatasetFormations(enum.Enum): # native modelscope formation that supports, among other things, # multiple files in a dataset native = 2 + # for local meta cache mark + formation_mark_ext = '.formation_mark' DatasetMetaFormats = { @@ -300,6 +332,8 @@ class ModelFile(object): LABEL_MAPPING = 'label_mapping.json' TRAIN_OUTPUT_DIR = 'output' TS_MODEL_FILE = 'model.ts' + YAML_FILE = 'model.yaml' + TOKENIZER_FOLDER = 'tokenizer' class Invoke(object): @@ -307,6 +341,7 @@ class Invoke(object): PRETRAINED = 'from_pretrained' PIPELINE = 'pipeline' TRAINER = 'trainer' + LOCAL_TRAINER = 'local_trainer' PREPROCESSOR = 'preprocessor' @@ -355,6 +390,7 @@ MASTER_MODEL_BRANCH = 'master' DEFAULT_REPOSITORY_REVISION = 'master' DEFAULT_DATASET_REVISION = 'master' DEFAULT_DATASET_NAMESPACE = 'modelscope' +DEFAULT_DATA_ACCELERATION_ENDPOINT = 'https://oss-accelerate.aliyuncs.com' class ModeKeys: @@ -413,5 +449,15 @@ EXTENSIONS_TO_LOAD = { } +class DatasetPathName: + META_NAME = 'meta' + DATA_FILES_NAME = 'data_files' + LOCK_FILE_NAME_ANY = 'any' + LOCK_FILE_NAME_DELIMITER = '-' + + class MetaDataFields: ARGS_BIG_DATA = 
'big_data' + + +DatasetVisibilityMap = {1: 'private', 3: 'internal', 5: 'public'} diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 531889d2..1d18434e 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -3,6 +3,8 @@ import os import cv2 +import matplotlib +import matplotlib.cm as cm import matplotlib.pyplot as plt import numpy as np from PIL import Image @@ -11,7 +13,7 @@ from modelscope.outputs import OutputKeys from modelscope.preprocessors.image import load_image from modelscope.utils import logger as logging -logger = logging.get_logger(__name__) +logger = logging.get_logger() def numpy_to_cv2img(img_array): @@ -483,6 +485,15 @@ def depth_to_color(depth): return depth_color +def show_video_depth_estimation_result(depths, video_save_path): + height, width, layers = depths[0].shape + out = cv2.VideoWriter(video_save_path, cv2.VideoWriter_fourcc(*'MP4V'), 25, + (width, height)) + for (i, img) in enumerate(depths): + out.write(cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_RGB2BGR)) + out.release() + + def masks_visualization(masks, palette): vis_masks = [] for f in range(masks.shape[0]): @@ -490,3 +501,99 @@ def masks_visualization(masks, palette): img_E.putpalette(palette) vis_masks.append(img_E) return vis_masks + + +# This implementation is adopted from LoFTR, +# made public available under the Apache License, Version 2.0, +# at https://github.com/zju3dv/LoFTR + + +def make_matching_figure(img0, + img1, + mkpts0, + mkpts1, + color, + kpts0=None, + kpts1=None, + text=[], + dpi=75, + path=None): + # draw image pair + assert mkpts0.shape[0] == mkpts1.shape[ + 0], f'mkpts0: {mkpts0.shape[0]} v.s. mkpts1: {mkpts1.shape[0]}' + fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi) + axes[0].imshow(img0, cmap='gray') + axes[1].imshow(img1, cmap='gray') + for i in range(2): # clear all frames + axes[i].get_yaxis().set_ticks([]) + axes[i].get_xaxis().set_ticks([]) + for spine in axes[i].spines.values(): + spine.set_visible(False) + plt.tight_layout(pad=1) + + if kpts0 is not None: + assert kpts1 is not None + axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c='w', s=2) + axes[1].scatter(kpts1[:, 0], kpts1[:, 1], c='w', s=2) + + # draw matches + if mkpts0.shape[0] != 0 and mkpts1.shape[0] != 0: + fig.canvas.draw() + transFigure = fig.transFigure.inverted() + fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0)) + fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1)) + fig.lines = [ + matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]), + (fkpts0[i, 1], fkpts1[i, 1]), + transform=fig.transFigure, + c=color[i], + linewidth=1) for i in range(len(mkpts0)) + ] + + axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4) + axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4) + + # put txts + txt_color = 'k' if img0[:100, :200].mean() > 200 else 'w' + fig.text( + 0.01, + 0.99, + '\n'.join(text), + transform=fig.axes[0].transAxes, + fontsize=15, + va='top', + ha='left', + color=txt_color) + + # save or return figure + if path: + plt.savefig(str(path), bbox_inches='tight', pad_inches=0) + plt.close() + else: + return fig + + +def match_pair_visualization(img_name0, + img_name1, + kpts0, + kpts1, + conf, + output_filename='quadtree_match.png', + method='QuadTreeAttention'): + + print(f'Found {len(kpts0)} matches') + + # visualize the matches + img0 = cv2.imread(str(img_name0)) + img1 = cv2.imread(str(img_name1)) + + # Draw + color = cm.jet(conf) + text = [ + method, + 'Matches: 
{}'.format(len(kpts0)), + ] + fig = make_matching_figure(img0, img1, kpts0, kpts1, color, text=text) + + # save the figure + fig.savefig(str(output_filename), dpi=300, bbox_inches='tight') diff --git a/modelscope/utils/data_utils.py b/modelscope/utils/data_utils.py index 2bc88e19..3a660122 100644 --- a/modelscope/utils/data_utils.py +++ b/modelscope/utils/data_utils.py @@ -3,6 +3,8 @@ from collections.abc import Mapping import torch +from modelscope.outputs import ModelOutputBase + def to_device(batch, device, non_blocking=False): """Put the data to the target cuda device just before the forward function. @@ -13,7 +15,11 @@ def to_device(batch, device, non_blocking=False): Returns: The data to the target device. """ - if isinstance(batch, dict) or isinstance(batch, Mapping): + if isinstance(batch, ModelOutputBase): + for idx in range(len(batch)): + batch[idx] = to_device(batch[idx], device) + return batch + elif isinstance(batch, dict) or isinstance(batch, Mapping): return type(batch)({k: to_device(v, device) for k, v in batch.items()}) elif isinstance(batch, (tuple, list)): return type(batch)(to_device(v, device) for v in batch) diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py index bb29aaa0..fe6c6f93 100644 --- a/modelscope/utils/error.py +++ b/modelscope/utils/error.py @@ -134,3 +134,9 @@ You can install it with pip on linux or mac: Or you can checkout the instructions on the installation page: https://github.com/alibaba/EasyNLP and follow the ones that match your environment. """ + +# docstyle-ignore +MEGATRON_UTIL_IMPORT_ERROR = """ +{0} requires the megatron_util library but it was not found in your environment. You can install it with pip: +`pip install megatron_util -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html` +""" diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index b251d107..9f06d5d6 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -7,7 +7,6 @@ import os import os.path as osp import sys from collections import OrderedDict -from functools import wraps from importlib import import_module from itertools import chain from pathlib import Path @@ -300,6 +299,8 @@ REQUIREMENTS_MAAPING = OrderedDict([ ('deepspeed', (is_package_available('deepspeed'), DEEPSPEED_IMPORT_ERROR)), ('fairseq', (is_package_available('fairseq'), FAIRSEQ_IMPORT_ERROR)), ('fasttext', (is_package_available('fasttext'), FASTTEXT_IMPORT_ERROR)), + ('megatron_util', (is_package_available('megatron_util'), + MEGATRON_UTIL_IMPORT_ERROR)), ]) SYSTEM_PACKAGE = set(['os', 'sys', 'typing']) diff --git a/modelscope/utils/logger.py b/modelscope/utils/logger.py index 6a3c1d6f..17923a6d 100644 --- a/modelscope/utils/logger.py +++ b/modelscope/utils/logger.py @@ -6,6 +6,9 @@ from typing import Optional init_loggers = {} +formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + def get_logger(log_file: Optional[str] = None, log_level: int = logging.INFO, @@ -19,10 +22,12 @@ def get_logger(log_file: Optional[str] = None, file_mode: Specifies the mode to open the file, if filename is specified (if filemode is unspecified, it defaults to 'w'). 
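        Example (sketch, file name is illustrative): a later call that passes log_file
        attaches a file handler to the already-initialized logger instead of being ignored:

            logger = get_logger()                          # console handler only
            logger = get_logger(log_file='./train.log')    # adds a FileHandler on the master process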
""" + logger_name = __name__.split('.')[0] logger = logging.getLogger(logger_name) if logger_name in init_loggers: + add_file_handler_if_needed(logger, log_file, file_mode, log_level) return logger # handle duplicate logs to the console @@ -49,8 +54,6 @@ def get_logger(log_file: Optional[str] = None, file_handler = logging.FileHandler(log_file, file_mode) handlers.append(file_handler) - formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s') for handler in handlers: handler.setFormatter(formatter) handler.setLevel(log_level) @@ -64,3 +67,21 @@ def get_logger(log_file: Optional[str] = None, init_loggers[logger_name] = True return logger + + +def add_file_handler_if_needed(logger, log_file, file_mode, log_level): + for handler in logger.handlers: + if isinstance(handler, logging.FileHandler): + return + + if importlib.util.find_spec('torch') is not None: + from modelscope.utils.torch_utils import is_master + is_worker0 = is_master() + else: + is_worker0 = True + + if is_worker0 and log_file is not None: + file_handler = logging.FileHandler(log_file, file_mode) + file_handler.setFormatter(formatter) + file_handler.setLevel(log_level) + logger.addHandler(file_handler) diff --git a/modelscope/utils/megatron_utils.py b/modelscope/utils/megatron_utils.py new file mode 100644 index 00000000..240572d1 --- /dev/null +++ b/modelscope/utils/megatron_utils.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Optional + +from megatron_util import initialize_megatron + +from modelscope.utils.config import Config +from modelscope.utils.hub import read_config + +_DEFAULT_CFG_WITH_MODEL_TYPE = { + 'gpt-moe': { + 'version': 'moe', + 'world_size': 8 + }, + 'plug': { + 'version': 'v1', + 'world_size': 8, + 'tensor_model_parallel_size': 8, + 'seed': 1234 + }, + 'mglm-text-summarization': { + 'version': 'v1', + 'seed': 1234 + }, +} + + +def init_megatron_util(cfg: Optional[Config] = None, + model_dir: Optional[str] = None, + **kwargs): + assert not (cfg is None and model_dir is None), \ + 'cfg and model_dir cannot both be None when initializing megatron_util' + if cfg is None: + cfg = read_config(model_dir) + try: + megatron_cfg = cfg.megatron + except AttributeError: + try: + model_type = cfg.model.type + except AttributeError: + # Fit models without model type, such as mglm + model_type = cfg.pipeline.type + megatron_cfg = _DEFAULT_CFG_WITH_MODEL_TYPE[model_type] \ + if model_type in _DEFAULT_CFG_WITH_MODEL_TYPE else {} + megatron_cfg.update(kwargs) + initialize_megatron(megatron_cfg) diff --git a/modelscope/utils/metric.py b/modelscope/utils/metric.py new file mode 100644 index 00000000..49b4557c --- /dev/null +++ b/modelscope/utils/metric.py @@ -0,0 +1,98 @@ +# Copyright (c) Megvii Inc. All rights reserved. +# Copyright © Alibaba, Inc. and its affiliates. + +import functools +import os +from collections import defaultdict, deque + +import numpy as np +import torch + +__all__ = [ + 'AverageMeter', + 'MeterBuffer', + 'gpu_mem_usage', +] + + +def gpu_mem_usage(): + """ + Compute the GPU memory usage for the current device (MB). + """ + mem_usage_bytes = torch.cuda.max_memory_allocated() + return mem_usage_bytes / (1024 * 1024) + + +class AverageMeter: + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=50): + self._deque = deque(maxlen=window_size) + self._total = 0.0 + self._count = 0 + + def update(self, value): + self._deque.append(value) + self._count += 1 + self._total += value + + @property + def median(self): + d = np.array(list(self._deque)) + return np.median(d) + + @property + def avg(self): + # if deque is empty, nan will be returned. + d = np.array(list(self._deque)) + return d.mean() + + @property + def global_avg(self): + return self._total / max(self._count, 1e-5) + + @property + def latest(self): + return self._deque[-1] if len(self._deque) > 0 else None + + @property + def total(self): + return self._total + + def reset(self): + self._deque.clear() + self._total = 0.0 + self._count = 0 + + def clear(self): + self._deque.clear() + + +class MeterBuffer(defaultdict): + """Computes and stores the average and current value""" + + def __init__(self, window_size=20): + factory = functools.partial(AverageMeter, window_size=window_size) + super().__init__(factory) + + def reset(self): + for v in self.values(): + v.reset() + + def get_filtered_meter(self, filter_key='time'): + return {k: v for k, v in self.items() if filter_key in k} + + def update(self, values=None, **kwargs): + if values is None: + values = {} + values.update(kwargs) + for k, v in values.items(): + if isinstance(v, torch.Tensor): + v = v.detach() + self[k].update(v) + + def clear_meters(self): + for v in self.values(): + v.clear() diff --git a/modelscope/utils/multi_modal/fp16/__init__.py b/modelscope/utils/multi_modal/fp16/__init__.py deleted file mode 100644 index 81250858..00000000 --- a/modelscope/utils/multi_modal/fp16/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .fp16 import FP16_Module, FP16_Optimizer diff --git a/modelscope/utils/multi_modal/fp16/fp16.py b/modelscope/utils/multi_modal/fp16/fp16.py deleted file mode 100755 index 37a80e65..00000000 --- a/modelscope/utils/multi_modal/fp16/fp16.py +++ /dev/null @@ -1,655 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Stable version of apex FP16 Optimizer""" -import torch -from torch import nn -from torch.autograd import Variable -from torch.nn.parameter import Parameter - -from .fp16util import (master_params_to_model_params, - model_grads_to_master_grads) -from .loss_scaler import DynamicLossScaler, LossScaler - -FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor) -HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor) - - -def conversion_helper(val, conversion): - """Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure.""" - if not isinstance(val, (tuple, list)): - return conversion(val) - rtn = [conversion_helper(v, conversion) for v in val] - if isinstance(val, tuple): - rtn = tuple(rtn) - return rtn - - -def fp32_to_fp16(val): - """Convert fp32 `val` to fp16""" - - def half_conversion(val): - val_typecheck = val - if isinstance(val_typecheck, (Parameter, Variable)): - val_typecheck = val.data - if isinstance(val_typecheck, FLOAT_TYPES): - val = val.half() - return val - - return conversion_helper(val, half_conversion) - - -def fp16_to_fp32(val): - """Convert fp16 `val` to fp32""" - - def float_conversion(val): - val_typecheck = val - if isinstance(val_typecheck, (Parameter, Variable)): - val_typecheck = val.data - if isinstance(val_typecheck, HALF_TYPES): - val = val.float() - return val - - return conversion_helper(val, float_conversion) - - -class FP16_Module(nn.Module): - - def __init__(self, module): - super(FP16_Module, self).__init__() - self.add_module('module', module.half()) - - def forward(self, *inputs, **kwargs): - return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs)) - - def state_dict(self, destination=None, prefix='', keep_vars=False): - return self.module.state_dict(destination, prefix, keep_vars) - - def load_state_dict(self, state_dict, strict=True): - self.module.load_state_dict(state_dict, strict=strict) - - -class FP16_Optimizer(object): - """ - :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer, - and manage static or dynamic loss scaling and master weights in a manner transparent to the user. - For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance, - and changing the call to ``backward``. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - # Name the FP16_Optimizer instance to replace the existing optimizer - # (recommended but not required): - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - # loss.backward() becomes: - optimizer.backward(loss) - ... - - Example with dynamic loss scaling:: - - ... - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - # optional arg to control dynamic loss scaling behavior - # dynamic_loss_args={'scale_window' : 500}) - # Usually, dynamic_loss_args is not necessary. - - Args: - init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`. # noqa - static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. 
Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate. # noqa - dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option. # noqa - dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used. # noqa - verbose (bool, optional, default=True): By default, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. If this becomes annoying (e.g. for large models), it can be disabled by passing ``verbose=False``. ``verbose=False`` will not disable printing when the loss scale is readjusted during dynamic loss scaling. # noqa - - ``init_optimizer`` is expected to have been constructed in the ordinary way. - It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be - named to replace ``init_optimizer``, for two reasons: - First, it means that references to the same name - later in the file will not have to change. - Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to - modify ``init_optimizer``. If you do choose a unique name for the new - :class:`FP16_Optimizer` instance, you should only work with this new instance, - because the preexisting optimizer might no longer behave as expected. - - ``init_optimizer`` may be any Pytorch optimizer. - It may contain a mixture of fp16 and fp32 parameters organized into any number of - ``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will - ingest these ``param_groups`` and remember them. - - Calls to :: - - loss.backward() - - must be replaced with :: - - optimizer.backward(loss) - - because :class:`FP16_Optimizer` requires ownership of the backward pass to implement - loss scaling and copies to master gradients. - - .. note:: - Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients - are downscaled before being applied. This means that adjusting the loss scale, or using - dynamic loss scaling, should not require retuning the learning rate or any other - hyperparameters. - - - **Advanced options** - - **Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure. - See docstring for :attr:`step`. - - **Gradient clipping**: Use :attr:`clip_master_grads`. - - **Multiple losses**: If your model accumulates gradients from multiple losses, - this can be made more efficient by supplying ``update_master_grads=False`` - to :attr:`backward`. See docstring for :attr:`backward`. - - **Manually adjusting loss scale**: The current loss scale can be retrieved or set via :: - - print(optimizer.loss_scale) - optimizer.loss_scale = new_loss_scale - - For static loss scaling, manually adjusting the loss scale over time is a reasonable - thing to do. During later epochs, gradients may become smaller, and a - higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss - scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting - the loss scale is not recommended. 
- - **Multi_GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in - Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer` - should still work as intended. - """ - - def __init__(self, - init_optimizer, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=False): - if not torch.cuda.is_available: - raise SystemError('Cannot use fp16 without CUDA.') - - self.verbose = verbose - - self.optimizer = init_optimizer - # init_state_dict sets up an alternative way to cast per-param state tensors. - # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary. - # init_state_dict = init_optimizer.state_dict() - - self.fp16_groups = [] - self.fp32_from_fp16_groups = [] - self.fp32_from_fp32_groups = [] - for i, param_group in enumerate(self.optimizer.param_groups): - self.maybe_print( - 'FP16_Optimizer processing param group {}:'.format(i)) - fp16_params_this_group = [] - fp32_params_this_group = [] - fp32_from_fp16_params_this_group = [] - for i, param in enumerate(param_group['params']): - if param.requires_grad: - if param.type() == 'torch.cuda.HalfTensor': - self.maybe_print( - 'FP16_Optimizer received torch.cuda.HalfTensor with {}' - .format(param.size())) - fp16_params_this_group.append(param) - master_param = param.detach().clone().float() - master_param.requires_grad = True - # Copythe model parallel flag. - master_param.model_parallel = param.model_parallel - param_group['params'][i] = master_param - fp32_from_fp16_params_this_group.append(master_param) - # Reset existing state dict key to the new master param. - # We still need to recast per-param state tensors, if any, to FP32. - if param in self.optimizer.state: - self.optimizer.state[ - master_param] = self.optimizer.state.pop(param) - elif param.type() == 'torch.cuda.FloatTensor': - self.maybe_print( - 'FP16_Optimizer received torch.cuda.FloatTensor with {}' - .format(param.size())) - fp32_params_this_group.append(param) - param_group['params'][i] = param - else: - raise TypeError( - 'Wrapped parameters must be either ' - 'torch.cuda.FloatTensor or torch.cuda.HalfTensor. ' - 'Received {}'.format(param.type())) - - self.fp16_groups.append(fp16_params_this_group) - self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group) - self.fp32_from_fp32_groups.append(fp32_params_this_group) - - # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors - self.optimizer.load_state_dict(self.optimizer.state_dict()) - # alternative way to cast per-param state tensors: - # self.optimizer.load_state_dict(init_state_dict) - - if dynamic_loss_scale: - self.dynamic_loss_scale = True - if dynamic_loss_args is not None: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - else: - self.loss_scaler = DynamicLossScaler() - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(static_loss_scale) - - self.overflow = False - self.first_closure_call_this_step = True - - self.clip_grad_norm = nn.utils.clip_grad.clip_grad_norm_ - - def maybe_print(self, msg): - if self.verbose: - print(msg) - - def __getstate__(self): - raise RuntimeError( - 'FP16_Optimizer should be serialized using state_dict().') - - def __setstate__(self, state): - raise RuntimeError( - 'FP16_Optimizer should be deserialized using load_state_dict().') - - def zero_grad(self, set_grads_to_None=False): - """ - Zero fp32 and fp16 parameter grads. 
- """ - # In principle, only the .grad attributes of the model params need to be zeroed, - # because gradients are copied into the FP32 master params. However, we zero - # all gradients owned by the optimizer, just to be safe: - for group in self.optimizer.param_groups: - for p in group['params']: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - # Zero fp16 gradients owned by the model: - for fp16_group in self.fp16_groups: - for param in fp16_group: - if set_grads_to_None: - param.grad = None - else: - if param.grad is not None: - param.grad.detach_( - ) # as in torch.optim.optimizer.zero_grad() - param.grad.zero_() - - def _check_overflow(self): - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - for group in self.fp32_from_fp32_groups: - for param in group: - params.append(param) - self.overflow = self.loss_scaler.has_overflow(params) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - def _master_params_to_model_params(self): - for fp16_group, fp32_from_fp16_group in zip( - self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp16_group, fp32_from_fp16_group) - - def _model_params_to_master_params(self): - for fp16_group, fp32_from_fp16_group in zip( - self.fp16_groups, self.fp32_from_fp16_groups): - master_params_to_model_params(fp32_from_fp16_group, fp16_group) - - # To consider: Integrate distributed with this wrapper by registering a hook on each variable - # that does the overflow check, gradient copy + downscale, and fp32 allreduce in a different stream. - def _model_grads_to_master_grads(self): - for fp16_group, fp32_from_fp16_group in zip( - self.fp16_groups, self.fp32_from_fp16_groups): - model_grads_to_master_grads(fp16_group, fp32_from_fp16_group) - - def _downscale_master(self): - if self.loss_scale != 1.0: - for group in self.optimizer.param_groups: - for param in group['params']: - if param.grad is not None: - param.grad.data.mul_(1. / self.loss_scale) - - def clip_master_grads(self, max_norm, norm_type=2): - """ - Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``. - - Args: - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the current fp32 gradients (viewed as a single vector). - - .. warning:: - Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``). # noqa - """ - if not self.overflow: - fp32_params = [] - for param_group in self.optimizer.param_groups: - for param in param_group['params']: - fp32_params.append(param) - return self.clip_grad_norm(fp32_params, max_norm, norm_type) - else: - return -1 - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
- Example:: - - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict[ - 'first_closure_call_this_step'] = self.first_closure_call_this_step - state_dict['optimizer_state_dict'] = self.optimizer.state_dict() - state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups - return state_dict - - def load_state_dict(self, state_dict): - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - - Example:: - - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict['loss_scaler'] - self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] - self.overflow = state_dict['overflow'] - self.first_closure_call_this_step = state_dict[ - 'first_closure_call_this_step'] - self.optimizer.load_state_dict(state_dict['optimizer_state_dict']) - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 2. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - for current_group, saved_group in zip(self.fp32_from_fp16_groups, - state_dict['fp32_from_fp16']): - for current, saved in zip(current_group, saved_group): - current.data.copy_(saved.data) - - def step(self, closure=None): # could add clip option. - """ - If no closure is supplied, :attr:`step` should be called after - ``fp16_optimizer_obj.backward(loss)``. - :attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to - :class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params - originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run - another forward pass using their model. - - If a closure is supplied, :attr:`step` may be called without a prior call to - :attr:`backward(loss)`. - This control flow is identical to `ordinary Pytorch optimizer use`_ with closures. 
- However, the user should take care that any ``loss.backward()`` call within the closure - has been replaced by ``fp16_optimizer_obj.backward(loss)``. - - Args: - closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss. # noqa - - Example with closure:: - - # optimizer is assumed to be an FP16_Optimizer object, previously constructed from an - # existing pytorch optimizer. - for input, target in dataset: - def closure(): - optimizer.zero_grad() - output = model(input) - loss = loss_fn(output, target) - # loss.backward() becomes: - optimizer.backward(loss) - return loss - optimizer.step(closure) - - .. warning:: - Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling. - - .. _`ordinary Pytorch optimizer use`: - http://pytorch.org/docs/master/optim.html#optimizer-step-closure - """ - - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - - if self.overflow: - self.maybe_print( - 'OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}' - .format(scale, self.loss_scale)) - return - - if closure is not None: - retval = self._step_with_closure(closure) - else: - retval = self.optimizer.step() - - self._master_params_to_model_params() - - return retval - - def _step_with_closure(self, closure): - - def wrapped_closure(): - # helpful for debugging - # print("Calling wrapped_closure, first_closure_call_this_step = {}" - # .format(self.first_closure_call_this_step)) - if self.first_closure_call_this_step: - # We expect that the fp16 params are initially fresh on entering self.step(), - # so _master_params_to_model_params() is unnecessary the first time wrapped_closure() - # is called within self.optimizer.step(). - self.first_closure_call_this_step = False - else: - # If self.optimizer.step() internally calls wrapped_closure more than once, - # it may update the fp32 params after each call. However, self.optimizer - # doesn't know about the fp16 params at all. If the fp32 params get updated, - # we can't rely on self.optimizer to refresh the fp16 params. We need - # to handle that manually: - self._master_params_to_model_params() - # Our API expects the user to give us ownership of the backward() call by - # replacing all calls to loss.backward() with optimizer.backward(loss). - # This requirement holds whether or not the call to backward() is made within a closure. - # If the user is properly calling optimizer.backward(loss) within "closure," - # calling closure() here will give the fp32 master params fresh gradients - # for the optimizer to play with, so all wrapped_closure needs to do is call - # closure() and return the loss. - temp_loss = closure() - while (self.overflow): - scale = self.loss_scaler.loss_scale - self._update_scale(self.overflow) - self.maybe_print( - 'OVERFLOW within closure! Skipping step. Attempted loss scale: {}, ' - 'reducing to {}'.format(scale, self.loss_scale)) - temp_loss = closure() - return temp_loss - - retval = self.optimizer.step(wrapped_closure) - - self.first_closure_call_this_step = True - - return retval - - def backward(self, loss, update_master_grads=True, retain_graph=False): - """ - :attr:`backward` performs the following conceptual steps: - - 1. fp32_loss = loss.float() (see first Note below) - 2. scaled_loss = fp32_loss*loss_scale - 3. 
scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined). # noqa - 4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32. # noqa - 5. Finally, master grads are divided by loss_scale. - - In this way, after :attr:`backward`, the master params have fresh gradients, - and :attr:`step` may be called. - - .. note:: - :attr:`backward` internally converts the loss to fp32 before applying the loss scale. - This provides some additional safety against overflow if the user has supplied an - fp16 loss value. - However, for maximum overflow safety, the user should - compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to - :attr:`backward`. - - .. warning:: - The gradients found in a model's leaves after the call to - :attr:`backward` should not be regarded as valid in general, - because it's possible - they have been scaled (and in the case of dynamic loss scaling, - the scale factor may change over time). - If the user wants to inspect gradients after a call to :attr:`backward`, - only the master gradients should be regarded as valid. These can be retrieved via - :attr:`inspect_master_grad_data()`. - - Args: - loss: The loss output by the user's model. loss may be either float or half (but see first Note above). - update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`. # noqa - retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below). # noqa - - Example:: - - # Ordinary operation: - optimizer.backward(loss) - - # Naive operation with multiple losses (technically valid, but less efficient): - # fp32 grads will be correct after the second call, but - # the first call incurs an unnecessary fp16->fp32 grad copy. - optimizer.backward(loss1) - optimizer.backward(loss2) - - # More efficient way to handle multiple losses: - # The fp16->fp32 grad copy is delayed until fp16 grads from all - # losses have been accumulated. - optimizer.backward(loss1, update_master_grads=False) - optimizer.backward(loss2, update_master_grads=False) - optimizer.update_master_grads() - """ - # To consider: try multiple backward passes using retain_grad=True to find - # a loss scale that works. After you find a loss scale that works, do a final dummy - # backward pass with retain_graph=False to tear down the graph. Doing this would avoid - # discarding the iteration, but probably wouldn't improve overall efficiency. - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - if update_master_grads: - self.update_master_grads() - - def update_master_grads(self): - """ - Copy the ``.grad`` attribute from stored references to fp16 parameters to - the ``.grad`` attribute of the fp32 master parameters that are directly - updated by the optimizer. 
:attr:`update_master_grads` only needs to be called if - ``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``. - """ - if self.dynamic_loss_scale: - self._check_overflow() - if self.overflow: return # noqa - self._model_grads_to_master_grads() - self._downscale_master() - - def inspect_master_grad_data(self): - """ - When running with :class:`FP16_Optimizer`, - ``.grad`` attributes of a model's fp16 leaves should not be - regarded as truthful, because they might be scaled. - After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered, - the fp32 master params' ``.grad`` - attributes will contain valid gradients properly divided by the loss scale. However, - because :class:`FP16_Optimizer` flattens some parameters, accessing them may be - nonintuitive. :attr:`inspect_master_grad_data` - allows those gradients to be viewed with shapes corresponding to their associated model leaves. - - Returns: - List of lists (one list for each parameter group). The list for each parameter group - is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group. - """ - if self.overflow: - print( - 'Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. ' - 'Gradients are currently invalid (may be inf, nan, or stale). Returning None.' - ) - return None - else: - # The optimizer owns only references to master params. - master_grads_data = [] - for param_group in self.optimizer.param_groups: - master_grads_this_group = [] - for param in param_group['params']: - if param.grad is not None: - master_grads_this_group.append(param.grad.data) - else: - master_grads_this_group.append(None) - master_grads_data.append(master_grads_this_group) - return master_grads_data - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) diff --git a/modelscope/utils/multi_modal/fp16/fp16util.py b/modelscope/utils/multi_modal/fp16/fp16util.py deleted file mode 100644 index f7ccd167..00000000 --- a/modelscope/utils/multi_modal/fp16/fp16util.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -import torch.nn as nn -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from torch.autograd import Variable - - -class tofp16(nn.Module): - """ - Utility module that implements:: - - def forward(self, input): - return input.half() - """ - - def __init__(self): - super(tofp16, self).__init__() - - def forward(self, input): - return input.half() - - -def BN_convert_float(module): - """ - Utility function for network_to_half(). - - Retained for legacy purposes. - """ - if isinstance( - module, - torch.nn.modules.batchnorm._BatchNorm) and module.affine is True: - module.float() - for child in module.children(): - BN_convert_float(child) - return module - - -def network_to_half(network): - """ - Convert model to half precision in a batchnorm-safe way. - - Retained for legacy purposes. It is recommended to use FP16Model. - """ - return nn.Sequential(tofp16(), BN_convert_float(network.half())) - - -def convert_module(module, dtype): - """ - Converts a module's immediate parameters and buffers to dtype. - """ - for param in module.parameters(recurse=False): - if param is not None: - if param.data.dtype.is_floating_point: - param.data = param.data.to(dtype=dtype) - if param._grad is not None and param._grad.data.dtype.is_floating_point: - param._grad.data = param._grad.data.to(dtype=dtype) - - for buf in module.buffers(recurse=False): - if buf is not None and buf.data.dtype.is_floating_point: - buf.data = buf.data.to(dtype=dtype) - - -def convert_network(network, dtype): - """ - Converts a network's parameters and buffers to dtype. - """ - for module in network.modules(): - if isinstance(module, torch.nn.modules.batchnorm._BatchNorm - ) and module.affine is True: - continue - convert_module(module, dtype) - return network - - -class FP16Model(nn.Module): - """ - Convert model to half precision in a batchnorm-safe way. - """ - - def __init__(self, network): - super(FP16Model, self).__init__() - self.network = convert_network(network, dtype=torch.half) - - def forward(self, *inputs): - inputs = tuple(t.half() for t in inputs) - return self.network(*inputs) - - -def backwards_debug_hook(grad): - raise RuntimeError( - 'master_params recieved a gradient in the backward pass!') - - -def prep_param_lists(model, flat_master=False): - """ - Creates a list of FP32 master parameters for a given model, as in - `Training Neural Networks with Mixed Precision: Real Examples`_. - - Args: - model (torch.nn.Module): Existing Pytorch model - flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization. # noqa - Returns: - A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master gradients. If ``flat_master=True``, ``master_params`` will be a list with one element. # noqa - - Example:: - - model_params, master_params = prep_param_lists(model) - - .. warning:: - Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`. # noqa - - .. 
_`Training Neural Networks with Mixed Precision: Real Examples`: - https://www.nvidia.com/en-us/on-demand/session/gtcsiliconvalley2018-s81012/ - """ - model_params = [ - param for param in model.parameters() if param.requires_grad - ] - - if flat_master: - # Give the user some more useful error messages - try: - # flatten_dense_tensors returns a contiguous flat array. - # http://pytorch.org/docs/master/_modules/torch/_utils.html - master_params = _flatten_dense_tensors( - [param.data for param in model_params]).float() - except: # noqa - print( - 'Error in prep_param_lists: model may contain a mixture of parameters ' - 'of different types. Use flat_master=False, or use F16_Optimizer.' - ) - raise - master_params = torch.nn.Parameter(master_params) - master_params.requires_grad = True - # master_params.register_hook(backwards_debug_hook) - if master_params.grad is None: - master_params.grad = master_params.new(*master_params.size()) - return model_params, [master_params] - else: - master_params = [ - param.clone().float().detach() for param in model_params - ] - for param in master_params: - param.requires_grad = True - return model_params, master_params - - -def model_grads_to_master_grads(model_params, - master_params, - flat_master=False): - """ - Copy model gradients to master gradients. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`. # noqa - """ - if flat_master: - # The flattening may incur one more deep copy than is necessary. - master_params[0].grad.data.copy_( - _flatten_dense_tensors([p.grad.data for p in model_params])) - else: - for model, master in zip(model_params, master_params): - if model.grad is not None: - if master.grad is None: - master.grad = Variable( - master.data.new(*master.data.size())) - master.grad.data.copy_(model.grad.data) - else: - master.grad = None - - -def master_params_to_model_params(model_params, - master_params, - flat_master=False): - """ - Copy master parameters to model parameters. - - Args: - model_params: List of model parameters created by :func:`prep_param_lists`. - master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`. # noqa - """ - if flat_master: - for model, master in zip( - model_params, - _unflatten_dense_tensors(master_params[0].data, model_params)): - model.data.copy_(master) - else: - for model, master in zip(model_params, master_params): - model.data.copy_(master.data) - - -# Backward compatibility fixes - - -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) diff --git a/modelscope/utils/multi_modal/fp16/loss_scaler.py b/modelscope/utils/multi_modal/fp16/loss_scaler.py deleted file mode 100755 index fc55a4ed..00000000 --- a/modelscope/utils/multi_modal/fp16/loss_scaler.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - - -# item() is a recent addition, so this helps with backward compatibility. -def to_python_float(t): - if hasattr(t, 'item'): - return t.item() - else: - return t[0] - - -class LossScaler: - """ - Class that manages a static loss scale. This class is intended to interact with - :class:`FP16_Optimizer`, and should not be directly manipulated by the user. - - Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to - :class:`FP16_Optimizer`'s constructor. - - Args: - scale (float, optional, default=1.0): The loss scale. - """ - - def __init__(self, scale=1): - self.cur_scale = scale - - # `params` is a list / generator of torch.Variable - def has_overflow(self, params): - return False - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - return False - - def update_scale(self, overflow): - pass - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -class DynamicLossScaler: - """ - Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler` - indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of - :class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler` - operates, because the default options can be changed using the - the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor. - - Loss scaling is designed to combat the problem of underflowing gradients encountered at long - times when training fp16 networks. Dynamic loss scaling begins by attempting a very high loss - scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are - encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has - occurred. - :class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch, - and :class:`DynamicLossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients detected, - :class:`DynamicLossScaler` increases the loss scale once more. - In this way :class:`DynamicLossScaler` attempts to "ride the edge" of - always using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.` - scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. # noqa - scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
# noqa - """ - - def __init__(self, - init_scale=2**32, - scale_factor=2., - scale_window=1000, - min_scale=1, - delayed_shift=1, - consecutive_hysteresis=False): - self.cur_scale = init_scale - self.cur_iter = 0 - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - self.min_scale = min_scale - self.delayed_shift = delayed_shift - self.cur_hysteresis = delayed_shift - self.consecutive_hysteresis = consecutive_hysteresis - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params): - for p in params: - if p.grad is not None and DynamicLossScaler._has_inf_or_nan( - p.grad.data): - return True - - return False - - def has_overflow(self, params): - overflow = self.has_overflow_serial(params) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - def _has_inf_or_nan(x): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if 'value cannot be converted' not in instance.args[0]: - raise - return True - else: - if cpu_sum == float( - 'inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - # `overflow` is boolean indicating whether the gradient overflowed - def update_scale(self, overflow): - - if not hasattr(self, 'min_scale'): - self.min_scale = 1 - if not hasattr(self, 'delayed_shift'): - self.delayed_shift = 1 - if not hasattr(self, 'cur_hysteresis'): - self.cur_hysteresis = 1 - if not hasattr(self, 'consecutive_hysteresis'): - self.consecutive_hysteresis = True - if overflow: - # self.cur_scale /= self.scale_factor - if self.delayed_shift == 1 or self.cur_hysteresis == 1: - self.cur_scale = max(self.cur_scale / self.scale_factor, - self.min_scale) - else: - self.cur_hysteresis -= 1 - self.last_overflow_iter = self.cur_iter - else: - if self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - if (self.cur_iter - - self.last_overflow_iter) % self.scale_window == 0: - if not self.consecutive_hysteresis: - self.cur_hysteresis = self.delayed_shift - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - @property - def loss_scale(self): - return self.cur_scale - - def scale_gradient(self, module, grad_in, grad_out): - return tuple(self.loss_scale * g for g in grad_in) - - def backward(self, loss, retain_graph=False): - scaled_loss = loss * self.loss_scale - scaled_loss.backward(retain_graph=retain_graph) - - -############################################################## -# Example usage below here -- assuming it's in a separate file -############################################################## -""" -TO-DO separate out into an example. -if __name__ == "__main__": - import torch - from torch.autograd import Variable - from dynamic_loss_scaler import DynamicLossScaler - - # N is batch size; D_in is input dimension; - # H is hidden dimension; D_out is output dimension. 
- N, D_in, H, D_out = 64, 1000, 100, 10 - - # Create random Tensors to hold inputs and outputs, and wrap them in Variables. - x = Variable(torch.randn(N, D_in), requires_grad=False) - y = Variable(torch.randn(N, D_out), requires_grad=False) - - w1 = Variable(torch.randn(D_in, H), requires_grad=True) - w2 = Variable(torch.randn(H, D_out), requires_grad=True) - parameters = [w1, w2] - - learning_rate = 1e-6 - optimizer = torch.optim.SGD(parameters, lr=learning_rate) - loss_scaler = DynamicLossScaler() - - for t in range(500): - y_pred = x.mm(w1).clamp(min=0).mm(w2) - loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale - print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale)) - print('Iter {} scaled loss: {}'.format(t, loss.data[0])) - print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale)) - - # Run backprop - optimizer.zero_grad() - loss.backward() - - # Check for overflow - has_overflow = DynamicLossScaler.has_overflow(parameters) - - # If no overflow, unscale grad and update as usual - if not has_overflow: - for param in parameters: - param.grad.data.mul_(1. / loss_scaler.loss_scale) - optimizer.step() - # Otherwise, don't do anything -- ie, skip iteration - else: - print('OVERFLOW!') - - # Update loss scale for next iteration - loss_scaler.update_scale(has_overflow) - -""" diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py index 3dcb5f71..794a9084 100755 --- a/modelscope/utils/nlp/distributed.py +++ b/modelscope/utils/nlp/distributed.py @@ -14,35 +14,14 @@ # limitations under the License. import math -import os import torch import torch.distributed as dist -from megatron import mpu +from megatron_util import mpu from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.autograd import Variable from torch.nn.modules import Module -from modelscope.utils.torch_utils import init_dist - - -def initialize_distributed(rank, mpu, world_size, model_parallel_size, - master_ip, master_port): - """Initialize torch.distributed.""" - # Manually set the device ids. - device = rank % torch.cuda.device_count() - torch.cuda.set_device(device) - # Call the init process - init_method = 'tcp://' - init_method += master_ip + ':' + master_port - torch.distributed.init_process_group( - backend='nccl', - world_size=int(os.getenv('WORLD_SIZE', world_size)), - rank=rank, - init_method=init_method) - # Set the model-parallel communicators. - mpu.initialize_model_parallel(model_parallel_size) - def normal_init_method(mean, std): @@ -70,7 +49,7 @@ class DistributedDataParallel(Module): self.module = module self.data_parallel_group = mpu.get_data_parallel_group() - src_rank = mpu.get_model_parallel_rank() + src_rank = mpu.get_tensor_model_parallel_rank() for p in self.module.parameters(): if torch.is_tensor(p): dist.broadcast(p, src_rank, group=self.data_parallel_group) diff --git a/modelscope/utils/plugins.py b/modelscope/utils/plugins.py new file mode 100644 index 00000000..6c2f2975 --- /dev/null +++ b/modelscope/utils/plugins.py @@ -0,0 +1,215 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+# This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +import importlib +import os +import pkgutil +import sys +from contextlib import contextmanager +from fnmatch import fnmatch +from pathlib import Path +from typing import Iterable, List, Optional, Set + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +LOCAL_PLUGINS_FILENAME = '.modelscope_plugins' +GLOBAL_PLUGINS_FILENAME = os.path.join(Path.home(), '.modelscope', 'plugins') +DEFAULT_PLUGINS = [] + + +@contextmanager +def pushd(new_dir: str, verbose: bool = False): + """ + Changes the current directory to the given path and prepends it to `sys.path`. + This method is intended to use with `with`, so after its usage, the current + directory will be set to the previous value. + """ + previous_dir = os.getcwd() + if verbose: + logger.info(f'Changing directory to {new_dir}') # type: ignore + os.chdir(new_dir) + try: + yield + finally: + if verbose: + logger.info(f'Changing directory back to {previous_dir}') + os.chdir(previous_dir) + + +@contextmanager +def push_python_path(path: str): + """ + Prepends the given path to `sys.path`. + This method is intended to use with `with`, so after its usage, its value + will be removed from `sys.path`. + """ + path = Path(path).resolve() + path = str(path) + sys.path.insert(0, path) + try: + yield + finally: + sys.path.remove(path) + + +def discover_file_plugins( + filename: str = LOCAL_PLUGINS_FILENAME) -> Iterable[str]: + """ + Discover plugins from file + """ + with open(filename) as f: + for module_name in f: + module_name = module_name.strip() + if module_name: + yield module_name + + +def discover_plugins() -> Iterable[str]: + """ + Discover plugins + """ + plugins: Set[str] = set() + if os.path.isfile(LOCAL_PLUGINS_FILENAME): + with push_python_path('.'): + for plugin in discover_file_plugins(LOCAL_PLUGINS_FILENAME): + if plugin in plugins: + continue + yield plugin + plugins.add(plugin) + if os.path.isfile(GLOBAL_PLUGINS_FILENAME): + for plugin in discover_file_plugins(GLOBAL_PLUGINS_FILENAME): + if plugin in plugins: + continue + yield plugin + plugins.add(plugin) + + +def import_all_plugins(plugins: List[str] = None) -> List[str]: + """ + Imports default plugins, input plugins and file discovered plugins. + """ + import_module_and_submodules( + 'modelscope', + include={ + 'modelscope.metrics.builder', + 'modelscope.models.builder', + 'modelscope.pipelines.builder', + 'modelscope.preprocessors.builder', + 'modelscope.trainers.builder', + }, + exclude={ + 'modelscope.metrics.*', + 'modelscope.models.*', + 'modelscope.pipelines.*', + 'modelscope.preprocessors.*', + 'modelscope.trainers.*', + 'modelscope.msdatasets', + 'modelscope.utils', + 'modelscope.exporters', + }) + + imported_plugins: List[str] = [] + + imported_plugins.extend(import_plugins(DEFAULT_PLUGINS)) + imported_plugins.extend(import_plugins(plugins)) + imported_plugins.extend(import_file_plugins()) + + return imported_plugins + + +def import_plugins(plugins: List[str] = None) -> List[str]: + """ + Imports the plugins listed in the arguments. + """ + imported_plugins: List[str] = [] + if plugins is None or len(plugins) == 0: + return imported_plugins + + # Workaround for a presumed Python issue where spawned processes can't find modules in the current directory. 
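The discovery helpers above read one importable module name per line from a local .modelscope_plugins file and a global ~/.modelscope/plugins file. A hedged sketch; my_custom_pipelines is a hypothetical package sitting in the working directory:

from modelscope.utils.plugins import LOCAL_PLUGINS_FILENAME, discover_plugins

# Hypothetical local plugin registry: one importable module name per line.
with open(LOCAL_PLUGINS_FILENAME, 'w') as f:
    f.write('my_custom_pipelines\n')

# Names come from the local file first (with '.' temporarily on sys.path via
# push_python_path), then from the global file, de-duplicated.
print(list(discover_plugins()))  # -> ['my_custom_pipelines']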
+ cwd = os.getcwd() + if cwd not in sys.path: + sys.path.append(cwd) + + for module_name in plugins: + try: + import_module_and_submodules(module_name) + logger.info('Plugin %s available', module_name) + imported_plugins.append(module_name) + except ModuleNotFoundError as e: + logger.error(f'Plugin {module_name} could not be loaded: {e}') + + return imported_plugins + + +def import_file_plugins() -> List[str]: + """ + Imports the plugins found with `discover_plugins()`. + """ + imported_plugins: List[str] = [] + + # Workaround for a presumed Python issue where spawned processes can't find modules in the current directory. + cwd = os.getcwd() + if cwd not in sys.path: + sys.path.append(cwd) + + for module_name in discover_plugins(): + try: + importlib.import_module(module_name) + logger.info('Plugin %s available', module_name) + imported_plugins.append(module_name) + except ModuleNotFoundError as e: + logger.error(f'Plugin {module_name} could not be loaded: {e}') + + return imported_plugins + + +def import_module_and_submodules(package_name: str, + include: Optional[Set[str]] = None, + exclude: Optional[Set[str]] = None) -> None: + """ + Import all public submodules under the given package. + """ + # take care of None + include = include if include else set() + exclude = exclude if exclude else set() + + def fn_in(packge_name: str, pattern_set: Set[str]) -> bool: + for pattern in pattern_set: + if fnmatch(package_name, pattern): + return True + return False + + if not fn_in(package_name, include) and fn_in(package_name, exclude): + return + + importlib.invalidate_caches() + + # For some reason, python doesn't always add this by default to your path, but you pretty much + # always want it when using `--include-package`. And if it's already there, adding it again at + # the end won't hurt anything. + with push_python_path('.'): + # Import at top level + try: + module = importlib.import_module(package_name) + path = getattr(module, '__path__', []) + path_string = '' if not path else path[0] + + # walk_packages only finds immediate children, so need to recurse. + for module_finder, name, _ in pkgutil.walk_packages(path): + # Sometimes when you import third-party libraries that are on your path, + # `pkgutil.walk_packages` returns those too, so we need to skip them. 
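import_plugins and import_file_plugins above append the working directory to sys.path and log, rather than raise, when a module cannot be imported. A minimal sketch with a hypothetical plugin name:

from modelscope.utils.plugins import import_file_plugins, import_plugins

# Explicitly listed plugins (hypothetical name); missing modules are logged
# with logger.error and skipped.
loaded = import_plugins(['my_custom_pipelines'])

# Plus whatever the local/global plugin files declare.
loaded += import_file_plugins()
print(loaded)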
+ if path_string and module_finder.path != path_string: # type: ignore[union-attr] + continue + if name.startswith('_'): + # skip directly importing private subpackages + continue + if name.startswith('test'): + # skip tests + continue + subpackage = f'{package_name}.{name}' + import_module_and_submodules(subpackage, exclude=exclude) + except Exception as e: + logger.warning(f'{package_name} not imported: {str(e)}') + if len(package_name.split('.')) == 1: + raise ModuleNotFoundError('Package not installed') diff --git a/modelscope/utils/regress_test_utils.py b/modelscope/utils/regress_test_utils.py index e7a47214..bae2edac 100644 --- a/modelscope/utils/regress_test_utils.py +++ b/modelscope/utils/regress_test_utils.py @@ -120,8 +120,19 @@ class RegressTool: with open(baseline, 'rb') as f: base = pickle.load(f) - print(f'baseline: {json.dumps(base, cls=NumpyEncoder)}') - print(f'latest : {json.dumps(io_json, cls=NumpyEncoder)}') + class SafeNumpyEncoder(NumpyEncoder): + + def default(self, obj): + try: + return super().default(obj) + except Exception: + print( + f'Type {obj.__class__} cannot be serialized and printed' + ) + return None + + print(f'baseline: {json.dumps(base, cls=SafeNumpyEncoder)}') + print(f'latest : {json.dumps(io_json, cls=SafeNumpyEncoder)}') if not compare_io_and_print(base, io_json, compare_fn, **kwargs): raise ValueError('Result not match!') @@ -519,7 +530,8 @@ def compare_arguments_nested(print_content, arg1, arg2, rtol=1.e-3, - atol=1.e-8): + atol=1.e-8, + ignore_unknown_type=True): type1 = type(arg1) type2 = type(arg2) if type1.__name__ != type2.__name__: @@ -594,7 +606,10 @@ def compare_arguments_nested(print_content, return False return True else: - raise ValueError(f'type not supported: {type1}') + if ignore_unknown_type: + return True + else: + raise ValueError(f'type not supported: {type1}') def compare_io_and_print(baseline_json, io_json, compare_fn=None, **kwargs): diff --git a/modelscope/utils/test_utils.py b/modelscope/utils/test_utils.py index 8ffec100..76759e34 100644 --- a/modelscope/utils/test_utils.py +++ b/modelscope/utils/test_utils.py @@ -15,10 +15,10 @@ from collections import OrderedDict import requests import torch -from datasets.config import TF_AVAILABLE, TORCH_AVAILABLE from torch.utils.data import Dataset -from .torch_utils import _find_free_port +from modelscope.utils.import_utils import is_tf_available, is_torch_available +from modelscope.utils.torch_utils import _find_free_port TEST_LEVEL = 2 TEST_LEVEL_STR = 'TEST_LEVEL' @@ -33,13 +33,13 @@ def test_level(): def require_tf(test_case): - if not TF_AVAILABLE: + if not is_tf_available(): test_case = unittest.skip('test requires TensorFlow')(test_case) return test_case def require_torch(test_case): - if not TORCH_AVAILABLE: + if not is_torch_available(): test_case = unittest.skip('test requires PyTorch')(test_case) return test_case diff --git a/modelscope/utils/timer.py b/modelscope/utils/timer.py new file mode 100644 index 00000000..74251064 --- /dev/null +++ b/modelscope/utils/timer.py @@ -0,0 +1,50 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright © Alibaba, Inc. and its affiliates. + +import datetime +import time + + +class Timer(object): + + def __init__(self): + """Recorder of time consumption. 
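test_utils now gates tests on is_torch_available/is_tf_available instead of the datasets.config flags. A sketch of how the decorators and test_level are typically combined; the test class and method names are hypothetical:

import unittest

from modelscope.utils.test_utils import require_torch, test_level


@require_torch
class MyPipelineTest(unittest.TestCase):

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_forward(self):
        self.assertTrue(True)


if __name__ == '__main__':
    unittest.main()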
+ + """ + self.reset() + + @property + def average_time(self): + return self.total_time / self.calls if self.calls > 0 else 0.0 + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.add(time.time() - self.start_time) + if average: + return self.average_time + else: + return self.diff + + def add(self, time_diff): + self.diff = time_diff + self.total_time += self.diff + self.calls += 1 + + def reset(self): + self.total_time = 0.0 + self.calls = 0 + self.start_time = 0.0 + self.diff = 0.0 + + def avg_time_str(self): + time_str = str(datetime.timedelta(seconds=self.average_time)) + return time_str + + +def get_time_str(time_diff): + time_str = str(datetime.timedelta(seconds=time_diff)) + return time_str diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py index ed1f94c5..7e0a9129 100644 --- a/modelscope/utils/torch_utils.py +++ b/modelscope/utils/torch_utils.py @@ -108,7 +108,7 @@ def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None: def get_dist_info() -> Tuple[int, int]: if is_dist(): try: - from megatron import mpu + from megatron_util import mpu assert mpu.model_parallel_is_initialized() rank = mpu.get_data_parallel_rank() world_size = mpu.get_data_parallel_world_size() @@ -125,6 +125,37 @@ def get_local_rank(): return int(os.environ.get('LOCAL_RANK', 0)) +def get_rank(): + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def get_world_size(): + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def synchronize(): + """ + Helper function to synchronize (barrier) + among all processes when using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + def is_dist(): return dist.is_available() and dist.is_initialized() @@ -203,7 +234,98 @@ def set_random_seed(seed): f'Random seed should be positive, current seed is {seed}') -def set_random_seed_mpu(seed): - from megatron import mpu - set_random_seed(seed) - mpu.model_parallel_cuda_manual_seed(seed) +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. + """ + if dist.get_backend() == 'nccl': + return dist.new_group(backend='gloo') + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + backend = dist.get_backend(group) + assert backend in ['gloo', 'nccl'] + device = torch.device('cpu' if backend == 'gloo' else 'cuda') + + buffer = pickle.dumps(data) + if len(buffer) > 1024**3: + logger.warning( + 'Rank {} trying to all-gather {:.2f} GB of data on device {}'. + format(get_rank(), + len(buffer) / (1024**3), device)) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), 'comm.gather/all_gather must be called from ranks within the group!' 
+ local_size = torch.tensor([tensor.numel()], + dtype=torch.int64, + device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) + for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros((max_size - local_size, ), + dtype=torch.uint8, + device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [ + torch.empty((max_size, ), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list diff --git a/modelscope/version.py b/modelscope/version.py index 316f0745..1f4b62e7 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. 
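# Illustrative usage sketch for the new distributed helpers in
# modelscope/utils/torch_utils.py (all_gather / get_rank / get_world_size).
# In a single-process run all_gather simply returns [data], so the snippet also
# works without torch.distributed being initialized.
from modelscope.utils.torch_utils import all_gather, get_rank, get_world_size

local_metrics = {'rank': get_rank(), 'loss': 0.42}  # any picklable object; values are placeholders
gathered = all_gather(local_metrics)                # one entry per rank
if get_rank() == 0:
    mean_loss = sum(m['loss'] for m in gathered) / get_world_size()
    print(f'gathered {len(gathered)} ranks, mean loss {mean_loss:.3f}')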
-__version__ = '1.1.0' +__version__ = '1.2.0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12' diff --git a/pose_keypoint.jpg b/pose_keypoint.jpg new file mode 100644 index 00000000..550a892f --- /dev/null +++ b/pose_keypoint.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18fbde1e9c681ce8927a7b851b366ac71f13a0ffaba1bec202c49e49216d9d3 +size 191543 diff --git a/requirements/audio.txt b/requirements/audio.txt index 2dd63417..983fd70f 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,6 +1,8 @@ +bitstring easyasr>=0.0.2 espnet==202204 -funasr>=0.1.4 +funasr>=0.1.6 +funtextprocessing>=0.1.1 greenlet>=1.1.2 h5py inflect @@ -11,6 +13,7 @@ librosa lxml matplotlib MinDAEC +mir_eval>=0.7 msgpack>=1.0.4 nara_wpe nltk @@ -29,12 +32,14 @@ pygments>=2.12.0 pysptk>=0.1.15,<0.2.0 pytorch_wavelets PyWavelets>=1.0.0 +rotary_embedding_torch>=0.1.5 scikit-learn SoundFile>0.10 sox +speechbrain>=0.5 torchaudio tqdm traitlets>=5.3.0 -ttsfrd>=0.0.3 +ttsfrd>=0.1.1 unidecode wcwidth>=0.2.5 diff --git a/requirements/cv.txt b/requirements/cv.txt index 43eba7f9..46f4302e 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -12,6 +12,7 @@ imageio>=2.9.0 imageio-ffmpeg>=0.4.2 imgaug>=0.4.0 kornia>=0.5.0 +lap lmdb lpips ml_collections @@ -22,8 +23,10 @@ networkx>=2.5 numba onnxruntime>=1.10 opencv-python -pai-easycv>=0.6.3.9 +pai-easycv>=0.8 pandas +panopticapi +plyfile>=0.7.4 psutil regex scikit-image>=0.19.3 @@ -35,4 +38,6 @@ tf_slim timm>=0.4.9 torchmetrics>=0.6.2 torchvision +ujson +utils videofeatures_clipit>=1.0 diff --git a/requirements/docs.txt b/requirements/docs.txt index f51d1565..3b353835 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,7 +1,7 @@ docutils>=0.16.0 myst_parser recommonmark -sphinx>=4.0.2 +sphinx>=5.3.0 sphinx-book-theme sphinx-copybutton sphinx_markdown_tables diff --git a/requirements/framework.txt b/requirements/framework.txt index abc08cf1..d5b4cefb 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -1,7 +1,6 @@ addict attrs -# version beyond 2.5.2 introduces compatbility issue and is being resolved -datasets<=2.5.2 +datasets>=2.7.0 easydict einops filelock>=3.3.0 diff --git a/requirements/multi-modal.txt b/requirements/multi-modal.txt index 457fe2b0..8a86be8e 100644 --- a/requirements/multi-modal.txt +++ b/requirements/multi-modal.txt @@ -1,9 +1,13 @@ +accelerate +diffusers>=0.11.1 ftfy>=6.0.3 librosa +opencv-python pycocoevalcap>=1.2 pycocotools>=2.0.4 # compatible with taming-transformers-rom1504 pytorch_lightning<=1.7.7 +rapidfuzz # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated rouge_score<=0.0.4 diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 694fc7db..fa926ef5 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -4,6 +4,7 @@ filelock ftfy jieba>=0.42.1 matplotlib +megatron_util nltk pandas # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. 
@@ -11,6 +12,7 @@ protobuf>=3.19.0,<3.21.0 pythainlp pyvi regex +rouge sacremoses>=0.0.41 scikit_learn sentencepiece diff --git a/setup.py b/setup.py index 5dfafefa..210c211c 100644 --- a/setup.py +++ b/setup.py @@ -5,6 +5,7 @@ import shutil import subprocess from setuptools import find_packages, setup +from modelscope.utils.ast_utils import generate_ast_template from modelscope.utils.constant import Fields @@ -168,6 +169,7 @@ def pack_resource(): if __name__ == '__main__': # write_version_py() + generate_ast_template() pack_resource() os.chdir('package') install_requires, deps_link = parse_requirements('requirements.txt') @@ -199,6 +201,9 @@ if __name__ == '__main__': url='https://github.com/modelscope/modelscope', packages=find_packages(exclude=('configs', 'tools', 'demo')), include_package_data=True, + package_data={ + '': ['*.h', '*.cpp', '*.cu'], + }, classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: Apache Software License', diff --git a/tests/export/test_export_sbert_sequence_classification.py b/tests/export/test_export_sbert_sequence_classification.py index 7533732d..dc02cf18 100644 --- a/tests/export/test_export_sbert_sequence_classification.py +++ b/tests/export/test_export_sbert_sequence_classification.py @@ -7,6 +7,7 @@ from collections import OrderedDict from modelscope.exporters import Exporter, TorchModelExporter from modelscope.models import Model +from modelscope.utils.constant import Tasks from modelscope.utils.test_utils import test_level @@ -18,6 +19,7 @@ class TestExportSbertSequenceClassification(unittest.TestCase): if not os.path.exists(self.tmp_dir): os.makedirs(self.tmp_dir) self.model_id = 'damo/nlp_structbert_sentence-similarity_chinese-base' + self.model_id_bert = 'langboat/mengzi-bert-base' def tearDown(self): shutil.rmtree(self.tmp_dir) @@ -33,6 +35,17 @@ class TestExportSbertSequenceClassification(unittest.TestCase): TorchModelExporter.from_model(model).export_torch_script( shape=(2, 256), output_dir=self.tmp_dir)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_export_bert_sequence_classification(self): + model = Model.from_pretrained( + self.model_id_bert, task=Tasks.text_classification) + print( + Exporter.from_model(model).export_onnx( + shape=(2, 256), output_dir=self.tmp_dir)) + print( + TorchModelExporter.from_model(model).export_torch_script( + shape=(2, 256), output_dir=self.tmp_dir)) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_export_outer_module(self): from transformers import BertForSequenceClassification, BertTokenizerFast diff --git a/tests/export/test_export_tf_model.py b/tests/export/test_export_tf_model.py new file mode 100644 index 00000000..723c3d1d --- /dev/null +++ b/tests/export/test_export_tf_model.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
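# Illustrative sketch of the package_data mechanism enabled in the setup.py hunk
# above: the empty-string key applies the glob patterns to every package, so
# .h/.cpp/.cu sources living inside package directories are shipped alongside the
# Python files. The project name here is hypothetical.
from setuptools import find_packages, setup

setup(
    name='example_pkg',   # hypothetical
    packages=find_packages(),
    include_package_data=True,
    package_data={'': ['*.h', '*.cpp', '*.cu']},
)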
+import os +import shutil +import tempfile +import unittest + +import numpy as np +import tensorflow as tf +from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input +from tensorflow.keras.preprocessing import image + +from modelscope.exporters import TfModelExporter +from modelscope.utils.regress_test_utils import compare_arguments_nested +from modelscope.utils.test_utils import test_level + + +class TestExportTfModel(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_export_resnet50(self): + img_path = 'data/test/images/auto_demo.jpg' + img = image.load_img(img_path, target_size=(224, 224)) + x = image.img_to_array(img) + x = np.expand_dims(x, axis=0) + x = preprocess_input(x) + x_t = tf.convert_to_tensor(x) + model = ResNet50(weights='imagenet') + + def call_func(inputs): + return [model.predict(list(inputs.values())[0])] + + output_files = TfModelExporter().export_onnx( + model=model, + dummy_inputs={'input': x_t}, + call_func=call_func, + output_dir=self.tmp_dir) + print(output_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/msdatasets/test_dataset_delete.py b/tests/msdatasets/test_dataset_delete.py index 8b3c2426..1b5ee831 100644 --- a/tests/msdatasets/test_dataset_delete.py +++ b/tests/msdatasets/test_dataset_delete.py @@ -9,7 +9,7 @@ from modelscope.msdatasets import MsDataset from modelscope.utils import logger as logging from modelscope.utils.test_utils import test_level -logger = logging.get_logger(__name__) +logger = logging.get_logger() KEY_EXTRACTED = 'extracted' EXPECTED_MSG = 'success' diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index d91f24d7..2cd910c2 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -12,7 +12,7 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode, ModelFile) from modelscope.utils.test_utils import test_level -logger = logging.get_logger(__name__) +logger = logging.get_logger() KEY_EXTRACTED = 'extracted' diff --git a/tests/msdatasets/test_ms_dataset.py b/tests/msdatasets/test_ms_dataset.py index 81a87398..6cc1dc51 100644 --- a/tests/msdatasets/test_ms_dataset.py +++ b/tests/msdatasets/test_ms_dataset.py @@ -137,6 +137,76 @@ class MsDatasetTest(unittest.TestCase): ) print(next(iter(tf_dataset))) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_streaming_load_coco(self): + small_coco_for_test = MsDataset.load( + dataset_name='EasyCV/small_coco_for_test', + split='train', + use_streaming=True, + download_mode=DownloadMode.FORCE_REDOWNLOAD) + dataset_sample_dict = next(iter(small_coco_for_test)) + print(dataset_sample_dict) + assert dataset_sample_dict.values() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_streaming_load_uni_fold(self): + """Test case for loading large scale datasets.""" + dataset = MsDataset.load( + dataset_name='Uni-Fold-Data', + split='train', + use_streaming=True, + namespace='DPTech') + data_example = next(iter(dataset)) + print(data_example) + assert data_example.values() + + @unittest.skipUnless(test_level() >= 2, 'skip test in current 
test level') + def test_streaming_load_afqmc(self): + """To streaming-load afqmc dataset, which contains train/dev/validation data in meta-files.""" + dataset = MsDataset.load('afqmc', split='test', use_streaming=True) + data_example = next(iter(dataset)) + print(data_example) + assert data_example.values() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_streaming_load_from_hf(self): + """Use stream mode to load dataset from huggingface hub.""" + from modelscope.utils.constant import Hubs + ds_train = MsDataset.load( + 'glue', + subset_name='sst2', + split='train', + hub=Hubs.huggingface, + use_streaming=True) + data_example = next(iter(ds_train)) + print(data_example) + assert data_example.values() + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_streaming_load_img_object(self): + """Test case for iterating PIL object.""" + from PIL.PngImagePlugin import PngImageFile + dataset = MsDataset.load( + dataset_name='SIDD', + subset_name='default', + namespace='huizheng', + split='train', + use_streaming=True) + data_example = next(iter(dataset)) + print(data_example) + assert isinstance(data_example['Noisy Image:FILE:Object'], + PngImageFile) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_to_ms_dataset(self): + """Test case for converting huggingface dataset to `MsDataset` instance.""" + from datasets.load import load_dataset + hf_dataset = load_dataset('beans', split='train', streaming=True) + ms_dataset = MsDataset.to_ms_dataset(hf_dataset) + data_example = next(iter(ms_dataset)) + print(data_example) + assert data_example.values() + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/adaseq_pipelines/__init__.py b/tests/pipelines/adaseq_pipelines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/pipelines/adaseq_pipelines/test_named_entity_recognition.py b/tests/pipelines/adaseq_pipelines/test_named_entity_recognition.py new file mode 100644 index 00000000..4a0af955 --- /dev/null +++ b/tests/pipelines/adaseq_pipelines/test_named_entity_recognition.py @@ -0,0 +1,25 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self): + os.system('pip install adaseq>=0.5.0') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_span_based_ner_pipeline(self): + pipeline_ins = pipeline( + Tasks.named_entity_recognition, + 'damo/nlp_nested-ner_named-entity-recognition_chinese-base-med') + print( + pipeline_ins( + '1、可测量目标: 1周内胸闷缓解。2、下一步诊疗措施:1.心内科护理常规,一级护理,低盐低脂饮食,留陪客。' + '2.予“阿司匹林肠溶片”抗血小板聚集,“呋塞米、螺内酯”利尿减轻心前负荷,“瑞舒伐他汀”调脂稳定斑块,“厄贝沙坦片片”降血压抗心机重构' + )) diff --git a/tests/pipelines/easycv_pipelines/test_panoptic_segmentation_pipeline.py b/tests/pipelines/easycv_pipelines/test_panoptic_segmentation_pipeline.py new file mode 100644 index 00000000..49e01251 --- /dev/null +++ b/tests/pipelines/easycv_pipelines/test_panoptic_segmentation_pipeline.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +import cv2 + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import panoptic_seg_masks_to_image +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class EasyCVPanopticSegmentationPipelineTest(unittest.TestCase, + DemoCompatibilityCheck): + img_path = 'data/test/images/image_semantic_segmentation.jpg' + + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_r50_panoptic-segmentation_cocopan' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_r50(self): + segmentor = pipeline(task=self.task, model=self.model_id) + outputs = segmentor(self.img_path) + draw_img = panoptic_seg_masks_to_image(outputs[OutputKeys.MASKS]) + cv2.imwrite('result.jpg', draw_img) + print('print ' + self.model_id + ' success') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_addr_mgeo.py b/tests/pipelines/test_addr_mgeo.py new file mode 100644 index 00000000..d630b857 --- /dev/null +++ b/tests/pipelines/test_addr_mgeo.py @@ -0,0 +1,126 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.nlp import SbertForSequenceClassification +from modelscope.pipelines import pipeline +from modelscope.pipelines.nlp import TextClassificationPipeline +from modelscope.preprocessors import TextClassificationTransformersPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.regress_test_utils import IgnoreKeyFn, MsRegressTool +from modelscope.utils.test_utils import test_level + + +class MGeoTest(unittest.TestCase, DemoCompatibilityCheck): + + multi_modal_inputs = { + 'source_sentence': ['杭州余杭东方未来学校附近世纪华联商场(金家渡北苑店)'], + 'first_sequence_gis': [[ + [ + 13159, 13295, 13136, 13157, 13158, 13291, 13294, 74505, 74713, + 75387, 75389, 75411 + ], + [3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4], + [3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], # noqa: E126 + [[1254, 1474, 1255, 1476], [1253, 1473, 1256, 1476], + [1247, 1473, 1255, 1480], [1252, 1475, 1253, 1476], + [1253, 1475, 1253, 1476], [1252, 1471, 1254, 1475], + [1254, 1473, 1256, 1475], [1238, 1427, 1339, 1490], + [1238, 1427, 1339, 1490], [1252, 1474, 1255, 1476], + [1252, 1474, 1255, 1476], [1249, 1472, 1255, 1479]], + [[24, 23, 15, 23], [24, 28, 15, 18], [31, 24, 22, 22], + [43, 13, 37, 13], [43, 6, 35, 6], [31, 32, 22, 14], + [19, 30, 9, 16], [24, 30, 15, 16], [24, 30, 15, 16], + [29, 24, 20, 22], [28, 25, 19, 21], [31, 26, 22, 20]], + '120.08802231437534,30.343853313981505' + ]], + 'sentences_to_compare': [ + '良渚街道金家渡北苑42号世纪华联超市(金家渡北苑店)', '金家渡路金家渡中苑南区70幢金家渡中苑70幢', + '金家渡路140-142号附近家家福足道(金家渡店)' + ], + 'second_sequence_gis': + [[[13083, 13081, 13084, 13085, 13131, 13134, 13136, 13147, 13148], + [3, 3, 3, 3, 3, 3, 3, 3, 3], [3, 4, 4, 4, 4, 4, 4, 4, 4], + [[1248, 1477, 1250, 1479], [1248, 1475, 1250, 1476], + [1247, 1478, 1249, 1481], [1249, 1479, 1249, 1480], + [1249, 1476, 1250, 1476], [1250, 1474, 1252, 1478], + [1247, 1473, 1255, 1480], [1250, 1478, 1251, 1479], + [1249, 1478, 1250, 1481]], + [[30, 26, 21, 20], 
[32, 43, 23, 43], [33, 23, 23, 23], + [31, 13, 22, 13], [25, 43, 16, 43], [20, 33, 10, 33], + [26, 29, 17, 17], [18, 21, 8, 21], [26, 23, 17, 23]], + '120.08075205680345,30.34697777462197'], + [[13291, 13159, 13295, 74713, 75387, 75389, 75411], + [3, 3, 3, 4, 4, 4, 4], [3, 4, 4, 4, 4, 4, 4], + [[1252, 1471, 1254, 1475], [1254, 1474, 1255, 1476], + [1253, 1473, 1256, 1476], [1238, 1427, 1339, 1490], + [1252, 1474, 1255, 1476], [1252, 1474, 1255, 1476], + [1249, 1472, 1255, 1479]], + [[28, 28, 19, 18], [22, 16, 12, 16], [23, 24, 13, 22], + [24, 30, 15, 16], [27, 20, 18, 20], [27, 21, 18, 21], + [30, 24, 21, 22]], '120.0872539617001,30.342783672056953'], + [[13291, 13290, 13294, 13295, 13298], [3, 3, 3, 3, 3], + [3, 4, 4, 4, 4], + [[1252, 1471, 1254, 1475], [1253, 1469, 1255, 1472], + [1254, 1473, 1256, 1475], [1253, 1473, 1256, 1476], + [1255, 1467, 1258, 1472]], + [[32, 25, 23, 21], [26, 33, 17, 33], [21, 19, 11, 19], + [25, 21, 16, 21], [21, 33, 11, + 33]], '120.08839673752281,30.34156156893651']] + } + single_modal_inputs = { + 'source_sentence': ['杭州余杭东方未来学校附近世纪华联商场(金家渡北苑店)'], + 'sentences_to_compare': [ + '良渚街道金家渡北苑42号世纪华联超市(金家渡北苑店)', '金家渡路金家渡中苑南区70幢金家渡中苑70幢', + '金家渡路140-142号附近家家福足道(金家渡店)' + ] + } + + pipe_input = [ + [ + Tasks.text_ranking, + 'damo/mgeo_geographic_textual_similarity_rerank_chinese_base', + multi_modal_inputs + ], + [ + Tasks.text_ranking, + 'damo/mgeo_geographic_textual_similarity_rerank_chinese_base', + single_modal_inputs + ], + [ + Tasks.token_classification, + 'damo/mgeo_geographic_elements_tagging_chinese_base', + '浙江省杭州市余杭区阿里巴巴西溪园区' + ], + [ + Tasks.token_classification, + 'damo/mgeo_geographic_composition_analysis_chinese_base', + '浙江省杭州市余杭区阿里巴巴西溪园区' + ], + [ + Tasks.token_classification, + 'damo/mgeo_geographic_where_what_cut_chinese_base', + '浙江省杭州市余杭区阿里巴巴西溪园区' + ], + [ + Tasks.sentence_similarity, + 'damo/mgeo_geographic_entity_alignment_chinese_base', + ('后湖金桥大道绿色新都116—120栋116号(诺雅广告)', '金桥大道46号宏宇·绿色新都120幢') + ], + ] + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + for task, model, inputs in self.pipe_input: + pipeline_ins = pipeline(task=task, model=model) + print(pipeline_ins(input=inputs)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_automatic_speech_recognition.py b/tests/pipelines/test_automatic_speech_recognition.py index 57e0ea5d..dc624f29 100644 --- a/tests/pipelines/test_automatic_speech_recognition.py +++ b/tests/pipelines/test_automatic_speech_recognition.py @@ -61,6 +61,10 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, 'checking_item': OutputKeys.TEXT, 'example': 'dataset_example' }, + 'test_run_with_funasr': { + 'checking_item': OutputKeys.TEXT, + 'example': 'dataset_example' + }, 'dataset_example': { 'Wrd': 49532, # the number of words 'Snt': 5000, # the number of sentences @@ -197,6 +201,16 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, 'damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online', 'wav_path': 'data/test/audios/asr_example_ko.wav' }, + { + 'model_id': + 'damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline', + 'wav_path': 'data/test/audios/asr_example_pt.wav' + }, + { + 'model_id': + 'damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online', + 'wav_path': 'data/test/audios/asr_example_pt.wav' + }, { 
'model_id': 'damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online', @@ -257,6 +271,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, def setUp(self) -> None: self.am_pytorch_model_id = 'damo/speech_paraformer_asr_nat-aishell1-pytorch' self.am_tf_model_id = 'damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1' + self.am_funasr_model_id = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' # this temporary workspace dir will store waveform files self.workspace = os.path.join(os.getcwd(), '.tmp') self.task = Tasks.auto_speech_recognition @@ -315,7 +330,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, audio = audio.tobytes() return audio, fs - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_pcm(self): """run with wav data """ @@ -334,7 +349,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, model_id=self.am_pytorch_model_id, audio_in=audio, sr=sr) self.check_result('test_run_with_pcm_pytorch', rec_result) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_wav(self): """run with single waveform file """ @@ -353,7 +368,7 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, model_id=self.am_pytorch_model_id, audio_in=wav_file_path) self.check_result('test_run_with_wav_pytorch', rec_result) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_url(self): """run with single url file """ @@ -370,6 +385,19 @@ class AutomaticSpeechRecognitionTest(unittest.TestCase, model_id=self.am_pytorch_model_id, audio_in=URL_FILE) self.check_result('test_run_with_url_pytorch', rec_result) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_funasr(self): + """run with single url file using FunASR + """ + + logger.info('Run ASR test with url file (FunASR)...') + + rec_result = self.run_pipeline( + model_id=self.am_funasr_model_id, audio_in=URL_FILE) + self.check_result('test_run_with_funasr', rec_result) + + logger.info('Run ASR test with url file (pytorch)...') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_wav_dataset_pytorch(self): """run with datasets, and audio format is waveform diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py index 6e671d2e..6f73a243 100644 --- a/tests/pipelines/test_body_3d_keypoints.py +++ b/tests/pipelines/test_body_3d_keypoints.py @@ -39,9 +39,7 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): if not cap.isOpened(): raise Exception('modelscope error: %s cannot be decoded by OpenCV.' % (self.test_video)) - pipeline_input = self.test_video - self.pipeline_inference( - body_3d_keypoints, pipeline_input=pipeline_input) + self.pipeline_inference(body_3d_keypoints, pipeline_input=cap) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_demo_compatibility(self): diff --git a/tests/pipelines/test_chinese_stable_diffusion.py b/tests/pipelines/test_chinese_stable_diffusion.py new file mode 100644 index 00000000..acbdb074 --- /dev/null +++ b/tests/pipelines/test_chinese_stable_diffusion.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ChineseStableDiffusionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.text_to_image_synthesis + self.model_id = 'damo/multi-modal_chinese_stable_diffusion_v1.0' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_default(self): + pipe = pipeline(task=self.task, model=self.model_id) + output = pipe('中国山水画') + output['output_img'][0].save('result.png') + print('Image saved to result.png') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_dpmsolver(self): + from diffusers.schedulers import DPMSolverMultistepScheduler + pipe = pipeline(task=self.task, model=self.model_id) + pipe.pipeline.scheduler = DPMSolverMultistepScheduler.from_config( + pipe.pipeline.scheduler.config) + output = pipe('中国山水画') + output['output_img'][0].save('result2.png') + print('Image saved to result2.png') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_ddcolor_image_colorization.py b/tests/pipelines/test_ddcolor_image_colorization.py new file mode 100644 index 00000000..e1876329 --- /dev/null +++ b/tests/pipelines/test_ddcolor_image_colorization.py @@ -0,0 +1,61 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.cv import DDColorImageColorizationPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class DDColorImageColorizationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_colorization + self.model_id = 'damo/cv_ddcolor_image-colorization' + self.test_image = 'data/test/images/audrey_hepburn.jpg' + + def pipeline_inference(self, pipeline: Pipeline, test_image: str): + result = pipeline(test_image) + if result is not None: + cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG]) + print(f'Output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + image_colorization = DDColorImageColorizationPipeline(cache_path) + self.pipeline_inference(image_colorization, self.test_image) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_pretrained(self): + model = Model.from_pretrained(self.model_id) + image_colorization = pipeline( + task=Tasks.image_colorization, model=model) + self.pipeline_inference(image_colorization, self.test_image) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + image_colorization = pipeline( + task=Tasks.image_colorization, model=self.model_id) + self.pipeline_inference(image_colorization, self.test_image) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def 
test_run_with_default_model(self): + image_colorization = pipeline(Tasks.image_colorization) + self.pipeline_inference(image_colorization, self.test_image) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_diffusers_stable_diffusion.py b/tests/pipelines/test_diffusers_stable_diffusion.py new file mode 100644 index 00000000..4ffc4d26 --- /dev/null +++ b/tests/pipelines/test_diffusers_stable_diffusion.py @@ -0,0 +1,28 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class DiffusersStableDiffusionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.text_to_image_synthesis + self.model_id = 'shadescript/stable-diffusion-2-1-dev' + + test_input = 'a photo of an astronaut riding a horse on mars' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run(self): + diffusers_pipeline = pipeline(task=self.task, model=self.model_id) + output = diffusers_pipeline(self.test_input, height=512, width=512) + output['output_img'][0].save('output.png') + print('Image saved to output.png') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_document_vl_embedding.py b/tests/pipelines/test_document_vl_embedding.py new file mode 100644 index 00000000..f8d2d5a3 --- /dev/null +++ b/tests/pipelines/test_document_vl_embedding.py @@ -0,0 +1,60 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os.path as osp +import unittest + +import json + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class DocumentVLEmbeddingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.model_id = 'damo/multi-modal_convnext-roberta-base_vldoc-embedding' + cache_path = snapshot_download(self.model_id) + self.test_image = osp.join(cache_path, 'data/demo.png') + self.test_json = osp.join(cache_path, 'data/demo.json') + self.task = Tasks.document_vl_embedding + + def pipeline_inference(self, pipe: Pipeline): + inp = {'images': [self.test_image], 'ocr_info_paths': [self.test_json]} + result = pipe(inp) + + print('Results of VLDoc: ') + for k, v in result.items(): + print(f'{k}: {v.size()}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + doc_VL_emb_pipeline = pipeline(task=self.task, model=self.model_id) + self.pipeline_inference(doc_VL_emb_pipeline) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + print('test_run_with_model_from_modelhub') + model = Model.from_pretrained(self.model_id) + + doc_VL_emb_pipeline = pipeline(task=self.task, model=model) + self.pipeline_inference(doc_VL_emb_pipeline) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + print('test_run_modelhub_default_model') + # default model: VLDoc + vldoc_doc_VL_emb_pipeline = pipeline(self.task) + self.pipeline_inference(vldoc_doc_VL_emb_pipeline) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_face_liveness_ir.py b/tests/pipelines/test_face_liveness_ir.py new file mode 100644 index 00000000..f307440c --- /dev/null +++ b/tests/pipelines/test_face_liveness_ir.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os.path as osp +import unittest + +import cv2 + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result +from modelscope.utils.test_utils import test_level + + +class FaceLivenessIrTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_manual_face-liveness_flir' + self.img_path = 'data/test/images/face_liveness_ir.jpg' + + def show_result(self, img_path, detection_result): + img = draw_face_detection_no_lm_result(img_path, detection_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + face_detection = pipeline(Tasks.face_liveness, model=self.model_id) + result = face_detection(self.img_path) + self.show_result(self.img_path, result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_default_model(self): + face_detection = pipeline(Tasks.face_liveness) + result = face_detection(self.img_path) + self.show_result(self.img_path, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_face_liveness_rgb.py b/tests/pipelines/test_face_liveness_rgb.py new file mode 100644 index 00000000..40e39e9e --- /dev/null +++ b/tests/pipelines/test_face_liveness_rgb.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_face_detection_no_lm_result +from modelscope.utils.test_utils import test_level + + +class FaceLivenessRgbTest(unittest.TestCase): + + def setUp(self) -> None: + self.model_id = 'damo/cv_manual_face-liveness_flrgb' + self.img_path = 'data/test/images/face_liveness_rgb.png' + + def show_result(self, img_path, detection_result): + img = draw_face_detection_no_lm_result(img_path, detection_result) + cv2.imwrite('result.png', img) + print(f'output written to {osp.abspath("result.png")}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + face_detection = pipeline(Tasks.face_liveness, model=self.model_id) + result = face_detection(self.img_path) + self.show_result(self.img_path, result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_default_model(self): + face_detection = pipeline(Tasks.face_liveness) + result = face_detection(self.img_path) + self.show_result(self.img_path, result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_face_recognition_ood.py b/tests/pipelines/test_face_recognition_ood.py new file mode 100644 index 00000000..06325e3b --- /dev/null +++ b/tests/pipelines/test_face_recognition_ood.py @@ -0,0 +1,44 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class FaceRecognitionOodTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.face_recognition_ood + self.model_id = 'damo/cv_ir_face-recognition-ood_rts' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_face_compare(self): + img1 = 'data/test/images/face_recognition_1.png' + img2 = 'data/test/images/face_recognition_2.png' + + face_recognition = pipeline( + Tasks.face_recognition_ood, model=self.model_id) + result1 = face_recognition(img1) + emb1 = result1[OutputKeys.IMG_EMBEDDING] + score1 = result1[OutputKeys.SCORES][0][0] + + result2 = face_recognition(img2) + emb2 = result2[OutputKeys.IMG_EMBEDDING] + score2 = result2[OutputKeys.SCORES][0][0] + + sim = np.dot(emb1[0], emb2[0]) + print(f'Cos similarity={sim:.3f}, img1:{img1} img2:{img2}') + print(f'OOD score: img1:{score1:.3f} img2:{score2:.3f}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_fill_mask.py b/tests/pipelines/test_fill_mask.py index bc244826..0e427464 100644 --- a/tests/pipelines/test_fill_mask.py +++ b/tests/pipelines/test_fill_mask.py @@ -27,6 +27,7 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): } model_id_veco = 'damo/nlp_veco_fill-mask-large' model_id_bert = 'damo/nlp_bert_fill-mask_chinese-base' + model_id_megatron_bert = 'damo/nlp_megatron_bert_fill_mask_1.3B_test' ori_texts = { 'zh': @@ -158,6 +159,14 @@ class FillMaskTest(unittest.TestCase, DemoCompatibilityCheck): f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' f'{pipeline_ins(self.test_inputs[language])}\n') + # Megatron-Bert + language = 'zh' + pipeline_ins = pipeline( + task=Tasks.fill_mask, model=self.model_id_megatron_bert) + print( + f'\nori_text: {self.ori_texts[language]}\ninput: {self.test_inputs[language]}\npipeline: ' + f'{pipeline_ins(self.test_inputs[language])}\n') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.fill_mask) diff --git a/tests/pipelines/test_general_image_classification.py b/tests/pipelines/test_general_image_classification.py index 7798c399..1bb54260 100644 --- a/tests/pipelines/test_general_image_classification.py +++ b/tests/pipelines/test_general_image_classification.py @@ -5,6 +5,7 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.regress_test_utils import MsRegressTool from modelscope.utils.test_utils import test_level @@ -14,6 +15,7 @@ class GeneralImageClassificationTest(unittest.TestCase, def setUp(self) -> None: self.task = Tasks.image_classification self.model_id = 'damo/cv_vit-base_image-classification_Dailylife-labels' + self.regress_tool = MsRegressTool(baseline=False) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_ImageNet(self): @@ -28,7 +30,10 @@ class GeneralImageClassificationTest(unittest.TestCase, 
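# Companion sketch for FaceRecognitionOodTest above: np.dot of the two embeddings
# equals cosine similarity only if the pipeline returns L2-normalized vectors
# (assumed there); the explicit form below is safe either way.
import numpy as np

def cosine_similarity(a, b) -> float:
    a = np.asarray(a, dtype=np.float32).ravel()
    b = np.asarray(b, dtype=np.float32).ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))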
general_image_classification = pipeline( Tasks.image_classification, model='damo/cv_vit-base_image-classification_Dailylife-labels') - result = general_image_classification('data/test/images/bird.JPEG') + with self.regress_tool.monitor_module_single_forward( + general_image_classification.model, + 'vit_base_image_classification'): + result = general_image_classification('data/test/images/bird.JPEG') print(result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -40,6 +45,32 @@ class GeneralImageClassificationTest(unittest.TestCase, result = nexit_image_classification('data/test/images/bird.JPEG') print(result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_convnext(self): + convnext_image_classification = pipeline( + Tasks.image_classification, + model='damo/cv_convnext-base_image-classification_garbage') + result = convnext_image_classification('data/test/images/banana.jpg') + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_beitv2(self): + beitv2_image_classification = pipeline( + Tasks.image_classification, + model= + 'damo/cv_beitv2-base_image-classification_patch16_224_pt1k_ft22k_in1k' + ) + result = beitv2_image_classification('data/test/images/bird.JPEG') + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_bnext(self): + nexit_image_classification = pipeline( + Tasks.image_classification, + model='damo/cv_bnext-small_image-classification_ImageNet-labels') + result = nexit_image_classification('data/test/images/bird.JPEG') + print(result) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_Dailylife_default(self): general_image_classification = pipeline(Tasks.image_classification) diff --git a/tests/pipelines/test_gpt3_text_generation.py b/tests/pipelines/test_gpt3_text_generation.py index 674e95bb..7f7722b5 100644 --- a/tests/pipelines/test_gpt3_text_generation.py +++ b/tests/pipelines/test_gpt3_text_generation.py @@ -17,12 +17,12 @@ class TextGPT3GenerationTest(unittest.TestCase): self.model_dir_13B = snapshot_download(self.model_id_13B) self.input = '好的' - @unittest.skip('distributed gpt3 1.3B, skipped') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_gpt3_1_3B(self): pipe = pipeline(Tasks.text_generation, model=self.model_id_1_3B) print(pipe(self.input)) - @unittest.skip('distributed gpt3 2.7B, skipped') + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_gpt3_2_7B(self): pipe = pipeline(Tasks.text_generation, model=self.model_id_2_7B) print(pipe(self.input)) diff --git a/tests/pipelines/test_hand_detection.py b/tests/pipelines/test_hand_detection.py index e14d51a2..8a6bbd5a 100644 --- a/tests/pipelines/test_hand_detection.py +++ b/tests/pipelines/test_hand_detection.py @@ -10,7 +10,7 @@ from modelscope.utils.test_utils import test_level class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.task = Tasks.image_object_detection + self.task = Tasks.domain_specific_object_detection self.model_id = 'damo/cv_yolox-pai_hand-detection' @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') diff --git a/tests/pipelines/test_hitea_tasks.py b/tests/pipelines/test_hitea_tasks.py new file mode 100644 index 00000000..50efdfbd --- /dev/null +++ b/tests/pipelines/test_hitea_tasks.py @@ -0,0 +1,64 @@ +# Copyright (c) Alibaba, Inc. 
and its affiliates. +import unittest + +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class HiTeATasksTest(unittest.TestCase, DemoCompatibilityCheck): + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_video_captioning_with_model(self): + model = Model.from_pretrained( + 'damo/multi-modal_hitea_video-captioning_base_en') + pipeline_caption = pipeline( + task=Tasks.video_captioning, + model=model, + ) + video = 'data/test/videos/video_caption_and_qa_test.mp4' + result = pipeline_caption(video) + print(result[OutputKeys.CAPTION]) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_video_captioning_with_name(self): + model = 'damo/multi-modal_hitea_video-captioning_base_en' + pipeline_caption = pipeline( + Tasks.video_captioning, + model=model, + ) + video = 'data/test/videos/video_caption_and_qa_test.mp4' + result = pipeline_caption(video) + print(result[OutputKeys.CAPTION]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_video_question_answering_with_model(self): + model = Model.from_pretrained( + 'damo/multi-modal_hitea_video-question-answering_base_en') + pipeline_vqa = pipeline(Tasks.video_question_answering, model=model) + video = 'data/test/videos/video_caption_and_qa_test.mp4' + text = 'How many people are there?' + input = {'video': video, 'text': text} + result = pipeline_vqa(input) + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_video_question_answering_with_name(self): + model = 'damo/multi-modal_hitea_video-question-answering_base_en' + pipeline_vqa = pipeline(Tasks.video_question_answering, model=model) + video = 'data/test/videos/video_caption_and_qa_test.mp4' + text = 'Who teaches a girl how to paint eggs?' + input = {'video': video, 'text': text} + result = pipeline_vqa(input) + print(result) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_deblur.py b/tests/pipelines/test_image_deblur.py new file mode 100644 index 00000000..476263af --- /dev/null +++ b/tests/pipelines/test_image_deblur.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.cv import ImageDeblurPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_deblurring + self.model_id = 'damo/cv_nafnet_image-deblur_gopro' + + demo_image_path = 'data/test/images/blurry.jpg' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + pipeline = ImageDeblurPipeline(cache_path) + pipeline.group_key = self.task + deblur_img = pipeline( + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = deblur_img.shape[:2] + print('pipeline: the shape of output_img is {}x{}'.format(h, w)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + pipeline_ins = pipeline(task=Tasks.image_deblurring, model=model) + deblur_img = pipeline_ins( + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = deblur_img.shape[:2] + print('pipeline: the shape of output_img is {}x{}'.format(h, w)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.image_deblurring, model=self.model_id) + deblur_img = pipeline_ins( + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = deblur_img.shape[:2] + print('pipeline: the shape of output_img is {}x{}'.format(h, w)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.image_deblurring) + deblur_img = pipeline_ins( + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = deblur_img.shape[:2] + print('pipeline: the shape of output_img is {}x{}'.format(h, w)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_defrcn_fewshot.py b/tests/pipelines/test_image_defrcn_fewshot.py new file mode 100644 index 00000000..4658206a --- /dev/null +++ b/tests/pipelines/test_image_defrcn_fewshot.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import subprocess +import sys +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageDefrcnFewShotTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + logger.info('start install detectron2-0.3') + cmd = [ + sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f', + 'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html' + ] + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + logger.info('install detectron2-0.3 finished') + + self.task = Tasks.image_fewshot_detection + self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn' + self.image = 'data/test/images/image_voc2007_000001.jpg' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + pipeline_defrcn = pipeline(task=self.task, model=model) + print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_defrcn = pipeline(task=self.task, model=self.model_id) + print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS]) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_defrcn = pipeline(task=self.task) + print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS]) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + pipeline_defrcn = pipeline(self.task, model=cache_path) + print(pipeline_defrcn(input=self.image)[OutputKeys.LABELS]) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_face_fusion.py b/tests/pipelines/test_image_face_fusion.py new file mode 100644 index 00000000..fde15edf --- /dev/null +++ b/tests/pipelines/test_image_face_fusion.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +import cv2 + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageFaceFusionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_face_fusion + self.model_id = 'damo/cv_unet-image-face-fusion_damo' + self.template_img = 'data/test/images/facefusion_template.jpg' + self.user_img = 'data/test/images/facefusion_user.jpg' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + snapshot_path = snapshot_download(self.model_id) + print('snapshot_path: {}'.format(snapshot_path)) + image_face_fusion = pipeline( + Tasks.image_face_fusion, model=snapshot_path) + + result = image_face_fusion( + dict(template=self.template_img, user=self.user_img)) + cv2.imwrite('result_facefusion.png', result[OutputKeys.OUTPUT_IMG]) + print('facefusion.test_run_direct_model_download done') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + image_face_fusion = pipeline( + Tasks.image_face_fusion, model=self.model_id) + + result = image_face_fusion( + dict(template=self.template_img, user=self.user_img)) + cv2.imwrite('result_facefusion.png', result[OutputKeys.OUTPUT_IMG]) + print('facefusion.test_run_modelhub done') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + image_face_fusion = pipeline(Tasks.image_face_fusion) + + result = image_face_fusion( + dict(template=self.template_img, user=self.user_img)) + cv2.imwrite('result_facefusion.png', result[OutputKeys.OUTPUT_IMG]) + print('facefusion.test_run_modelhub_default_model done') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_layout_estimation.py b/tests/pipelines/test_image_layout_estimation.py new file mode 100644 index 00000000..b312e8c2 --- /dev/null +++ b/tests/pipelines/test_image_layout_estimation.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import sys +import unittest + +import cv2 + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageLayoutEstimationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.indoor_layout_estimation + self.model_id = 'damo/cv_panovit_indoor-layout-estimation' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_image_layout_estimation(self): + input_location = 'data/test/images/indoor_layout_estimation.png' + estimator = pipeline( + Tasks.indoor_layout_estimation, model=self.model_id) + result = estimator(input_location) + layout = result[OutputKeys.LAYOUT] + cv2.imwrite('layout.jpg', layout) + + print('test_image_layout_estimation DONE') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_matching.py b/tests/pipelines/test_image_matching.py new file mode 100644 index 00000000..55fd56df --- /dev/null +++ b/tests/pipelines/test_image_matching.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest +from pathlib import Path + +import cv2 +import matplotlib.cm as cm +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import match_pair_visualization +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageMatchingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = 'image-matching' + self.model_id = 'damo/cv_quadtree_attention_image-matching_outdoor' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_image_matching(self): + input_location = [[ + 'data/test/images/image_matching1.jpg', + 'data/test/images/image_matching2.jpg' + ]] + estimator = pipeline(Tasks.image_matching, model=self.model_id) + result = estimator(input_location) + kpts0, kpts1, conf = result[0][OutputKeys.MATCHES] + + match_pair_visualization( + input_location[0][0], + input_location[0][1], + kpts0, + kpts1, + conf, + output_filename='quadtree_match.png') + + print('test_image_matching DONE') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_mvs_depth_estimation.py b/tests/pipelines/test_image_mvs_depth_estimation.py new file mode 100644 index 00000000..a7e327e3 --- /dev/null +++ b/tests/pipelines/test_image_mvs_depth_estimation.py @@ -0,0 +1,34 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageMVSDepthEstimationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = 'image-multi-view-depth-estimation' + self.model_id = 'damo/cv_casmvs_multi-view-depth-estimation_general' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_image_mvs_depth_estimation(self): + estimator = pipeline( + Tasks.image_multi_view_depth_estimation, + model='damo/cv_casmvs_multi-view-depth-estimation_general') + model_dir = snapshot_download(self.model_id) + input_location = os.path.join(model_dir, 'test_data') + + result = estimator(input_location) + pcd = result[OutputKeys.OUTPUT] + pcd.write('./pcd_fusion.ply') + print('test_image_mvs_depth_estimation DONE') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_image_portrait_enhancement.py b/tests/pipelines/test_image_portrait_enhancement.py index 1ca97253..f0814c07 100644 --- a/tests/pipelines/test_image_portrait_enhancement.py +++ b/tests/pipelines/test_image_portrait_enhancement.py @@ -18,6 +18,7 @@ class ImagePortraitEnhancementTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.task = Tasks.image_portrait_enhancement self.model_id = 'damo/cv_gpen_image-portrait-enhancement' + self.model_id_hires = 'damo/cv_gpen_image-portrait-enhancement-hires' self.test_image = 'data/test/images/Solvay_conference_1927.png' def pipeline_inference(self, pipeline: Pipeline, test_image: str): @@ -34,6 +35,12 @@ class ImagePortraitEnhancementTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_portrait_enhancement, model=self.model_id) self.pipeline_inference(face_enhancement, self.test_image) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub_hires(self): + face_enhancement = pipeline( + Tasks.image_portrait_enhancement, model=self.model_id_hires) + self.pipeline_inference(face_enhancement, self.test_image) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): face_enhancement = pipeline(Tasks.image_portrait_enhancement) diff --git a/tests/pipelines/test_inverse_text_processing.py b/tests/pipelines/test_inverse_text_processing.py new file mode 100644 index 00000000..dc7fb1e0 --- /dev/null +++ b/tests/pipelines/test_inverse_text_processing.py @@ -0,0 +1,70 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class InverseTextProcessingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.inverse_text_processing + self.model_dict = { + 'en': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-en', + 'de': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-de', + 'es': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-es', + 'fr': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-fr', + 'id': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-id', + 'ko': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-ko', + 'ja': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-ja', + 'pt': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-pt', + 'ru': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-ru', + 'vi': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-vi', + 'tl': + 'damo/speech_inverse_text_processing_fun-text-processing-itn-tl', + } + self.text_in_dict = { + 'en': + 'on december second, we paid one hundred and twenty three dollars for christmas tree.', + 'de': 'einhundertdreiundzwanzig', + 'es': 'ciento veintitrés', + 'fr': 'cent vingt-trois', + 'id': 'seratus dua puluh tiga', + 'ko': '삼백오 독일 마', + 'ja': '百二十三', + 'pt': 'cento e vinte e três', + 'ru': 'сто двадцать три', + 'vi': 'một trăm hai mươi ba', + 'tl': "ika-lima mayo dalawang libo dalawampu't dalawa", + } + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_for_multi_language_itn(self): + for key, value in self.model_dict.items(): + lang = key + model_name = value + itn_inference_pipeline = pipeline( + task=Tasks.inverse_text_processing, model=model_name) + lang_text_in = self.text_in_dict[lang] + itn_result = itn_inference_pipeline(text_in=lang_text_in) + print(itn_result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py index 4822db16..f31d212b 100644 --- a/tests/pipelines/test_key_word_spotting.py +++ b/tests/pipelines/test_key_word_spotting.py @@ -230,23 +230,20 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck): audio = audio.tobytes() return audio - # TODO: recover to test level 0 once issue fixed - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_wav(self): kws_result = self.run_pipeline( model_id=self.model_id, audio_in=POS_WAV_FILE) self.check_result('test_run_with_wav', kws_result) - # TODO: recover to test level 0 once issue fixed - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_pcm(self): audio = self.wav2bytes(os.path.join(os.getcwd(), POS_WAV_FILE)) kws_result = self.run_pipeline(model_id=self.model_id, audio_in=audio) self.check_result('test_run_with_pcm', kws_result) - # TODO: recover to test level 0 once issue fixed - @unittest.skipUnless(test_level() >= 2, 'skip test in
current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_wav_by_customized_keywords(self): keywords = '播放音乐' @@ -257,15 +254,13 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck): self.check_result('test_run_with_wav_by_customized_keywords', kws_result) - # TODO: recover to test level 0 once issue fixed - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_url(self): kws_result = self.run_pipeline( model_id=self.model_id, audio_in=URL_FILE) self.check_result('test_run_with_url', kws_result) - # TODO: recover to test level 1 once issue fixed - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_pos_testsets(self): wav_file_path = download_and_untar( os.path.join(self.workspace, POS_TESTSETS_FILE), POS_TESTSETS_URL, @@ -276,8 +271,7 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck): model_id=self.model_id, audio_in=audio_list) self.check_result('test_run_with_pos_testsets', kws_result) - # TODO: recover to test level 1 once issue fixed - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_neg_testsets(self): wav_file_path = download_and_untar( os.path.join(self.workspace, NEG_TESTSETS_FILE), NEG_TESTSETS_URL, diff --git a/tests/pipelines/test_maskdino_instance_segmentation.py b/tests/pipelines/test_maskdino_instance_segmentation.py new file mode 100644 index 00000000..14e0887d --- /dev/null +++ b/tests/pipelines/test_maskdino_instance_segmentation.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.models.cv.image_instance_segmentation import MaskDINOSwinModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.cv import MaskDINOInstanceSegmentationPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class MaskDINOInstanceSegmentationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_maskdino-swin-l_image-instance-segmentation_coco' + + image = 'data/test/images/image_instance_segmentation.jpg' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.image_segmentation, model=self.model_id) + print(pipeline_ins(input=self.image)[OutputKeys.LABELS]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + pipeline_ins = pipeline( + task=Tasks.image_segmentation, model=model, preprocessor=None) + print(pipeline_ins(input=self.image)[OutputKeys.LABELS]) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + model = MaskDINOSwinModel(cache_path) + pipeline1 = MaskDINOInstanceSegmentationPipeline( + model, preprocessor=None) + pipeline2 = pipeline( + Tasks.image_segmentation, model=model, preprocessor=None) + print(f'pipeline1:{pipeline1(input=self.image)[OutputKeys.LABELS]}') + print(f'pipeline2: {pipeline2(input=self.image)[OutputKeys.LABELS]}') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index 01a00f2a..a7c790ef 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -251,6 +251,10 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): 'damo/nlp_structbert_keyphrase-extraction_base-icassp2023-mug-track4-baseline', 'language': 'zh' }, + { + 'model_id': 'damo/nlp_raner_chunking_english-large', + 'language': 'en' + }, ] def setUp(self) -> None: diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py index e0591496..f1c20f47 100644 --- a/tests/pipelines/test_ocr_detection.py +++ b/tests/pipelines/test_ocr_detection.py @@ -12,7 +12,9 @@ class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo' + self.model_id_vlpt = 'damo/cv_resnet50_ocr-detection-vlpt' self.test_image = 'data/test/images/ocr_detection.jpg' + self.test_image_vlpt = 'data/test/images/ocr_detection_vlpt.jpg' self.task = Tasks.ocr_detection def pipeline_inference(self, pipeline: Pipeline, input_location: str): @@ -25,6 +27,11 @@ class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck): ocr_detection = pipeline(Tasks.ocr_detection, model=self.model_id) self.pipeline_inference(ocr_detection, 
self.test_image) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_vlpt_with_model_from_modelhub(self): + ocr_detection = pipeline(Tasks.ocr_detection, model=self.model_id_vlpt) + self.pipeline_inference(ocr_detection, self.test_image_vlpt) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): ocr_detection = pipeline(Tasks.ocr_detection) diff --git a/tests/pipelines/test_ofa_tasks.py b/tests/pipelines/test_ofa_tasks.py index 6dec2c57..8dc7197d 100644 --- a/tests/pipelines/test_ofa_tasks.py +++ b/tests/pipelines/test_ofa_tasks.py @@ -45,15 +45,16 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = img_captioning('data/test/images/image_captioning.png') print(result[OutputKeys.CAPTION]) - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_image_captioning_batch(self): - img_captioning = pipeline( - Tasks.image_captioning, - model='damo/ofa_image-caption_coco_large_en') + img_captioning.model.num_return_sequences = 2 + result = img_captioning('data/test/images/image_captioning.png') + print(result[OutputKeys.CAPTION]) + + # test batch infer + img_captioning.model.num_return_sequences = 1 results = img_captioning( [{ 'image': 'data/test/images/image_captioning.png' - } for _ in range(6)], + } for _ in range(3)], batch_size=2) for r in results: print(r[OutputKeys.CAPTION]) @@ -65,6 +66,12 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): model='damo/ofa_ocr-recognition_scene_base_zh') result = ocr_recognize('data/test/images/image_ocr_recognition.jpg') print(result[OutputKeys.TEXT]) + # test batch infer + results = ocr_recognize( + ['data/test/images/image_ocr_recognition.jpg' for _ in range(3)], + batch_size=2) + for r in results: + print(r[OutputKeys.TEXT]) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_image_classification_with_model(self): @@ -84,6 +91,12 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = ofa_pipe(image) print(result) + # test batch infer + image = ['data/test/images/image_classification.png' for _ in range(3)] + results = ofa_pipe(image, batch_size=2) + for r in results: + print(r[OutputKeys.LABELS], r[OutputKeys.SCORES]) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_summarization_with_model(self): model = Model.from_pretrained( @@ -104,12 +117,23 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): model='damo/ofa_summarization_gigaword_large_en') text = 'five-time world champion michelle kwan withdrew' + \ 'from the #### us figure skating championships on wednesday ,' + \ - ' but will petition us skating officials for the chance to ' +\ + ' but will petition us skating officials for the chance to ' + \ 'compete at the #### turin olympics .' 
input = {'text': text} result = ofa_pipe(input) print(result) + # test for return multiple sequences + ofa_pipe.model.num_return_sequences = 2 + result = ofa_pipe(input) + print(result) + # test batch infer + ofa_pipe.model.num_return_sequences = 1 + input = [{'text': text} for _ in range(3)] + results = ofa_pipe(input, batch_size=2) + for r in results: + print(r[OutputKeys.TEXT]) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_text_classification_with_model(self): model = Model.from_pretrained( @@ -130,6 +154,11 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): text2 = 'A member of my team will execute your orders with immense precision.' result = ofa_pipe((text, text2)) print(result) + # test batch infer + inputs = [(text, text2) for _ in range(3)] + results = ofa_pipe(inputs, batch_size=2) + for r in results: + print(r[OutputKeys.LABELS], r[OutputKeys.SCORES]) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_visual_entailment_with_model(self): @@ -152,8 +181,13 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): input = {'image': image, 'text': text} result = ofa_pipe(input) print(result) + # test batch infer + input = [{'image': image, 'text': text} for _ in range(3)] + results = ofa_pipe(input, batch_size=2) + for r in results: + print(r[OutputKeys.LABELS], r[OutputKeys.SCORES]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_visual_grounding_with_model(self): model = Model.from_pretrained( 'damo/ofa_visual-grounding_refcoco_large_en') @@ -182,6 +216,9 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): image_name = image.split('/')[-2] self.save_img(image, result[OutputKeys.BOXES][0], osp.join('large_en_name_' + image_name + '.png')) + # test batch infer + result = ofa_pipe([input for _ in range(3)], batch_size=2) + print(result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_visual_grounding_zh_with_name(self): @@ -217,6 +254,10 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = ofa_pipe(input) print(result) + # test batch infer + result = ofa_pipe([input for _ in range(3)], batch_size=2) + print(result) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_image_captioning_distilled_with_model(self): model = Model.from_pretrained( @@ -230,6 +271,9 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): result = img_captioning(image) print(result[OutputKeys.CAPTION]) + # test batch infer + print(img_captioning([image for _ in range(3)], batch_size=2)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_visual_entailment_distilled_model_with_name(self): ofa_pipe = pipeline( @@ -280,6 +324,47 @@ class OfaTasksTest(unittest.TestCase, DemoCompatibilityCheck): example = {'wav': 'data/test/audios/asr_example_ofa.wav'} result = ofa_pipe(example) print(result[OutputKeys.TEXT]) + # test batch infer + result = ofa_pipe([example for _ in range(3)], batch_size=2) + for r in result: + print(r[OutputKeys.TEXT]) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_sudoku_with_name(self): + model = 'damo/ofa_sudoku_kaggle_large' + ofa_pipe = pipeline(Tasks.sudoku, model=model) + # the valid num is 1-9,and use 0 represents the empty block 
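# (illustrative aside, not part of the original test) given the format described in these
# comments, a board held as a hypothetical 9x9 list of lists of ints named `board` could be
# serialized into the kind of string used below with, e.g.:
#     board_str = ' | '.join(' : '.join(str(v) for v in row) for row in board)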
+ # the separator of column is ` : `, and the separator of row is ` | ` + example = '5 : 3 : 0 : 0 : 7 : 0 : 0 : 0 : 0 | \ + 6 : 0 : 0 : 1 : 9 : 5 : 0 : 0 : 0 | \ + 0 : 9 : 8 : 0 : 0 : 0 : 0 : 6 : 0 | \ + 8 : 0 : 0 : 0 : 6 : 0 : 0 : 0 : 3 | \ + 4 : 0 : 0 : 8 : 0 : 3 : 0 : 0 : 1 | \ + 7 : 0 : 0 : 0 : 2 : 0 : 0 : 0 : 6 | \ + 0 : 6 : 0 : 0 : 0 : 0 : 2 : 8 : 0 | \ + 0 : 0 : 0 : 4 : 1 : 9 : 0 : 0 : 5 | \ + 0 : 0 : 0 : 0 : 8 : 0 : 0 : 7 : 9' + + result = ofa_pipe(example) + print(result[OutputKeys.TEXT]) + # test batch infer + result = ofa_pipe([example for _ in range(3)], batch_size=2) + for r in result: + print(r[OutputKeys.TEXT]) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_text2sql_with_name(self): + model = 'damo/ofa_text2sql_spider_large_en' + ofa_pipe = pipeline(Tasks.text2sql, model=model) + text = 'Show all book categories and the number of books in each category.' + database = 'culture_company' # optional, default `culture_company` + example = {'text': text, 'database': database} + result = ofa_pipe(example) + print(result[OutputKeys.TEXT]) + # test batch infer + result = ofa_pipe([example for _ in range(3)], batch_size=2) + for r in result: + print(r[OutputKeys.TEXT]) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): diff --git a/tests/pipelines/test_panorama_depth_estimation.py b/tests/pipelines/test_panorama_depth_estimation.py new file mode 100644 index 00000000..99e575e3 --- /dev/null +++ b/tests/pipelines/test_panorama_depth_estimation.py @@ -0,0 +1,34 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import unittest + +import cv2 +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import depth_to_color +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class PanoramaDepthEstimationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = 'panorama-depth-estimation' + self.model_id = 'damo/cv_unifuse_panorama-depth-estimation' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_panorama_depth_estimation(self): + input_location = 'data/test/images/panorama_depth_estimation.jpg' + estimator = pipeline( + Tasks.panorama_depth_estimation, model=self.model_id) + result = estimator(input_location) + depth_vis = result[OutputKeys.DEPTHS_COLOR] + cv2.imwrite('result.jpg', depth_vis) + print('test_panorama_depth_estimation DONE') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_pointcloud_sceneflow_estimation.py b/tests/pipelines/test_pointcloud_sceneflow_estimation.py new file mode 100644 index 00000000..34d87f09 --- /dev/null +++ b/tests/pipelines/test_pointcloud_sceneflow_estimation.py @@ -0,0 +1,42 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class PointCloudSceneFlowEstimationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = 'pointcloud-sceneflow-estimation' + self.model_id = 'damo/cv_pointnet2_sceneflow-estimation_general' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_pointcloud_scenelfow_estimation(self): + input_location = ('data/test/pointclouds/flyingthings_pcd1.npy', + 'data/test/pointclouds/flyingthings_pcd2.npy') + estimator = pipeline( + Tasks.pointcloud_sceneflow_estimation, model=self.model_id) + result = estimator(input_location) + flow = result[OutputKeys.OUTPUT] + pcd12 = result[OutputKeys.PCD12] + pcd12_align = result[OutputKeys.PCD12_ALIGN] + + print(f'pred flow shape:{flow.shape}') + np.save('flow.npy', flow) + # visualization + pcd12.write('pcd12.ply') + pcd12_align.write('pcd12_align.ply') + + print('test_pointcloud_scenelfow_estimation DONE') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index 35b00976..06be4850 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -14,6 +14,10 @@ from modelscope.utils.test_utils import test_level class SentenceEmbeddingTest(unittest.TestCase): model_id = 'damo/nlp_corom_sentence-embedding_english-base' + ecom_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base-ecom' + medical_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base-medical' + general_base_model_id = 'damo/nlp_corom_sentence-embedding_chinese-base' + inputs = { 'source_sentence': ["how long it take to get a master's degree"], 'sentences_to_compare': [ @@ -36,6 +40,31 @@ class SentenceEmbeddingTest(unittest.TestCase): 'sentences_to_compare': [] } + inputs4 = { + 'source_sentence': ["how long it take to get a master's degree"] + } + + inputs5 = { + 'source_sentence': [ + 'how long it take to get a master degree', + 'students take about 18 to 24 months to complete a degree' + ] + } + + ecom_inputs1 = { + 'source_sentence': ['毛绒玩具'], + 'sentences_to_compare': ['大熊泰迪熊猫毛绒玩具公仔布娃娃抱抱熊', '背心式狗狗牵引绳'] + } + + ecom_inputs2 = {'source_sentence': ['毛绒玩具', '毛绒玩具儿童款']} + + medical_inputs1 = { + 'source_sentence': ['肠道不适可以服用益生菌吗'], + 'sentences_to_compare': ['肠胃不好能吃益生菌,益生菌有调节肠胃道菌群的作用', '身体发烧应该多喝水'] + } + + medical_inputs2 = {'source_sentence': ['肠道不适可以服用益生菌吗', '肠道不适可以服用益生菌吗']} + el_model_id = 'damo/nlp_bert_entity-embedding_chinese-base' el_inputs = { 'source_sentence': ['宋小宝小品《美人鱼》, [ENT_S] 大鹏 [ENT_E] 上演生死离别,关键时刻美人鱼登场'], @@ -67,6 +96,40 @@ class SentenceEmbeddingTest(unittest.TestCase): f'pipeline1:{pipeline1(input=self.inputs3)}') print() print(f'pipeline2: {pipeline2(input=self.inputs3)}') + print(f'inputs: {self.inputs4}\n' + f'pipeline1:{pipeline1(input=self.inputs4)}') + print() + print(f'pipeline2: {pipeline2(input=self.inputs4)}') + print(f'inputs: {self.inputs5}\n' + f'pipeline1:{pipeline1(input=self.inputs5)}') + print() + print(f'pipeline2: {pipeline2(input=self.inputs5)}') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_ecom_model_run_by_direct_model_download(self): + cache_path = 
snapshot_download(self.ecom_base_model_id) + tokenizer = SentenceEmbeddingTransformersPreprocessor(cache_path) + model = BertForSentenceEmbedding.from_pretrained(cache_path) + pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.sentence_embedding, model=model, preprocessor=tokenizer) + print(f'inputs: {self.ecom_inputs1}\n' + f'pipeline1:{pipeline1(input=self.ecom_inputs1)}') + print() + print(f'pipeline2: {pipeline2(input=self.ecom_inputs1)}') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_medical_model_run_by_direct_model_download(self): + cache_path = snapshot_download(self.medical_base_model_id) + tokenizer = SentenceEmbeddingTransformersPreprocessor(cache_path) + model = BertForSentenceEmbedding.from_pretrained(cache_path) + pipeline1 = SentenceEmbeddingPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.sentence_embedding, model=model, preprocessor=tokenizer) + print(f'inputs: {self.medical_inputs1}\n' + f'pipeline1:{pipeline1(input=self.medical_inputs1)}') + print() + print(f'pipeline2: {pipeline2(input=self.medical_inputs1)}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): @@ -87,6 +150,18 @@ class SentenceEmbeddingTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.sentence_embedding) print(pipeline_ins(input=self.inputs)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_ecom_model_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.sentence_embedding, model=self.ecom_base_model_id) + print(pipeline_ins(input=self.ecom_inputs2)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_medical_model_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.sentence_embedding, model=self.medical_base_model_id) + print(pipeline_ins(input=self.medical_inputs1)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_el_model(self): pipeline_ins = pipeline( diff --git a/tests/pipelines/test_speech_separation.py b/tests/pipelines/test_speech_separation.py new file mode 100644 index 00000000..194f84a8 --- /dev/null +++ b/tests/pipelines/test_speech_separation.py @@ -0,0 +1,41 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os.path +import unittest + +import numpy + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + +MIX_SPEECH_FILE = 'data/test/audios/mix_speech.wav' + + +class SpeechSeparationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + pass + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_normal(self): + import soundfile as sf + model_id = 'damo/speech_mossformer_separation_temporal_8k' + separation = pipeline(Tasks.speech_separation, model=model_id) + result = separation(os.path.join(os.getcwd(), MIX_SPEECH_FILE)) + self.assertTrue(OutputKeys.OUTPUT_PCM_LIST in result) + self.assertEqual(len(result[OutputKeys.OUTPUT_PCM_LIST]), 2) + for i, signal in enumerate(result[OutputKeys.OUTPUT_PCM_LIST]): + save_file = f'output_spk{i}.wav' + sf.write(save_file, numpy.frombuffer(signal, dtype=numpy.int16), + 8000) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_text_error_correction.py b/tests/pipelines/test_text_error_correction.py index a714d3d0..81d74c8a 100644 --- a/tests/pipelines/test_text_error_correction.py +++ b/tests/pipelines/test_text_error_correction.py @@ -19,6 +19,8 @@ class TextErrorCorrectionTest(unittest.TestCase, DemoCompatibilityCheck): self.model_id = 'damo/nlp_bart_text-error-correction_chinese' input = '随着中国经济突飞猛近,建造工业与日俱增' + input_2 = '这洋的话,下一年的福气来到自己身上。' + input_3 = '在拥挤时间,为了让人们尊守交通规律,派至少两个警察或者交通管理者。' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_direct_download(self): @@ -34,6 +36,15 @@ class TextErrorCorrectionTest(unittest.TestCase, DemoCompatibilityCheck): f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}' ) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name_batch(self): + run_kwargs = {'batch_size': 2} + pipeline_ins = pipeline( + task=Tasks.text_error_correction, model=self.model_id) + print( + 'batch: ', + pipeline_ins([self.input, self.input_2, self.input_3], run_kwargs)) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): model = Model.from_pretrained(self.model_id) diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index 1ce6695f..cbb1b29b 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -95,6 +95,19 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): self.run_pipeline_with_model_id(self.gpt3_base_model_id, self.gpt3_input) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_gpt_base_with_model_name_batch(self): + self.run_pipeline_with_model_id( + self.gpt3_base_model_id, + [self.gpt3_input, self.gpt3_input[:10], self.gpt3_input[10:]], + run_kwargs={'batch_size': 2}) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_gpt_base_with_model_name_batch_iter(self): + self.run_pipeline_with_model_id( + self.gpt3_base_model_id, + [self.gpt3_input, self.gpt3_input[:10], self.gpt3_input[10:]]) + @unittest.skipUnless(test_level() >= 0, 'skip test in 
current test level') def test_gpt_large_with_model_name(self): self.run_pipeline_with_model_id(self.gpt3_large_model_id, @@ -219,6 +232,13 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): max_length=20, repetition_penalty=0.5)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_gpt2(self): + pipe = pipeline( + task=Tasks.text_generation, + model='damo/nlp_gpt2_text-generation_english-base') + print(pipe('My name is Teven and I am')) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py index 3329faad..40c4da58 100644 --- a/tests/pipelines/test_text_ranking.py +++ b/tests/pipelines/test_text_ranking.py @@ -13,11 +13,7 @@ from modelscope.utils.test_utils import test_level class TextRankingTest(unittest.TestCase): - models = [ - 'damo/nlp_corom_passage-ranking_english-base', - 'damo/nlp_rom_passage-ranking_chinese-base' - ] - + base_model_id = 'damo/nlp_corom_passage-ranking_english-base' inputs = { 'source_sentence': ["how long it take to get a master's degree"], 'sentences_to_compare': [ @@ -28,6 +24,29 @@ class TextRankingTest(unittest.TestCase): ] } + chinese_base_model_id = 'damo/nlp_rom_passage-ranking_chinese-base' + chinese_inputs = { + 'source_sentence': ['功和功率的区别'], + 'sentences_to_compare': [ + '功反映做功多少,功率反映做功快慢。', + '什么是有功功率和无功功率?无功功率有什么用什么是有功功率和无功功率?无功功率有什么用电力系统中的电源是由发电机产生的三相正弦交流电,在交>流电路中,由电源供给负载的电功率有两种;一种是有功功率,一种是无功功率。', + '优质解答在物理学中,用电功率表示消耗电能的快慢.电功率用P表示,它的单位是瓦特(Watt),简称瓦(Wa)符号是W.电流在单位时间内做的功叫做电功率 以灯泡为例,电功率越大,灯泡越亮.灯泡的亮暗由电功率(实际功率)\ + 决定,不由通过的电流、电压、电能决定!', + ] + } + + ecom_base_model_id = 'damo/nlp_corom_passage-ranking_chinese-base-ecom' + ecom_inputs = { + 'source_sentence': ['毛绒玩具'], + 'sentences_to_compare': ['大熊泰迪熊猫毛绒玩具公仔布娃娃抱抱熊', '背心式狗狗牵引绳'] + } + + medical_base_model_id = 'damo/nlp_corom_passage-ranking_chinese-base-medical' + medical_inputs = { + 'source_sentence': ['肠道不适可以服用益生菌吗'], + 'sentences_to_compare': ['肠胃不好能吃益生菌,益生菌有调节肠胃道菌群的作用', '身体发烧应该多喝水'] + } + el_model_id = 'damo/nlp_bert_entity-matching_chinese-base' el_inputs = { 'source_sentence': ['我是猫》([日]夏目漱石)【摘要 [ENT_S] 书评 [ENT_E] 试读】'], @@ -43,38 +62,54 @@ class TextRankingTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): - for model_id in self.models: - cache_path = snapshot_download(model_id) - tokenizer = TextRankingTransformersPreprocessor(cache_path) - model = BertForTextRanking.from_pretrained(cache_path) - pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer) - pipeline2 = pipeline( - Tasks.text_ranking, model=model, preprocessor=tokenizer) - print(f'sentence: {self.inputs}\n' - f'pipeline1:{pipeline1(input=self.inputs)}') - print() - print(f'pipeline2: {pipeline2(input=self.inputs)}') + cache_path = snapshot_download(self.base_model_id) + tokenizer = TextRankingTransformersPreprocessor(cache_path) + model = BertForTextRanking.from_pretrained(cache_path) + pipeline1 = TextRankingPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline( + Tasks.text_ranking, model=model, preprocessor=tokenizer) + print(f'sentence: {self.inputs}\n' + f'pipeline1:{pipeline1(input=self.inputs)}') + print() + print(f'pipeline2:{pipeline2(input=self.inputs)}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_modelhub(self): - for model_id in 
self.models: - model = Model.from_pretrained(model_id) - tokenizer = TextRankingTransformersPreprocessor(model.model_dir) - pipeline_ins = pipeline( - task=Tasks.text_ranking, model=model, preprocessor=tokenizer) - print(pipeline_ins(input=self.inputs)) + model = Model.from_pretrained(self.base_model_id) + tokenizer = TextRankingTransformersPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.text_ranking, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=self.inputs)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_name(self): - for model_id in self.models: - pipeline_ins = pipeline(task=Tasks.text_ranking, model=model_id) - print(pipeline_ins(input=self.inputs)) + pipeline_ins = pipeline( + task=Tasks.text_ranking, model=self.base_model_id) + print(pipeline_ins(input=self.inputs)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.text_ranking) print(pipeline_ins(input=self.inputs)) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_chinese_model_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.text_ranking, model=self.chinese_base_model_id) + print(pipeline_ins(input=self.chinese_inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_ecom_model_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.text_ranking, model=self.ecom_base_model_id) + print(pipeline_ins(input=self.ecom_inputs)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_medical_model_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.text_ranking, model=self.medical_base_model_id) + print(pipeline_ins(input=self.medical_inputs)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_el_model(self): pipeline_ins = pipeline( diff --git a/tests/pipelines/test_text_to_image_synthesis.py b/tests/pipelines/test_text_to_image_synthesis.py index e2a616e6..82f4e657 100644 --- a/tests/pipelines/test_text_to_image_synthesis.py +++ b/tests/pipelines/test_text_to_image_synthesis.py @@ -53,7 +53,7 @@ class TextToImageSynthesisTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub_dpm_solver(self): - test_text.update({'solver': 'dpm-solver'}) + self.test_text.update({'solver': 'dpm-solver'}) model = Model.from_pretrained(self.model_id) pipe_line_text_to_image_synthesis = pipeline( task=Tasks.text_to_image_synthesis, model=model) diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index 01580563..f746dfbe 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -27,12 +27,33 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, self.task = Tasks.text_to_speech self.zhcn_text = '今天北京天气怎么样' self.en_text = 'How is the weather in Beijing?' + self.kokr_text = '오늘날씨가어때요' + self.ru_text = 'Какая сегодня погода?' 
self.test_model_name = [ - 'pretrain_16k', 'pretrain_24k', 'zhitian_emo', 'zhizhe_emo', - 'zhiyan_emo', 'zhibei_emo', 'zhcn_16k', 'luca', 'luna', 'andy', - 'annie', 'engb_16k', 'enus_16k' + 'chuangirl', 'jiajia', 'xiaoda', 'kyong', 'masha', 'pretrain_16k', + 'pretrain_24k', 'zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', + 'zhibei_emo', 'zhcn_16k', 'luca', 'luna', 'andy', 'annie', + 'engb_16k', 'enus_16k' ] self.test_models = [{ + 'model': + 'speech_tts/speech_sambert-hifigan_tts_chuangirl_Sichuan_16k', + 'text': self.zhcn_text + }, { + 'model': + 'speech_tts/speech_sambert-hifigan_tts_jiajia_Cantonese_16k', + 'text': self.zhcn_text + }, { + 'model': + 'speech_tts/speech_sambert-hifigan_tts_xiaoda_WuuShanghai_16k', + 'text': self.zhcn_text + }, { + 'model': 'speech_tts/speech_sambert-hifigan_tts_kyong_Korean_16k', + 'text': self.kokr_text + }, { + 'model': 'speech_tts/speech_sambert-hifigan_tts_masha_Russian_16k', + 'text': self.ru_text + }, { 'model': 'speech_tts/speech_sambert-hifigan_tts_zh-cn_multisp_pretrain_16k', 'text': self.zhcn_text diff --git a/tests/pipelines/test_tinynas_detection.py b/tests/pipelines/test_tinynas_detection.py index 79ccf89f..a73e7b0c 100644 --- a/tests/pipelines/test_tinynas_detection.py +++ b/tests/pipelines/test_tinynas_detection.py @@ -2,6 +2,9 @@ import unittest +from PIL import Image + +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck @@ -63,6 +66,93 @@ class TinynasObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): tinynas_object_detection.show_result(test_image, result, 'demo_ret.jpg') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_human_detection_damoyolo(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_human-detection_damoyolo') + result = tinynas_object_detection( + 'data/test/images/image_detection.jpg') + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_human_detection_damoyolo_with_image(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_human-detection_damoyolo') + img = Image.open('data/test/images/image_detection.jpg') + result = tinynas_object_detection(img) + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_facemask_detection_damoyolo(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_object-detection_damoyolo_facemask') + result = tinynas_object_detection( + 'data/test/images/image_detection.jpg') + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_facemask_detection_damoyolo_with_image(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_object-detection_damoyolo_facemask') + img = Image.open('data/test/images/image_detection.jpg') + result = 
tinynas_object_detection(img) + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_safetyhat_detection_damoyolo(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_object-detection_damoyolo_safety-helmet') + result = tinynas_object_detection( + 'data/test/images/image_safetyhat.jpg') + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_safetyhat_detection_damoyolo_with_image(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_object-detection_damoyolo_safety-helmet') + img = Image.open('data/test/images/image_safetyhat.jpg') + result = tinynas_object_detection(img) + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_cigarette_detection_damoyolo(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_object-detection_damoyolo_cigarette') + result = tinynas_object_detection('data/test/images/image_smoke.jpg') + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_cigarette_detection_damoyolo_with_image(self): + tinynas_object_detection = pipeline( + Tasks.domain_specific_object_detection, + model='damo/cv_tinynas_object-detection_damoyolo_cigarette') + img = Image.open('data/test/images/image_smoke.jpg') + result = tinynas_object_detection(img) + assert result and (OutputKeys.SCORES in result) and ( + OutputKeys.LABELS in result) and (OutputKeys.BOXES in result) + print('results: ', result) + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_unifold.py b/tests/pipelines/test_unifold.py index 22e29cb2..cf67929d 100644 --- a/tests/pipelines/test_unifold.py +++ b/tests/pipelines/test_unifold.py @@ -19,7 +19,7 @@ class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck): self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \ 'NIAALKNHIDKIKPIAMQIYKKYSKNIP NIAALKNHIDKIKPIAMQIYKKYSKNIP' - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_by_direct_model_download(self): model_dir1 = snapshot_download(self.model_id_multimer) multi_pipeline_ins = pipeline(task=self.task, model=model_dir1) diff --git a/tests/pipelines/test_user_satisfaction_estimation.py b/tests/pipelines/test_user_satisfaction_estimation.py new file mode 100644 index 00000000..2bbfd5d7 --- /dev/null +++ b/tests/pipelines/test_user_satisfaction_estimation.py @@ -0,0 +1,42 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import unittest + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.preprocessors import DialogueClassificationUsePreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class UserSatisfactionEstimationTest(unittest.TestCase, + DemoCompatibilityCheck): + + model_id = 'damo/nlp_user-satisfaction-estimation_chinese' + input_dialogue = [('返修退换货咨询|||', '手机有质量问题怎么办|||稍等,我看下', '开不开机了|||', + '说话|||很好')] + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + preprocessor = DialogueClassificationUsePreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.text_classification, + model=model, + preprocessor=preprocessor) + print(pipeline_ins(input=self.input_dialogue)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.text_classification, model=self.model_id) + print(pipeline_ins(input=self.input_dialogue)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + print(self.compatibility_check()) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_video_depth_estimation.py b/tests/pipelines/test_video_depth_estimation.py new file mode 100644 index 00000000..77cb4b9b --- /dev/null +++ b/tests/pipelines/test_video_depth_estimation.py @@ -0,0 +1,31 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import show_video_depth_estimation_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class VideoDepthEstimationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = 'video-depth-estimation' + self.model_id = 'damo/cv_dro-resnet18_video-depth-estimation_indoor' + + @unittest.skipUnless(test_level() >= 3, 'skip test in current test level') + def test_image_depth_estimation(self): + input_location = 'data/test/videos/video_depth_estimation.mp4' + estimator = pipeline(Tasks.video_depth_estimation, model=self.model_id) + result = estimator(input_location) + show_video_depth_estimation_result(result[OutputKeys.DEPTHS_COLOR], + 'out.mp4') + + print('test_video_depth_estimation DONE') + + +if __name__ == '__main__': + + unittest.main() diff --git a/tests/pipelines/test_video_frame_interpolation.py b/tests/pipelines/test_video_frame_interpolation.py new file mode 100644 index 00000000..951da2b9 --- /dev/null +++ b/tests/pipelines/test_video_frame_interpolation.py @@ -0,0 +1,52 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import sys +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.cv import VideoFrameInterpolationPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class VideoFrameInterpolationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_frame_interpolation + self.model_id = 'damo/cv_raft_video-frame-interpolation' + self.test_video = 'data/test/videos/video_frame_interpolation_test.mp4' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + pipeline = VideoFrameInterpolationPipeline(cache_path) + pipeline.group_key = self.task + out_video_path = pipeline( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + pipeline_ins = pipeline( + task=Tasks.video_frame_interpolation, model=self.model_id) + out_video_path = pipeline_ins( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.video_frame_interpolation) + out_video_path = pipeline_ins( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_video_multi_object_tracking.py b/tests/pipelines/test_video_multi_object_tracking.py new file mode 100644 index 00000000..eb37ffd0 --- /dev/null +++ b/tests/pipelines/test_video_multi_object_tracking.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class MultiObjectTracking(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_multi_object_tracking + self.model_id = 'damo/cv_yolov5_video-multi-object-tracking_fairmot' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_end2end(self): + video_multi_object_tracking = pipeline( + Tasks.video_multi_object_tracking, model=self.model_id) + video_path = 'data/test/videos/MOT17-03-partial.mp4' + result = video_multi_object_tracking(video_path) + print('result is : ', result[OutputKeys.BOXES]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_modelhub_default_model(self): + video_multi_object_tracking = pipeline( + Tasks.video_multi_object_tracking) + video_path = 'data/test/videos/MOT17-03-partial.mp4' + result = video_multi_object_tracking(video_path) + print('result is : ', result[OutputKeys.BOXES]) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_video_stabilization.py b/tests/pipelines/test_video_stabilization.py new file mode 100644 index 00000000..d102f3e1 --- /dev/null +++ b/tests/pipelines/test_video_stabilization.py @@ -0,0 +1,51 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.cv import VideoStabilizationPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class VideoStabilizationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_stabilization + self.model_id = 'damo/cv_dut-raft_video-stabilization_base' + self.test_video = 'data/test/videos/video_stabilization_test_video.avi' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + pipeline = VideoStabilizationPipeline(cache_path) + pipeline.group_key = self.task + out_video_path = pipeline( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + pipeline_ins = pipeline( + task=Tasks.video_stabilization, model=self.model_id) + out_video_path = pipeline_ins( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.video_stabilization) + out_video_path = pipeline_ins( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skip('demo compatibility test 
is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_video_super_resolution.py b/tests/pipelines/test_video_super_resolution.py new file mode 100644 index 00000000..0da18dd7 --- /dev/null +++ b/tests/pipelines/test_video_super_resolution.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.cv import VideoSuperResolutionPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class VideoSuperResolutionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.video_super_resolution + self.model_id = 'damo/cv_realbasicvsr_video-super-resolution_videolq' + self.test_video = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/000.mp4' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + cache_path = snapshot_download(self.model_id) + pipeline = VideoSuperResolutionPipeline(cache_path) + pipeline.group_key = self.task + out_video_path = pipeline( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + pipeline_ins = pipeline(task=Tasks.video_super_resolution, model=model) + out_video_path = pipeline_ins( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_ins = pipeline( + task=Tasks.video_super_resolution, model=self.model_id) + out_video_path = pipeline_ins( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.video_super_resolution) + out_video_path = pipeline_ins( + input=self.test_video)[OutputKeys.OUTPUT_VIDEO] + print('pipeline: the output video path is {}'.format(out_video_path)) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_vision_middleware.py b/tests/pipelines/test_vision_middleware.py new file mode 100644 index 00000000..b3531154 --- /dev/null +++ b/tests/pipelines/test_vision_middleware.py @@ -0,0 +1,33 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.models import Model +from modelscope.models.cv.vision_middleware import VisionMiddlewareModel +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class VisionMiddlewareTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id = 'damo/cv_vit-b16_vision-middleware' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_pipeline(self): + + vim_pipeline = pipeline(self.task, self.model_id) + result = vim_pipeline('data/test/images/vision_middleware_test1.jpg') + + print(f'ViM output: {result}.') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_load_model_from_pretrained(self): + model = Model.from_pretrained('damo/cv_vit-b16_vision-middleware') + self.assertTrue(model.__class__ == VisionMiddlewareModel) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_vop_retrieval.py b/tests/pipelines/test_vop_retrieval.py new file mode 100644 index 00000000..c9c356c5 --- /dev/null +++ b/tests/pipelines/test_vop_retrieval.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.models import Model +from modelscope.models.cv.vop_retrieval import VoP +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class VopRetrievalTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.vop_retrieval + # self.model_id = '../cv_vit-b32_retrieval_vop' + self.model_id = 'damo/cv_vit-b32_retrieval_vop' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + vop_pipeline = pipeline(self.task, self.model_id) + # t2v + result = vop_pipeline('a squid is talking') + # v2t + # result = vop_pipeline('video10.mp4') + print(f'vop output: {result}.') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_load_model_from_pretrained(self): + # model = Model.from_pretrained('../cv_vit-b32_retrieval_vop') + model = Model.from_pretrained('damo/cv_vit-b32_retrieval_vop') + self.assertTrue(model.__class__ == VoP) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/preprocessors/test_nlp.py b/tests/preprocessors/test_nlp.py index 9a31cc91..d63660c8 100644 --- a/tests/preprocessors/test_nlp.py +++ b/tests/preprocessors/test_nlp.py @@ -1,8 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- +import os.path import unittest -from modelscope.preprocessors import build_preprocessor, nlp +from modelscope.preprocessors import Preprocessor, build_preprocessor, nlp from modelscope.utils.constant import Fields, InputFields from modelscope.utils.logger import get_logger @@ -32,6 +32,17 @@ class NLPPreprocessorTest(unittest.TestCase): output['attention_mask'], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + def test_preprocessor_download(self): + from modelscope.preprocessors.nlp.token_classification_preprocessor import TokenClassificationPreprocessorBase + preprocessor: TokenClassificationPreprocessorBase = \ + Preprocessor.from_pretrained('damo/nlp_raner_named-entity-recognition_chinese-base-news') + self.assertTrue(preprocessor is not None) + from modelscope.utils.hub import snapshot_download + model_dir = snapshot_download( + 'damo/nlp_raner_named-entity-recognition_chinese-base-news') + self.assertTrue( + os.path.isfile(os.path.join(model_dir, 'pytorch_model.bin'))) + def test_token_classification_tokenize_bert(self): cfg = dict( type='token-cls-tokenizer', diff --git a/tests/run.py b/tests/run.py index 1b252756..6daba6dc 100644 --- a/tests/run.py +++ b/tests/run.py @@ -355,10 +355,34 @@ def run_case_in_env(env_name, env, test_suite_env_map, isolated_cases, run_command_with_popen(cmd) +def run_non_parallelizable_test_suites(suites, result_dir): + cmd = ['python', 'tests/run.py', '--result_dir', result_dir, '--suites'] + for suite in suites: + cmd.append(suite) + run_command_with_popen(cmd) + + def run_in_subprocess(args): # only case args.isolated_cases run in subporcess, all other run in a subprocess test_suite_files = gather_test_suites_files( os.path.abspath(args.test_dir), args.pattern) + + non_parallelizable_suites = [ + 'test_download_dataset.py', + 'test_hub_examples.py', + 'test_hub_operation.py', + 'test_hub_private_files.py', + 'test_hub_private_repository.py', + 'test_hub_repository.py', + 'test_hub_retry.py', + 'test_hub_revision.py', + 'test_hub_revision_release_mode.py', + 'test_hub_upload.py', + ] + test_suite_files = [ + x for x in test_suite_files if x not in non_parallelizable_suites + ] + run_config = None isolated_cases = [] test_suite_env_map = {} @@ -383,6 +407,11 @@ def run_in_subprocess(args): isolated_cases = test_suite_files with tempfile.TemporaryDirectory() as temp_result_dir: + # first run the cases that are not parallelizable + run_non_parallelizable_test_suites(non_parallelizable_suites, + temp_result_dir) + + # then run the remaining cases in parallel, per env for env in set(test_suite_env_map.values()): parallel_run_case_in_env(env, run_config['envs'][env], test_suite_env_map, isolated_cases, diff --git a/tests/run_config.yaml b/tests/run_config.yaml index eb12debe..efc216de 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -44,6 +44,13 @@ isolated: # test cases that may require excessive anmount of GPU memory or run - test_conversational_text_to_sql.py - test_video_multi_modal_embedding.py - test_image_skychange.py + - test_video_stabilization.py + - test_video_super_resolution.py + - test_kws_nearfield_trainer.py + - test_gpt3_text_generation.py + - test_ddcolor_image_colorization.py + - test_image_defrcn_fewshot_trainer.py + - test_image_deblur_trainer.py envs: default: # default env, case not in other env will in default, pytorch.
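A minimal sketch of the serial-then-parallel flow that the tests/run.py change above introduces, assuming only what the patch shows; the helper names and the shortened suite list below are illustrative, not the project's API:

import subprocess
import tempfile

# two of the hub suites named in the patch, kept short for illustration
NON_PARALLELIZABLE = ['test_hub_upload.py', 'test_hub_revision.py']


def run_serially(suites, result_dir):
    # mirrors run_non_parallelizable_test_suites: hand the whole list to a
    # single child run.py invocation so these suites never race on shared hub state
    cmd = ['python', 'tests/run.py', '--result_dir', result_dir, '--suites']
    cmd.extend(suites)
    subprocess.run(cmd, check=False)


def split_and_run(all_suites):
    # everything not in the serial list stays eligible for the parallel,
    # per-env phase handled elsewhere by the real runner
    parallel_suites = [s for s in all_suites if s not in NON_PARALLELIZABLE]
    with tempfile.TemporaryDirectory() as result_dir:
        run_serially(NON_PARALLELIZABLE, result_dir)
    return parallel_suites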
diff --git a/tests/trainers/audio/test_ans_trainer.py b/tests/trainers/audio/test_ans_trainer.py index d897e6a9..6b18eefa 100644 --- a/tests/trainers/audio/test_ans_trainer.py +++ b/tests/trainers/audio/test_ans_trainer.py @@ -10,6 +10,7 @@ from modelscope.metainfo import Trainers from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer from modelscope.utils.audio.audio_utils import to_segment +from modelscope.utils.constant import DownloadMode from modelscope.utils.hub import read_config from modelscope.utils.test_utils import test_level @@ -31,7 +32,9 @@ class TestANSTrainer(unittest.TestCase): cfg.dump(self.cfg_file) hf_ds = MsDataset.load( - 'ICASSP_2021_DNS_Challenge', split='test').to_hf_dataset() + 'ICASSP_2021_DNS_Challenge', + split='test', + download_mode=DownloadMode.FORCE_REDOWNLOAD).to_hf_dataset() mapped_ds = hf_ds.map( partial(to_segment, segment_length=SEGMENT_LENGTH_TEST), remove_columns=['duration'], diff --git a/tests/trainers/audio/test_asr_trainer.py b/tests/trainers/audio/test_asr_trainer.py new file mode 100644 index 00000000..d9cde03f --- /dev/null +++ b/tests/trainers/audio/test_asr_trainer.py @@ -0,0 +1,48 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +import shutil +import tempfile +import unittest + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.audio.audio_utils import TtsTrainType +from modelscope.utils.constant import DownloadMode, Fields, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class TestASRTrainer(unittest.TestCase): + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + self.model_id = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' + self.dataset_id = 'speech_asr_aishell1_trainsets' + self.dataset_namespace = 'speech_asr' + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer(self): + ds_dict = MsDataset.load( + self.dataset_id, namespace=self.dataset_namespace) + kwargs = dict( + model=self.model_id, work_dir=self.tmp_dir, data_dir=ds_dict) + trainer = build_trainer( + Trainers.speech_asr_trainer, default_args=kwargs) + trainer.train() + result_model = os.path.join(self.tmp_dir, 'valid.acc.best.pth') + assert os.path.exists(result_model) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/audio/test_kws_nearfield_trainer.py b/tests/trainers/audio/test_kws_nearfield_trainer.py new file mode 100644 index 00000000..a61f70bf --- /dev/null +++ b/tests/trainers/audio/test_kws_nearfield_trainer.py @@ -0,0 +1,117 @@ +import os +import shutil +import tempfile +import unittest + +from modelscope.metainfo import Trainers +from modelscope.trainers import build_trainer +from modelscope.utils.hub import read_config, snapshot_download +from modelscope.utils.test_utils import test_level +from modelscope.utils.torch_utils import get_dist_info + +POS_FILE = 'data/test/audios/kws_xiaoyunxiaoyun.wav' +NEG_FILE = 'data/test/audios/kws_bofangyinyue.wav' + + +class TestKwsNearfieldTrainer(unittest.TestCase): + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory().name + print(f'tmp dir: {self.tmp_dir}') + if not 
os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + self.model_id = 'damo/speech_charctc_kws_phone-xiaoyun' + + model_dir = snapshot_download(self.model_id) + print(model_dir) + self.configs = read_config(self.model_id) + + # update some configs + self.configs.train.max_epochs = 10 + self.configs.train.batch_size_per_gpu = 4 + self.configs.train.dataloader.workers_per_gpu = 1 + self.configs.evaluation.batch_size_per_gpu = 4 + self.configs.evaluation.dataloader.workers_per_gpu = 1 + + self.config_file = os.path.join(self.tmp_dir, 'config.json') + self.configs.dump(self.config_file) + + self.train_scp, self.cv_scp, self.trans_file = self.create_list() + + print(f'test level is {test_level()}') + + def create_list(self): + train_scp_file = os.path.join(self.tmp_dir, 'train.scp') + cv_scp_file = os.path.join(self.tmp_dir, 'cv.scp') + trans_file = os.path.join(self.tmp_dir, 'merged.trans') + + with open(trans_file, 'w') as fp_trans: + with open(train_scp_file, 'w') as fp_scp: + for i in range(8): + fp_scp.write( + f'train_pos_wav_{i}\t{os.path.join(os.getcwd(), POS_FILE)}\n' + ) + fp_trans.write(f'train_pos_wav_{i}\t小云小云\n') + + for i in range(16): + fp_scp.write( + f'train_neg_wav_{i}\t{os.path.join(os.getcwd(), NEG_FILE)}\n' + ) + fp_trans.write(f'train_neg_wav_{i}\t播放音乐\n') + + with open(cv_scp_file, 'w') as fp_scp: + for i in range(2): + fp_scp.write( + f'cv_pos_wav_{i}\t{os.path.join(os.getcwd(), POS_FILE)}\n' + ) + fp_trans.write(f'cv_pos_wav_{i}\t小云小云\n') + + for i in range(2): + fp_scp.write( + f'cv_neg_wav_{i}\t{os.path.join(os.getcwd(), NEG_FILE)}\n' + ) + fp_trans.write(f'cv_neg_wav_{i}\t播放音乐\n') + + return train_scp_file, cv_scp_file, trans_file + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_normal(self): + print('test start ...') + kwargs = dict( + model=self.model_id, + work_dir=self.tmp_dir, + cfg_file=self.config_file, + train_data=self.train_scp, + cv_data=self.cv_scp, + trans_data=self.trans_file) + + trainer = build_trainer( + Trainers.speech_kws_fsmn_char_ctc_nearfield, default_args=kwargs) + trainer.train() + + rank, _ = get_dist_info() + if rank == 0: + results_files = os.listdir(self.tmp_dir) + for i in range(self.configs.train.max_epochs): + self.assertIn(f'{i}.pt', results_files) + + kwargs = dict( + test_dir=self.tmp_dir, + gpu=-1, + keywords='小云小云', + batch_size=4, + ) + trainer.evaluate(None, None, **kwargs) + + results_files = os.listdir(self.tmp_dir) + self.assertIn('convert.kaldi.txt', results_files) + + print('test finished ...') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/audio/test_separation_trainer.py b/tests/trainers/audio/test_separation_trainer.py new file mode 100644 index 00000000..4fdbab18 --- /dev/null +++ b/tests/trainers/audio/test_separation_trainer.py @@ -0,0 +1,91 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ +import os +import shutil +import tempfile +import unittest + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.preprocessors.audio import AudioBrainPreprocessor +from modelscope.trainers import build_trainer +from modelscope.utils.test_utils import test_level + +MIX_SPEECH_FILE = 'data/test/audios/mix_speech.wav' +S1_SPEECH_FILE = 'data/test/audios/s1_speech.wav' +S2_SPEECH_FILE = 'data/test/audios/s2_speech.wav' + + +class TestSeparationTrainer(unittest.TestCase): + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + self.model_id = 'damo/speech_mossformer_separation_temporal_8k' + + csv_path = os.path.join(self.tmp_dir, 'test.csv') + mix_path = os.path.join(os.getcwd(), MIX_SPEECH_FILE) + s1_path = os.path.join(os.getcwd(), S1_SPEECH_FILE) + s2_path = os.path.join(os.getcwd(), S2_SPEECH_FILE) + with open(csv_path, 'w') as w: + w.write(f'id,mix_wav:FILE,s1_wav:FILE,s2_wav:FILE\n' + f'0,{mix_path},{s1_path},{s2_path}\n') + self.dataset = MsDataset.load( + 'csv', data_files={ + 'test': [csv_path] + }).to_torch_dataset( + preprocessors=[ + AudioBrainPreprocessor( + takes='mix_wav:FILE', provides='mix_sig'), + AudioBrainPreprocessor( + takes='s1_wav:FILE', provides='s1_sig'), + AudioBrainPreprocessor( + takes='s2_wav:FILE', provides='s2_sig') + ], + to_tensor=False) + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer(self): + kwargs = dict( + model=self.model_id, + train_dataset=self.dataset, + eval_dataset=self.dataset, + max_epochs=2, + work_dir=self.tmp_dir) + trainer = build_trainer( + Trainers.speech_separation, default_args=kwargs) + # model placement + trainer.model.load_check_point(device=trainer.device) + trainer.train() + + logging_path = os.path.join(self.tmp_dir, 'train_log.txt') + self.assertTrue( + os.path.exists(logging_path), + f'Cannot find logging file {logging_path}') + save_dir = os.path.join(self.tmp_dir, 'save') + checkpoint_dirs = os.listdir(save_dir) + self.assertEqual( + len(checkpoint_dirs), 2, f'Cannot find checkpoint in {save_dir}!') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_eval(self): + kwargs = dict( + model=self.model_id, + train_dataset=None, + eval_dataset=self.dataset, + max_epochs=2, + work_dir=self.tmp_dir) + trainer = build_trainer( + Trainers.speech_separation, default_args=kwargs) + result = trainer.evaluate(None) + self.assertTrue('si-snr' in result) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/easycv/test_easycv_trainer_hand_detection.py b/tests/trainers/easycv/test_easycv_trainer_hand_detection.py index cd8383aa..e8af859a 100644 --- a/tests/trainers/easycv/test_easycv_trainer_hand_detection.py +++ b/tests/trainers/easycv/test_easycv_trainer_hand_detection.py @@ -43,7 +43,7 @@ class EasyCVTrainerTestHandDetection(unittest.TestCase): trainer = build_trainer(trainer_name, kwargs) trainer.train() - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_single_gpu(self): temp_file_dir = tempfile.TemporaryDirectory() tmp_dir = temp_file_dir.name diff --git a/tests/trainers/easycv/test_easycv_trainer_panoptic_mask2former.py 
b/tests/trainers/easycv/test_easycv_trainer_panoptic_mask2former.py new file mode 100644 index 00000000..f6a6c41a --- /dev/null +++ b/tests/trainers/easycv/test_easycv_trainer_panoptic_mask2former.py @@ -0,0 +1,70 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import torch +from mmcv.runner.hooks import HOOKS as MMCV_HOOKS + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import LogKeys, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + + +@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') +class EasyCVTrainerTestPanopticMask2Former(unittest.TestCase): + + def setUp(self): + self.logger = get_logger() + self.logger.info(('Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + def _train(self): + cfg_options = {'train.max_epochs': 1} + + trainer_name = Trainers.easycv + + train_dataset = MsDataset.load( + dataset_name='COCO2017_panopic_subset', split='train') + eval_dataset = MsDataset.load( + dataset_name='COCO2017_panopic_subset', split='validation') + kwargs = dict( + model='damo/cv_r50_panoptic-segmentation_cocopan', + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_options=cfg_options) + + trainer = build_trainer(trainer_name, kwargs) + + hook_name = 'YOLOXLrUpdaterHook' + mmcv_hook = MMCV_HOOKS._module_dict.pop(hook_name, None) + + trainer.train() + + MMCV_HOOKS._module_dict[hook_name] = mmcv_hook + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_single_gpu_mask2former_r50(self): + self._train() + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_dialog_intent_trainer.py b/tests/trainers/test_dialog_intent_trainer.py index ea1cb482..ee857a50 100644 --- a/tests/trainers/test_dialog_intent_trainer.py +++ b/tests/trainers/test_dialog_intent_trainer.py @@ -27,7 +27,7 @@ class TestDialogIntentTrainer(unittest.TestCase): shutil.rmtree(self.save_dir) super().tearDown() - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_model_and_args(self): model_id = 'damo/nlp_space_pretrained-dialog-model' data_banking = MsDataset.load('banking77') diff --git a/tests/trainers/test_dialog_modeling_trainer.py b/tests/trainers/test_dialog_modeling_trainer.py index 9d9fd11b..900bf904 100644 --- a/tests/trainers/test_dialog_modeling_trainer.py +++ b/tests/trainers/test_dialog_modeling_trainer.py @@ -17,7 +17,7 @@ class TestDialogModelingTrainer(unittest.TestCase): model_id = 'damo/nlp_space_pretrained-dialog-model' output_dir = './dialog_fintune_result' - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_trainer_with_model_and_args(self): # download data set data_multiwoz = 
MsDataset.load( diff --git a/tests/trainers/test_finetune_faq_question_answering.py b/tests/trainers/test_finetune_faq_question_answering.py new file mode 100644 index 00000000..01c34b63 --- /dev/null +++ b/tests/trainers/test_finetune_faq_question_answering.py @@ -0,0 +1,111 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +import tempfile +import unittest + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.hub import read_config +from modelscope.utils.test_utils import test_level + + +class TestFinetuneFaqQuestionAnswering(unittest.TestCase): + param = { + 'query_set': ['给妈买的,挺好的,妈妈喜欢。'], + 'support_set': [{ + 'text': '挺好的,质量和服务都蛮好', + 'label': '1' + }, { + 'text': '内容较晦涩,小孩不感兴趣', + 'label': '0' + }, { + 'text': '贵且于我无用,买亏了', + 'label': '0' + }, { + 'text': '挺好,不错,喜欢,,', + 'label': '1' + }] + } + model_id = 'damo/nlp_structbert_faq-question-answering_chinese-base' + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def build_trainer(self): + train_dataset = MsDataset.load( + 'jd', namespace='DAMO_NLP', + split='train').remap_columns({'sentence': 'text'}) + eval_dataset = MsDataset.load( + 'jd', namespace='DAMO_NLP', + split='validation').remap_columns({'sentence': 'text'}) + + cfg: Config = read_config(self.model_id, revision='v1.0.1') + cfg.train.train_iters_per_epoch = 50 + cfg.evaluation.val_iters_per_epoch = 2 + cfg.train.seed = 1234 + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'by_epoch': False, + 'interval': 50 + }, { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': 50 + }, { + 'type': 'TextLoggerHook', + 'by_epoch': False, + 'rounding_digits': 5, + 'interval': 10 + }] + cfg_file = os.path.join(self.tmp_dir, 'config.json') + cfg.dump(cfg_file) + + trainer = build_trainer( + Trainers.faq_question_answering_trainer, + default_args=dict( + model=self.model_id, + work_dir=self.tmp_dir, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + cfg_file=cfg_file)) + return trainer + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_faq_model_finetune(self): + trainer = self.build_trainer() + trainer.train() + evaluate_result = trainer.evaluate() + self.assertAlmostEqual(evaluate_result['accuracy'], 0.95, delta=0.1) + + results_files = os.listdir(self.tmp_dir) + self.assertIn(ModelFile.TRAIN_OUTPUT_DIR, results_files) + + output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) + pipeline_ins = pipeline( + task=Tasks.faq_question_answering, model=self.model_id) + result_before = pipeline_ins(self.param) + self.assertEqual(result_before['output'][0][0]['label'], '1') + self.assertAlmostEqual( + result_before['output'][0][0]['score'], 0.2, delta=0.2) + pipeline_ins = pipeline( + task=Tasks.faq_question_answering, model=output_dir) + result_after = pipeline_ins(self.param) + self.assertEqual(result_after['output'][0][0]['label'], '1') + self.assertAlmostEqual( + result_after['output'][0][0]['score'], 0.8, delta=0.2) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/trainers/test_finetune_gpt3.py b/tests/trainers/test_finetune_gpt3.py index 7a9e03d0..563d271c 100644 --- a/tests/trainers/test_finetune_gpt3.py +++ b/tests/trainers/test_finetune_gpt3.py @@ -52,6 +52,16 @@ class TestFinetuneTextGeneration(unittest.TestCase): 'batch_size_per_gpu': 16, 'workers_per_gpu': 1 } + cfg.train.hooks.append({ + 'type': 'EvaluationHook', + 'by_epoch': True, + 'interval': 1 + }) + cfg.evaluation.dataloader = { + 'batch_size_per_gpu': 8, + 'workers_per_gpu': 1 + } + cfg.evaluation.metrics = 'ppl' return cfg kwargs = dict( @@ -73,6 +83,7 @@ class TestFinetuneTextGeneration(unittest.TestCase): def test_finetune_dureader(self): # DuReader_robust-QG is an example data set, # users can also use their own data set for training + dataset_dict = MsDataset.load('DuReader_robust-QG') train_dataset = dataset_dict['train'].remap_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) \ @@ -81,6 +92,7 @@ class TestFinetuneTextGeneration(unittest.TestCase): .map(lambda example: {'src_txt': example['src_txt'].replace('[SEP]', '') + '\n'}) max_epochs = 10 + tmp_dir = './gpt3_dureader' num_warmup_steps = 200 @@ -98,7 +110,7 @@ class TestFinetuneTextGeneration(unittest.TestCase): 'by_epoch': False } } - cfg.train.optimizer = {'type': 'AdamW', 'lr': 3e-4} + cfg.train.optimizer = {'type': 'AdamW', 'lr': 1e-4} cfg.train.dataloader = { 'batch_size_per_gpu': 16, 'workers_per_gpu': 1 diff --git a/tests/trainers/test_finetune_mgeo.py b/tests/trainers/test_finetune_mgeo.py new file mode 100644 index 00000000..b492497b --- /dev/null +++ b/tests/trainers/test_finetune_mgeo.py @@ -0,0 +1,291 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +import tempfile +import unittest +from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union + +import torch +from transformers.tokenization_utils_base import PreTrainedTokenizerBase + +from modelscope.metainfo import Preprocessors, Trainers +from modelscope.models import Model +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.test_utils import test_level + + +class TestFinetuneMGeo(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def finetune(self, + model_id, + train_dataset, + eval_dataset, + name=Trainers.nlp_text_ranking_trainer, + cfg_modify_fn=None, + **kwargs): + kwargs = dict( + model=model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_modify_fn=cfg_modify_fn, + **kwargs) + + os.environ['LOCAL_RANK'] = '0' + trainer = build_trainer(name=name, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + + @unittest.skipUnless(test_level() >= 4, 'skip test in current test level') + def test_finetune_geotes_rerank(self): + + def cfg_modify_fn(cfg): + neg_sample = 19 + cfg.task = 'text-ranking' + cfg['preprocessor'] = {'type': 'mgeo-ranking'} + cfg.train.optimizer.lr = 5e-5 + cfg['dataset'] = { + 'train': { + 'type': 'mgeo', + 'query_sequence': 'query', + 'pos_sequence': 'positive_passages', + 'neg_sequence': 'negative_passages', + 'text_fileds': 
['text', 'gis'], + 'qid_field': 'query_id', + 'neg_sample': neg_sample, + 'sequence_length': 64 + }, + 'val': { + 'type': 'mgeo', + 'query_sequence': 'query', + 'pos_sequence': 'positive_passages', + 'neg_sequence': 'negative_passages', + 'text_fileds': ['text', 'gis'], + 'qid_field': 'query_id' + }, + } + cfg.evaluation.dataloader.batch_size_per_gpu = 16 + cfg.train.dataloader.batch_size_per_gpu = 3 + cfg.train.dataloader.workers_per_gpu = 16 + cfg.evaluation.dataloader.workers_per_gpu = 16 + + cfg['evaluation']['metrics'] = 'mrr@1' + cfg.train.max_epochs = 1 + cfg.model['neg_sample'] = neg_sample + cfg.model['gis_num'] = 2 + cfg.model['finetune_mode'] = 'multi-modal' + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 100 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': True + }] + # lr_scheduler configuration + + cfg.train.lr_scheduler = { + 'type': + 'LinearLR', + 'start_factor': + 1.0, + 'end_factor': + 0.5, + 'total_iters': + int(len(train_ds) / cfg.train.dataloader.batch_size_per_gpu) + * cfg.train.max_epochs, + 'options': { + 'warmup': { + 'type': + 'LinearWarmup', + 'warmup_iters': + int( + len(train_ds) + / cfg.train.dataloader.batch_size_per_gpu) + }, + 'by_epoch': False + } + } + + return cfg + + # load dataset + train_dataset = MsDataset.load( + 'GeoGLUE', + subset_name='GeoTES-rerank', + split='train', + namespace='damo') + dev_dataset = MsDataset.load( + 'GeoGLUE', + subset_name='GeoTES-rerank', + split='validation', + namespace='damo') + + dataset = MsDataset.load( + 'json', + data_files={ + 'train': [train_dataset['train'] + '/train.json'], + 'test': [dev_dataset['validation'] + '/dev.json'] + }) + train_ds = dataset['train'].to_hf_dataset() + dev_ds = dataset['test'].to_hf_dataset() + + model_id = 'damo/mgeo_backbone_chinese_base' + self.finetune( + model_id=model_id, + train_dataset=train_ds, + eval_dataset=dev_ds, + cfg_modify_fn=cfg_modify_fn, + name=Trainers.mgeo_ranking_trainer) + + output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) + print(f'model is saved to {output_dir}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_finetune_geoeag(self): + + def cfg_modify_fn(cfg): + cfg.task = Tasks.sentence_similarity + cfg['preprocessor'] = {'type': Preprocessors.sen_sim_tokenizer} + + cfg.train.dataloader.batch_size_per_gpu = 64 + cfg.evaluation.dataloader.batch_size_per_gpu = 64 + cfg.train.optimizer.lr = 2e-5 + cfg.train.max_epochs = 1 + + cfg['dataset'] = { + 'train': { + 'labels': ['not_match', 'partial_match', 'exact_match'], + 'first_sequence': 'sentence1', + 'second_sequence': 'sentence2', + 'label': 'label', + 'sequence_length': 128 + } + } + cfg['evaluation']['metrics'] = 'seq-cls-metric' + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 100 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': True + }] + cfg.train.lr_scheduler.total_iters = int( + len(train_dataset) / 32) * cfg.train.max_epochs + return cfg + + # load dataset + train_dataset = MsDataset.load( + 'GeoGLUE', subset_name='GeoEAG', split='train', namespace='damo') + dev_dataset = MsDataset.load( + 'GeoGLUE', + subset_name='GeoEAG', + split='validation', + namespace='damo') + + model_id = 'damo/mgeo_backbone_chinese_base' + self.finetune( + model_id=model_id, + train_dataset=train_dataset['train'], + eval_dataset=dev_dataset['validation'], + 
cfg_modify_fn=cfg_modify_fn, + name='nlp-base-trainer') + + output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) + print(f'model is saved to {output_dir}') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_finetune_geoeta(self): + + def cfg_modify_fn(cfg): + cfg.task = 'token-classification' + cfg['dataset'] = { + 'train': { + 'labels': label_enumerate_values, + 'first_sequence': 'tokens', + 'label': 'ner_tags', + 'sequence_length': 128 + } + } + cfg['preprocessor'] = { + 'type': 'token-cls-tokenizer', + 'padding': 'max_length' + } + cfg.train.max_epochs = 1 + cfg.train.dataloader.batch_size_per_gpu = 32 + cfg.train.optimizer.lr = 3e-5 + cfg.train.hooks = [{ + 'type': 'CheckpointHook', + 'interval': 1 + }, { + 'type': 'TextLoggerHook', + 'interval': 100 + }, { + 'type': 'IterTimerHook' + }, { + 'type': 'EvaluationHook', + 'by_epoch': True + }] + cfg.train.lr_scheduler.total_iters = int( + len(train_dataset) / 32) * cfg.train.max_epochs + + return cfg + + def get_label_list(labels): + unique_labels = set() + for label in labels: + unique_labels = unique_labels | set(label) + label_list = list(unique_labels) + label_list.sort() + return label_list + + # load dataset + train_dataset = MsDataset.load( + 'GeoGLUE', subset_name='GeoETA', split='train', namespace='damo') + dev_dataset = MsDataset.load( + 'GeoGLUE', + subset_name='GeoETA', + split='validation', + namespace='damo') + + label_enumerate_values = get_label_list( + train_dataset._hf_ds['train']['ner_tags'] + + dev_dataset._hf_ds['validation']['ner_tags']) + + model_id = 'damo/mgeo_backbone_chinese_base' + self.finetune( + model_id=model_id, + train_dataset=train_dataset['train'], + eval_dataset=dev_dataset['validation'], + cfg_modify_fn=cfg_modify_fn, + name='nlp-base-trainer') + + output_dir = os.path.join(self.tmp_dir, ModelFile.TRAIN_OUTPUT_DIR) + print(f'model is saved to {output_dir}') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_finetune_text_generation.py b/tests/trainers/test_finetune_text_generation.py index 9981e228..591d4a0b 100644 --- a/tests/trainers/test_finetune_text_generation.py +++ b/tests/trainers/test_finetune_text_generation.py @@ -142,7 +142,6 @@ class TestFinetuneTextGeneration(unittest.TestCase): 'tgt_txt' }) num_warmup_steps = 200 - os.environ['LOCAL_RANK'] = '0' def noam_lambda(current_step: int): current_step += 1 @@ -166,7 +165,7 @@ class TestFinetuneTextGeneration(unittest.TestCase): work_dir=self.tmp_dir, cfg_modify_fn=cfg_modify_fn) trainer = build_trainer( - name=Trainers.nlp_base_trainer, default_args=kwargs) + name=Trainers.text_generation_trainer, default_args=kwargs) trainer.train() diff --git a/tests/trainers/test_general_image_classification_trainer.py b/tests/trainers/test_general_image_classification_trainer.py index e91bde18..6aba96dc 100644 --- a/tests/trainers/test_general_image_classification_trainer.py +++ b/tests/trainers/test_general_image_classification_trainer.py @@ -91,6 +91,87 @@ class TestGeneralImageClassificationTestTrainer(unittest.TestCase): result = trainer.evaluate() print(result) + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_convnext_garbage_train(self): + model_id = 'damo/cv_convnext-base_image-classification_garbage' + + def cfg_modify_fn(cfg): + cfg.train.dataloader.batch_size_per_gpu = 16 + cfg.train.dataloader.workers_per_gpu = 1 + cfg.train.max_epochs = self.max_epochs + cfg.model.mm_model.head.num_classes = 2 + cfg.train.optimizer.lr = 
1e-4 + cfg.train.lr_config.warmup_iters = 1 + cfg.train.evaluation.metric_options = {'topk': (1, )} + cfg.evaluation.metric_options = {'topk': (1, )} + return cfg + + kwargs = dict( + model=model_id, + work_dir=self.tmp_dir, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + cfg_modify_fn=cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.image_classification, default_args=kwargs) + trainer.train() + + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_convnext_garbage_eval(self): + model_id = 'damo/cv_convnext-base_image-classification_garbage' + + kwargs = dict( + model=model_id, + work_dir=self.tmp_dir, + train_dataset=None, + eval_dataset=self.eval_dataset) + + trainer = build_trainer( + name=Trainers.image_classification, default_args=kwargs) + result = trainer.evaluate() + print(result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_beitv2_train_eval(self): + model_id = 'damo/cv_beitv2-base_image-classification_patch16_224_pt1k_ft22k_in1k' + + def cfg_modify_fn(cfg): + cfg.train.dataloader.batch_size_per_gpu = 16 + cfg.train.dataloader.workers_per_gpu = 1 + cfg.train.max_epochs = self.max_epochs + cfg.model.mm_model.head.num_classes = 2 + cfg.model.mm_model.head.loss.num_classes = 2 + cfg.train.optimizer.lr = 1e-4 + cfg.train.lr_config.warmup_iters = 1 + cfg.train.evaluation.metric_options = {'topk': (1, )} + cfg.evaluation.metric_options = {'topk': (1, )} + return cfg + + kwargs = dict( + model=model_id, + work_dir=self.tmp_dir, + train_dataset=self.train_dataset, + eval_dataset=self.eval_dataset, + cfg_modify_fn=cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.image_classification, default_args=kwargs) + trainer.train() + + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + result = trainer.evaluate() + print(result) + if __name__ == '__main__': unittest.main() diff --git a/tests/trainers/test_image_deblur_trainer.py b/tests/trainers/test_image_deblur_trainer.py new file mode 100644 index 00000000..6ae88726 --- /dev/null +++ b/tests/trainers/test_image_deblur_trainer.py @@ -0,0 +1,87 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.models.cv.image_deblur import NAFNetForImageDeblur +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets.gopro_image_deblurring_dataset import \ + GoproImageDeblurringDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import DownloadMode, ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageDeblurTrainerTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + self.model_id = 'damo/cv_nafnet_image-deblur_gopro' + self.cache_path = snapshot_download(self.model_id) + self.config = Config.from_file( + os.path.join(self.cache_path, ModelFile.CONFIGURATION)) + dataset_train = MsDataset.load( + 'GOPRO', + namespace='damo', + subset_name='default', + split='test', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + dataset_val = MsDataset.load( + 'GOPRO', + namespace='damo', + subset_name='subset', + split='test', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + self.dataset_train = GoproImageDeblurringDataset( + dataset_train, self.config.dataset, is_train=True) + self.dataset_val = GoproImageDeblurringDataset( + dataset_val, self.config.dataset, is_train=False) + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer(self): + kwargs = dict( + model=self.model_id, + train_dataset=self.dataset_train, + eval_dataset=self.dataset_val, + work_dir=self.tmp_dir) + trainer = build_trainer(default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(1): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer_with_model_and_args(self): + model = NAFNetForImageDeblur.from_pretrained(self.cache_path) + kwargs = dict( + cfg_file=os.path.join(self.cache_path, ModelFile.CONFIGURATION), + model=model, + train_dataset=self.dataset_train, + eval_dataset=self.dataset_val, + max_epochs=1, + work_dir=self.tmp_dir) + trainer = build_trainer(default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(1): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_image_defrcn_fewshot_trainer.py b/tests/trainers/test_image_defrcn_fewshot_trainer.py new file mode 100644 index 00000000..d007e23c --- /dev/null +++ b/tests/trainers/test_image_defrcn_fewshot_trainer.py @@ -0,0 +1,70 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import subprocess +import sys +import tempfile +import unittest + +from modelscope.hub.utils.utils import get_cache_dir +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import DownloadMode +from modelscope.utils.test_utils import test_level + + +class TestImageDefrcnFewShotTrainer(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + cmd = [ + sys.executable, '-m', 'pip', 'install', 'detectron2==0.3', '-f', + 'https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html' + ] + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + self.model_id = 'damo/cv_resnet101_detection_fewshot-defrcn' + + data_voc = MsDataset.load( + dataset_name='VOC_fewshot', + namespace='shimin2023', + split='train', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) + self.data_dir = os.path.join( + data_voc.config_kwargs['split_config']['train'], 'data') + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer(self): + + split = 1 + kwargs = dict( + model=self.model_id, + data_dir=self.data_dir, + work_dir=self.tmp_dir, + model_weights=os.path.join(get_cache_dir(), self.model_id, + 'ImageNetPretrained/MSRA/R-101.pkl'), + data_type='pascal_voc', + config_path='defrcn_det_r101_base{}.yaml'.format(split), + datasets_train=('voc_2007_trainval_base{}'.format(split), + 'voc_2012_trainval_base{}'.format(split)), + datasets_test=('voc_2007_test_base{}'.format(split), )) + trainer = build_trainer( + name=Trainers.image_fewshot_detection, default_args=kwargs) + trainer.train() + + results_files = os.listdir(self.tmp_dir) + self.assertIn('metrics.json', results_files) + self.assertIn('model_final.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_movie_scene_segmentation_trainer.py b/tests/trainers/test_movie_scene_segmentation_trainer.py index f25dc92a..d598a63f 100644 --- a/tests/trainers/test_movie_scene_segmentation_trainer.py +++ b/tests/trainers/test_movie_scene_segmentation_trainer.py @@ -44,7 +44,6 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): self.train_dataset = MsDataset.load( dataset_name=train_data_cfg.name, split=train_data_cfg.split, - namespace=train_data_cfg.namespace, cfg=train_data_cfg.cfg, test_mode=train_data_cfg.test_mode) assert next( @@ -53,7 +52,6 @@ class TestImageInstanceSegmentationTrainer(unittest.TestCase): self.test_dataset = MsDataset.load( dataset_name=test_data_cfg.name, split=test_data_cfg.split, - namespace=test_data_cfg.namespace, cfg=test_data_cfg.cfg, test_mode=test_data_cfg.test_mode) assert next( diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index ab2b8cc6..f4ca7bcb 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -37,7 +37,7 @@ class TestOfaTrainer(unittest.TestCase): 'train': {'work_dir': 'work/ckpts/recognition', # 'launcher': 'pytorch', 'max_epochs': 1, - 'use_fp16': True, + 'use_fp16': False, 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, 'lr_scheduler': {'name': 'polynomial_decay', 'warmup_proportion': 0.01, diff --git 
a/tests/trainers/test_table_question_answering_trainer.py b/tests/trainers/test_table_question_answering_trainer.py new file mode 100644 index 00000000..b168dcfc --- /dev/null +++ b/tests/trainers/test_table_question_answering_trainer.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import unittest + +import json + +from modelscope.msdatasets import MsDataset +from modelscope.trainers.nlp.table_question_answering_trainer import \ + TableQuestionAnsweringTrainer +from modelscope.utils.constant import DownloadMode, ModelFile +from modelscope.utils.test_utils import test_level + + +class TableQuestionAnsweringTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_with_model_name(self): + # load data + input_dataset = MsDataset.load( + 'ChineseText2SQL', download_mode=DownloadMode.FORCE_REDOWNLOAD) + train_dataset = [] + for name in input_dataset['train']._hf_ds.data[1]: + train_dataset.append(json.load(open(str(name), 'r'))) + eval_dataset = [] + for name in input_dataset['test']._hf_ds.data[1]: + eval_dataset.append(json.load(open(str(name), 'r'))) + print('size of training set', len(train_dataset)) + print('size of evaluation set', len(eval_dataset)) + + model_id = 'damo/nlp_convai_text2sql_pretrain_cn' + trainer = TableQuestionAnsweringTrainer( + model=model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + ) + trainer.train( + batch_size=8, + total_epoches=2, + ) + trainer.evaluate( + checkpoint_path=os.path.join(trainer.model.model_dir, + 'finetuned_model.bin')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_tinynas_damoyolo_trainer.py b/tests/trainers/test_tinynas_damoyolo_trainer.py new file mode 100644 index 00000000..cf7760d2 --- /dev/null +++ b/tests/trainers/test_tinynas_damoyolo_trainer.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import DistributedTestCase, test_level + + +def _setup(): + model_id = 'damo/cv_tinynas_object-detection_damoyolo' + cache_path = snapshot_download(model_id) + return cache_path + + +class TestTinynasDamoyoloTrainerSingleGPU(unittest.TestCase): + + def setUp(self): + self.model_id = 'damo/cv_tinynas_object-detection_damoyolo' + self.cache_path = _setup() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch_singleGPU(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'configuration.json'), + gpu_ids=[ + 0, + ], + batch_size=2, + max_epochs=3, + num_classes=80, + base_lr_per_img=0.001, + cache_path=self.cache_path, + train_image_dir='./data/test/images/image_detection/images', + val_image_dir='./data/test/images/image_detection/images', + train_ann= + './data/test/images/image_detection/annotations/coco_sample.json', + val_ann= + './data/test/images/image_detection/annotations/coco_sample.json', + ) + trainer = build_trainer( + name=Trainers.tinynas_damoyolo, default_args=kwargs) + trainer.train() + trainer.evaluate( + checkpoint_path=os.path.join(self.cache_path, + 'damoyolo_tinynasL25_S.pt')) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch_singleGPU_model_id(self): + kwargs = dict( + model=self.model_id, + gpu_ids=[ + 0, + ], + batch_size=2, + max_epochs=3, + num_classes=80, + load_pretrain=True, + base_lr_per_img=0.001, + train_image_dir='./data/test/images/image_detection/images', + val_image_dir='./data/test/images/image_detection/images', + train_ann= + './data/test/images/image_detection/annotations/coco_sample.json', + val_ann= + './data/test/images/image_detection/annotations/coco_sample.json', + ) + trainer = build_trainer( + name=Trainers.tinynas_damoyolo, default_args=kwargs) + trainer.train() + trainer.evaluate( + checkpoint_path=os.path.join(self.cache_path, + 'damoyolo_tinynasL25_S.pt')) + + @unittest.skip('multiGPU test is verified offline') + def test_trainer_from_scratch_multiGPU(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'configuration.json'), + gpu_ids=[ + 0, + 1, + ], + batch_size=32, + max_epochs=3, + num_classes=1, + cache_path=self.cache_path, + train_image_dir='./data/test/images/image_detection/images', + val_image_dir='./data/test/images/image_detection/images', + train_ann= + './data/test/images/image_detection/annotations/coco_sample.json', + val_ann= + './data/test/images/image_detection/annotations/coco_sample.json') + trainer = build_trainer( + name=Trainers.tinynas_damoyolo, default_args=kwargs) + trainer.train() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_finetune_singleGPU(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'configuration.json'), + gpu_ids=[ + 0, + ], + batch_size=16, + max_epochs=3, + num_classes=1, + load_pretrain=True, + pretrain_model=os.path.join(self.cache_path, + 'damoyolo_tinynasL25_S.pt'), + cache_path=self.cache_path, + train_image_dir='./data/test/images/image_detection/images', + 
val_image_dir='./data/test/images/image_detection/images', + train_ann= + './data/test/images/image_detection/annotations/coco_sample.json', + val_ann= + './data/test/images/image_detection/annotations/coco_sample.json') + trainer = build_trainer( + name=Trainers.tinynas_damoyolo, default_args=kwargs) + trainer.train() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_trainer.py b/tests/trainers/test_trainer.py index 660355bc..c692196a 100644 --- a/tests/trainers/test_trainer.py +++ b/tests/trainers/test_trainer.py @@ -141,7 +141,6 @@ class TrainerTest(unittest.TestCase): config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) with open(config_path, 'w') as f: json.dump(json_cfg, f) - trainer_name = Trainers.default kwargs = dict( cfg_file=config_path, @@ -157,6 +156,10 @@ class TrainerTest(unittest.TestCase): results_files = os.listdir(self.tmp_dir) self.assertIn(f'{trainer.timestamp}.log.json', results_files) + with open(f'{self.tmp_dir}/{trainer.timestamp}.log', 'r') as infile: + lines = infile.readlines() + self.assertTrue(len(lines) > 20) + self.assertIn(f'{trainer.timestamp}.log', results_files) self.assertIn(f'{LogKeys.EPOCH}_1.pth', results_files) self.assertIn(f'{LogKeys.EPOCH}_2.pth', results_files) self.assertIn(f'{LogKeys.EPOCH}_3.pth', results_files) diff --git a/tests/trainers/test_training_args.py b/tests/trainers/test_training_args.py new file mode 100644 index 00000000..0aad9ddc --- /dev/null +++ b/tests/trainers/test_training_args.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import cv2 +import json +import numpy as np +import torch + +from modelscope.trainers.training_args import (ArgAttr, CliArgumentParser, + training_args) +from modelscope.utils.test_utils import test_level + + +class TrainingArgsTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + + def tearDown(self): + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_define_args(self): + myparser = CliArgumentParser(training_args) + input_args = [ + '--max_epochs', '100', '--work_dir', 'ddddd', '--train_batch_size', + '8', '--unkown', 'unkown' + ] + args, remainning = myparser.parse_known_args(input_args) + myparser.print_help() + self.assertTrue(args.max_epochs == 100) + self.assertTrue(args.work_dir == 'ddddd') + self.assertTrue(args.train_batch_size == 8) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_new_args(self): + training_args.num_classes = ArgAttr( + 'model.mm_model.head.num_classes', + type=int, + help='number of classes') + training_args.mean = ArgAttr( + 'train.data.mean', help='3-dim mean vector') + training_args.flip = ArgAttr('train.data.flip', help='flip or not') + training_args.img_size = ArgAttr( + 'train.data.img_size', help='image size') + myparser = CliArgumentParser(training_args) + input_args = [ + '--max_epochs', '100', '--work_dir', 'ddddd', '--train_batch_size', + '8', '--num_classes', '10', '--mean', '[125.0,125.0,125.0]', + '--flip', 'false', '--img_size', '(640,640)' + ] + args, remainning = myparser.parse_known_args(input_args) + myparser.print_help() + self.assertTrue(args.max_epochs == 100) + self.assertTrue(args.work_dir == 'ddddd') + self.assertTrue(args.train_batch_size == 8) + self.assertTrue(args.num_classes == 10) + self.assertTrue(len(args.mean) == 3) + 
self.assertTrue(not args.flip) + self.assertAlmostEqual(args.mean[0], 125.0) + self.assertAlmostEqual(args.img_size, (640, 640)) + + cfg_dict = myparser.get_cfg_dict(args=input_args) + self.assertTrue(cfg_dict['model.mm_model.head.num_classes'] == 10) + self.assertAlmostEqual(cfg_dict['train.data.mean'], + [125.0, 125.0, 125.0]) + self.assertTrue(not cfg_dict['train.data.flip']) + self.assertEqual(cfg_dict['train.dataloader.batch_size_per_gpu'], 8) + self.assertEqual(cfg_dict['train.work_dir'], 'ddddd') + self.assertEqual(cfg_dict['train.max_epochs'], 100) + self.assertEqual(cfg_dict['train.data.img_size'], (640, 640)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/utils/plugins/.modelscope_plugins b/tests/utils/plugins/.modelscope_plugins new file mode 100644 index 00000000..421376db --- /dev/null +++ b/tests/utils/plugins/.modelscope_plugins @@ -0,0 +1 @@ +dummy diff --git a/tests/utils/plugins/dummy/__init__.py b/tests/utils/plugins/dummy/__init__.py new file mode 100644 index 00000000..a0d86001 --- /dev/null +++ b/tests/utils/plugins/dummy/__init__.py @@ -0,0 +1 @@ +import dummy.dummy_model diff --git a/tests/utils/plugins/dummy/dummy_model.py b/tests/utils/plugins/dummy/dummy_model.py new file mode 100644 index 00000000..8a89c12e --- /dev/null +++ b/tests/utils/plugins/dummy/dummy_model.py @@ -0,0 +1,8 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.models.base import Model +from modelscope.models.builder import MODELS + + +@MODELS.register_module(group_key='dummy-group', module_name='dummy-model') +class DummyModel(Model): + pass diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index 850945b9..5aafdfc7 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -10,7 +10,8 @@ from pathlib import Path from modelscope.utils.ast_utils import (FILES_MTIME_KEY, INDEX_KEY, MD5_KEY, MODELSCOPE_PATH_KEY, REQUIREMENT_KEY, VERSION_KEY, AstScaning, - FilesAstScaning, load_index) + FilesAstScaning, generate_ast_template, + load_from_prebuilt, load_index) p = Path(__file__) @@ -134,6 +135,14 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(output[VERSION_KEY], str) self.assertIsInstance(output[FILES_MTIME_KEY], dict) + # generate ast_template + file_path = os.path.join(self.tmp_dir, 'index_file.py') + index = generate_ast_template(file_path=file_path, force_rebuild=False) + self.assertTrue(os.path.exists(file_path)) + self.assertEqual(output, index) + index_from_prebuilt = load_from_prebuilt(file_path) + self.assertEqual(index, index_from_prebuilt) + def test_update_load_index_method(self): file_number = 20 file_list = [] diff --git a/tests/utils/test_plugin.py b/tests/utils/test_plugin.py new file mode 100644 index 00000000..40d86f9d --- /dev/null +++ b/tests/utils/test_plugin.py @@ -0,0 +1,41 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +from modelscope.models.builder import MODELS +from modelscope.utils.plugins import (discover_plugins, import_all_plugins, + import_file_plugins, import_plugins, + pushd) + + +class PluginTest(unittest.TestCase): + + def setUp(self): + self.plugins_root = 'tests/utils/plugins/' + + def test_no_plugins(self): + available_plugins = set(discover_plugins()) + assert available_plugins == set() + + def test_file_plugins(self): + with pushd(self.plugins_root): + available_plugins = set(discover_plugins()) + assert available_plugins == {'dummy'} + + import_file_plugins() + assert MODELS.get('dummy-model', 'dummy-group') is not None + + def test_custom_plugins(self): + with pushd(self.plugins_root): + available_plugins = set(discover_plugins()) + assert available_plugins == {'dummy'} + + import_plugins(['dummy']) + assert MODELS.get('dummy-model', 'dummy-group') is not None + + def test_all_plugins(self): + with pushd(self.plugins_root): + available_plugins = set(discover_plugins()) + assert available_plugins == {'dummy'} + + import_all_plugins() + assert MODELS.get('dummy-model', 'dummy-group') is not None
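A short usage sketch of the plugin fixture added above, assuming only what the patch shows (tests/utils/plugins/.modelscope_plugins lists the dummy package, and importing it registers DummyModel under group 'dummy-group' and name 'dummy-model'); the final printout is illustrative:

from modelscope.models.builder import MODELS
from modelscope.utils.plugins import import_file_plugins, pushd

PLUGIN_ROOT = 'tests/utils/plugins/'  # holds .modelscope_plugins listing 'dummy'

with pushd(PLUGIN_ROOT):
    # .modelscope_plugins in the current directory names the packages to load;
    # importing 'dummy' pulls in dummy_model, which registers DummyModel
    import_file_plugins()

model_cls = MODELS.get('dummy-model', 'dummy-group')
assert model_cls is not None
print(f'plugin model registered: {model_cls.__name__}')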