Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-16 16:27:45 +01:00)

Commit: add 1.6
@@ -1,13 +1,12 @@
import os
from dataclasses import dataclass, field

from modelscope import MsDataset, TrainingArgs
from modelscope.metainfo import Trainers
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.trainers.builder import build_trainer
from modelscope.trainers.training_args import TrainingArgs


@dataclass
@dataclass(init=False)
class ImageClassificationTrainingArgs(TrainingArgs):
num_classes: int = field(
default=None,
@@ -46,26 +45,35 @@ def create_dataset(name, split):
dataset_name, namespace=namespace, subset_name='default', split=split)


def train():
args = ImageClassificationTrainingArgs.from_cli(
training_args = ImageClassificationTrainingArgs(
model='damo/cv_vit-base_image-classification_ImageNet-labels',
max_epochs=1,
lr=1e-4,
optimizer='AdamW',
warmup_iters=1,
topk=(1, ))
if args.dataset_name is not None:
train_dataset = create_dataset(args.dataset_name, split='train')
val_dataset = create_dataset(args.dataset_name, split='validation')
topk=(1, )).parse_cli()
config, args = training_args.to_config()


def cfg_modify_fn(cfg):
if args.use_model_config:
cfg.merge_from_dict(config)
else:
train_dataset = create_dataset(args.train_dataset_name, split='train')
val_dataset = create_dataset(args.val_dataset_name, split='validation')
cfg = config
return cfg


def train():
train_dataset = create_dataset(
training_args.train_dataset_name, split=training_args.train_split)
val_dataset = create_dataset(
training_args.val_dataset_name, split=training_args.val_split)

kwargs = dict(
model=args.model, # model id
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset, # validation dataset
cfg_modify_fn=args # callback to modify configuration
cfg_modify_fn=cfg_modify_fn # callback to modify configuration
)

# in distributed training, specify pytorch launcher

@@ -2,4 +2,7 @@ PYTHONPATH=. python -m torch.distributed.launch --nproc_per_node=2 \
examples/pytorch/image_classification/finetune_image_classification.py \
--num_classes 2 \
--train_dataset_name 'tany0699/cats_and_dogs' \
--val_dataset_name 'tany0699/cats_and_dogs'
--val_dataset_name 'tany0699/cats_and_dogs' \
--train_split train \
--val_split validation \
--use_model_config true \

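A quick orientation sketch of the pattern these examples migrate to: `TrainingArgs.from_cli()` is replaced by constructing the argument dataclass, calling `parse_cli()`, splitting the result with `to_config()`, and handing a `cfg_modify_fn` callback to the trainer. The dataclass name below is illustrative; the model id is the one used in the hunk above.

from dataclasses import dataclass, field
from modelscope import TrainingArgs


@dataclass(init=False)
class MyImageClsArgs(TrainingArgs):
    # Illustrative extra CLI field, mirroring num_classes in the hunk above.
    num_classes: int = field(
        default=None, metadata={'help': 'The number of classes'})


# Defaults set in code, then overridden by whatever flags appear on the CLI.
training_args = MyImageClsArgs(
    model='damo/cv_vit-base_image-classification_ImageNet-labels',
    max_epochs=1).parse_cli()
# to_config() splits the parsed arguments into a Config object plus the plain
# argument namespace.
config, args = training_args.to_config()


def cfg_modify_fn(cfg):
    # Either keep the model's own configuration and merge the CLI overrides in,
    # or replace it wholesale with the CLI-built config.
    if args.use_model_config:
        cfg.merge_from_dict(config)
    else:
        cfg = config
    return cfg
# cfg_modify_fn is then passed to build_trainer(...) via default_args,
# exactly as the kwargs block above shows.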
@@ -1,15 +1,13 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
|
||||
from modelscope import MsDataset, TrainingArgs
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import build_trainer
|
||||
from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
|
||||
set_flatten_value)
|
||||
from modelscope.trainers.training_args import set_flatten_value
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
|
||||
trainer: str = field(
|
||||
@@ -17,6 +15,12 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
'help': 'The trainer used',
|
||||
})
|
||||
|
||||
work_dir: str = field(
|
||||
default='./tmp',
|
||||
metadata={
|
||||
'help': 'The working path for saving checkpoint',
|
||||
})
|
||||
|
||||
use_fp16: bool = field(
|
||||
default=None,
|
||||
metadata={
|
||||
@@ -35,7 +39,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'train.optimizer_hparams',
|
||||
'cfg_getter': partial(get_flatten_value, exclusions=['lr']),
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The optimizer init params except `lr`',
|
||||
})
|
||||
@@ -51,7 +54,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'dataset.column_map',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The column map for dataset',
|
||||
})
|
||||
@@ -67,7 +69,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'train.lr_scheduler_hook',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The parameters for lr scheduler hook',
|
||||
})
|
||||
@@ -76,7 +77,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'train.optimizer_hook',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The parameters for optimizer hook',
|
||||
})
|
||||
@@ -92,23 +92,28 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
'help': 'The data parallel world size',
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
config.merge_from_dict({'pretrained_model.model_name': self.model})
|
||||
if self.clip_clamp:
|
||||
config.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
|
||||
if self.world_size > 1:
|
||||
config.train.launcher = 'pytorch'
|
||||
return config
|
||||
|
||||
config, args = MultiModalEmbeddingArguments().parse_cli().to_config()
|
||||
print(config, args)
|
||||
|
||||
|
||||
args = MultiModalEmbeddingArguments.from_cli(task='multi-modal-embedding')
|
||||
print(args)
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
cfg.merge_from_dict({'pretrained_model.model_name': args.model})
|
||||
if args.clip_clamp:
|
||||
cfg.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
|
||||
if args.world_size > 1:
|
||||
cfg.train.launcher = 'pytorch'
|
||||
return cfg
|
||||
|
||||
|
||||
train_dataset = MsDataset.load(
|
||||
args.dataset_name, namespace='modelscope', split='train')
|
||||
args.train_dataset_name, namespace='modelscope', split='train')
|
||||
eval_dataset = MsDataset.load(
|
||||
args.dataset_name, namespace='modelscope', split='validation')
|
||||
args.train_dataset_name, namespace='modelscope', split='validation')
|
||||
|
||||
os.makedirs(args.work_dir, exist_ok=True)
|
||||
kwargs = dict(
|
||||
@@ -116,6 +121,6 @@ kwargs = dict(
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
work_dir=args.work_dir,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
trainer = build_trainer(name=args.trainer, default_args=kwargs)
|
||||
trainer.train()
|
||||
|
||||
@@ -6,14 +6,16 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
|
||||
--trainer 'clip-multi-modal-embedding' \
|
||||
--work_dir './workspace/ckpts/clip' \
|
||||
--model 'damo/multi-modal_clip-vit-base-patch16_zh' \
|
||||
--dataset_name 'muge' \
|
||||
--train_dataset_name 'muge' \
|
||||
--dataset_column_map 'img=image,text=query' \
|
||||
--max_epochs 1 \
|
||||
--use_fp16 true \
|
||||
--per_device_train_batch_size 180 \
|
||||
--train_data_worker 0 \
|
||||
--train_shuffle true \
|
||||
--train_drop_last true \
|
||||
--per_device_eval_batch_size 128 \
|
||||
--eval_data_worker 0 \
|
||||
--eval_shuffle true \
|
||||
--eval_drop_last true \
|
||||
--save_ckpt_best true \
|
||||
@@ -33,3 +35,4 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
|
||||
--optimizer_hook 'type=TorchAMPOptimizerHook,cumulative_iters=1,loss_keys=loss' \
|
||||
--clip_clamp true \
|
||||
--world_size $DATA_PARALLEL_SIZE \
|
||||
--use_model_config true \
|
||||
|
||||
@@ -4,30 +4,32 @@ from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.training_args import TrainingArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
class StableDiffusionArguments(TrainingArgs):
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
config.train.lr_scheduler.T_max = self.max_epochs
|
||||
config.model.inference = False
|
||||
return config
|
||||
|
||||
|
||||
args = StableDiffusionArguments.from_cli(task='efficient-diffusion-tuning')
|
||||
training_args = TrainingArgs(task='efficient-diffusion-tuning').parse_cli()
|
||||
config, args = training_args.to_config()
|
||||
print(args)
|
||||
|
||||
dataset = MsDataset.load(args.dataset_name, namespace=args.namespace)
|
||||
dataset = MsDataset.load(
|
||||
args.train_dataset_name, namespace=args.train_dataset_namespace)
|
||||
train_dataset = dataset['train']
|
||||
validation_dataset = dataset['validation']
|
||||
|
||||
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
cfg.train.lr_scheduler.T_max = training_args.max_epochs
|
||||
cfg.model.inference = False
|
||||
return cfg
|
||||
|
||||
|
||||
kwargs = dict(
|
||||
model=args.model,
|
||||
work_dir=args.work_dir,
|
||||
model=training_args.model,
|
||||
work_dir=training_args.work_dir,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=validation_dataset,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
|
||||
trainer.train()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
PYTHONPATH=. torchrun examples/pytorch/stable_diffusion/finetune_stable_diffusion.py \
|
||||
--model 'damo/multi-modal_efficient-diffusion-tuning-lora' \
|
||||
--work_dir './tmp/stable_diffusion_tuning' \
|
||||
--namespace 'damo' \
|
||||
--dataset_name 'buptwq/lora-stable-diffusion-finetune-dog' \
|
||||
--max_epochs 150 \
|
||||
--train_dataset_namespace 'damo' \
|
||||
--train_dataset_name 'controlnet_dataset_condition_fill50k' \
|
||||
--max_epochs 1 \
|
||||
--save_ckpt_strategy 'by_epoch' \
|
||||
--logging_interval 100 \
|
||||
--train.dataloader.workers_per_gpu 0 \
|
||||
--evaluation.dataloader.workers_per_gpu 0 \
|
||||
--train.optimizer.lr 1e-4
|
||||
--train.optimizer.lr 1e-5 \
|
||||
--use_model_config true
|
||||
|
||||
@@ -1,26 +1,18 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.training_args import TrainingArgs
|
||||
from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
|
||||
build_dataset_from_file)
|
||||
from modelscope.trainers import build_trainer
|
||||
|
||||
|
||||
def get_labels(cfg, metadata):
|
||||
label2id = cfg.safe_get(metadata['cfg_node'])
|
||||
if label2id is not None:
|
||||
return ','.join(label2id.keys())
|
||||
|
||||
|
||||
def set_labels(cfg, labels, metadata):
|
||||
def set_labels(labels):
|
||||
if isinstance(labels, str):
|
||||
labels = labels.split(',')
|
||||
cfg.merge_from_dict(
|
||||
{metadata['cfg_node']: {label: id
|
||||
for id, label in enumerate(labels)}})
|
||||
return {label: id for id, label in enumerate(labels)}
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TextClassificationArguments(TrainingArgs):
|
||||
|
||||
first_sequence: str = field(
|
||||
@@ -49,7 +41,6 @@ class TextClassificationArguments(TrainingArgs):
|
||||
metadata={
|
||||
'help': 'The labels of the dataset',
|
||||
'cfg_node': 'preprocessor.label2id',
|
||||
'cfg_getter': get_labels,
|
||||
'cfg_setter': set_labels,
|
||||
})
|
||||
|
||||
@@ -60,30 +51,39 @@ class TextClassificationArguments(TrainingArgs):
|
||||
'cfg_node': 'preprocessor.type'
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
config.model['num_labels'] = len(self.labels)
|
||||
if config.train.lr_scheduler.type == 'LinearLR':
|
||||
config.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
|
||||
return config
|
||||
|
||||
config, args = TextClassificationArguments().parse_cli().to_config()
|
||||
|
||||
print(config, args)
|
||||
|
||||
|
||||
args = TextClassificationArguments.from_cli(
|
||||
task='text-classification', eval_metrics='seq-cls-metric')
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
cfg.model['num_labels'] = len(cfg.preprocessor.label2id)
|
||||
if cfg.train.lr_scheduler.type == 'LinearLR':
|
||||
cfg.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
|
||||
return cfg
|
||||
|
||||
print(args)
|
||||
|
||||
dataset = MsDataset.load(args.dataset_name, subset_name=args.subset_name)
|
||||
train_dataset = dataset['train']
|
||||
validation_dataset = dataset['validation']
|
||||
if args.dataset_json_file is None:
|
||||
dataset = MsDataset.load(
|
||||
args.train_dataset_name, subset_name=args.train_subset_name)
|
||||
train_dataset = dataset['train']
|
||||
validation_dataset = dataset['validation']
|
||||
else:
|
||||
train_dataset, validation_dataset = build_dataset_from_file(
|
||||
args.dataset_json_file)
|
||||
|
||||
kwargs = dict(
|
||||
model=args.model,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=validation_dataset,
|
||||
seed=args.seed,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
os.environ['LOCAL_RANK'] = str(args.local_rank)
|
||||
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
PYTHONPATH=. python examples/pytorch/text_classification/finetune_text_classification.py \
|
||||
--task 'text-classification' \
|
||||
--model 'damo/nlp_structbert_backbone_base_std' \
|
||||
--dataset_name 'clue' \
|
||||
--subset_name 'tnews' \
|
||||
--train_dataset_name 'clue' \
|
||||
--train_subset_name 'tnews' \
|
||||
--first_sequence 'sentence' \
|
||||
--preprocessor.label label \
|
||||
--model.num_labels 15 \
|
||||
--labels '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14' \
|
||||
--preprocessor 'sen-cls-tokenizer' \
|
||||
--use_model_config True \
|
||||
--max_epochs 1 \
|
||||
--train.dataloader.workers_per_gpu 0 \
|
||||
--evaluation.dataloader.workers_per_gpu 0 \
|
||||
--train.optimizer.lr 1e-5 \
|
||||
--eval_metrics 'seq-cls-metric' \
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from modelscope import EpochBasedTrainer, MsDataset, TrainingArgs
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.training_args import TrainingArgs
|
||||
from modelscope.trainers import build_trainer
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TextGenerationArguments(TrainingArgs):
|
||||
|
||||
trainer: str = field(
|
||||
@@ -67,30 +66,35 @@ class TextGenerationArguments(TrainingArgs):
|
||||
'help': 'Whether to use MegatronHook',
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
if config.train.lr_scheduler.type == 'noam':
|
||||
config.train.lr_scheduler = {
|
||||
'type': 'LambdaLR',
|
||||
'lr_lambda': noam_lambda,
|
||||
'options': {
|
||||
'by_epoch': False
|
||||
}
|
||||
}
|
||||
if self.use_megatron:
|
||||
config.train.hooks.append({'type': 'MegatronHook'})
|
||||
return config
|
||||
|
||||
|
||||
def noam_lambda(current_step: int):
|
||||
current_step += 1
|
||||
return min(current_step**(-0.5), current_step * 100**(-1.5))
|
||||
|
||||
|
||||
args = TextGenerationArguments.from_cli(task='text-generation')
|
||||
print(args)
|
||||
config, args = TextGenerationArguments().parse_cli().to_config()
|
||||
print(config, args)
|
||||
|
||||
dataset = MsDataset.load(args.dataset_name)
|
||||
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
if cfg.train.lr_scheduler.type == 'noam':
|
||||
cfg.train.lr_scheduler = {
|
||||
'type': 'LambdaLR',
|
||||
'lr_lambda': noam_lambda,
|
||||
'options': {
|
||||
'by_epoch': False
|
||||
}
|
||||
}
|
||||
if args.use_megatron:
|
||||
cfg.train.hooks.append({'type': 'MegatronHook'})
|
||||
return cfg
|
||||
|
||||
|
||||
dataset = MsDataset.load(args.train_dataset_name)
|
||||
train_dataset = dataset['train']
|
||||
eval_dataset = dataset['validation' if 'validation' in dataset else 'test']
|
||||
|
||||
@@ -100,7 +104,7 @@ kwargs = dict(
|
||||
eval_dataset=eval_dataset,
|
||||
seed=args.seed,
|
||||
work_dir=args.work_dir,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
trainer: EpochBasedTrainer = build_trainer(
|
||||
name=args.trainer, default_args=kwargs)
|
||||
|
||||
@@ -8,7 +8,7 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
|
||||
--trainer 'nlp-gpt3-trainer' \
|
||||
--work_dir './tmp' \
|
||||
--model 'damo/nlp_gpt3_text-generation_1.3B' \
|
||||
--dataset_name 'chinese-poetry-collection' \
|
||||
--train_dataset_name 'chinese-poetry-collection' \
|
||||
--preprocessor 'text-gen-jieba-tokenizer' \
|
||||
--src_txt 'text1' \
|
||||
--tgt_txt 'text2' \
|
||||
@@ -20,4 +20,5 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
|
||||
--world_size $WORLD_SIZE \
|
||||
--tensor_model_parallel_size $TENSOR_MODEL_PARALLEL_SIZE \
|
||||
--use_megatron true \
|
||||
# --dataset_name 'DuReader_robust-QG' \ # input&output
|
||||
--use_model_config true \
|
||||
# --train_dataset_name 'DuReader_robust-QG' \ # input&output
|
||||
|
||||
examples/pytorch/text_generation/run_train_mt5.sh (new file, 13 lines)
@@ -0,0 +1,13 @@
PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.py \
--trainer 'text-generation-trainer' \
--work_dir './tmp' \
--task 'text2text-generation' \
--model 'damo/nlp_mt5_zero-shot-augment_chinese-base' \
--train_dataset_name 'DuReader_robust-QG' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 1 \
--use_model_config True \
--per_device_train_batch_size 8 \
--lr 1e-3 \
--lr_scheduler 'noam' \
@@ -2,10 +2,11 @@ PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.
--trainer 'text-generation-trainer' \
--work_dir './tmp' \
--model 'damo/nlp_palm2.0_pretrained_chinese-base' \
--dataset_name 'DuReader_robust-QG' \
--train_dataset_name 'DuReader_robust-QG' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 15 \
--max_epochs 1 \
--use_model_config True \
--per_device_train_batch_size 8 \
--lr 1e-3 \
--lr_scheduler 'noam' \

@@ -1,20 +1,22 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import build_trainer
|
||||
from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
|
||||
set_flatten_value)
|
||||
from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
|
||||
build_dataset_from_file)
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TokenClassificationArguments(TrainingArgs):
|
||||
|
||||
trainer: str = field(
|
||||
default=Trainers.default, metadata={
|
||||
default=None, metadata={
|
||||
'help': 'The trainer used',
|
||||
})
|
||||
|
||||
work_dir: str = field(
|
||||
default='./tmp',
|
||||
metadata={
|
||||
'help': 'The working path for saving checkpoint',
|
||||
})
|
||||
|
||||
preprocessor: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
@@ -29,31 +31,41 @@ class TokenClassificationArguments(TrainingArgs):
|
||||
'cfg_node': 'preprocessor.padding'
|
||||
})
|
||||
|
||||
train_dataset_params: str = field(
|
||||
mode: str = field(
|
||||
default='inference',
|
||||
metadata={
|
||||
'help': 'The preprocessor padding',
|
||||
'cfg_node': 'preprocessor.mode'
|
||||
})
|
||||
|
||||
first_sequence: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'dataset.train',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'cfg_node': 'preprocessor.first_sequence',
|
||||
'help': 'The parameters for train dataset',
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
if config.safe_get('dataset.train.label') == 'ner_tags':
|
||||
ner_tags_labels = train_dataset['ner_tags'] + eval_dataset[
|
||||
'ner_tags']
|
||||
label_enumerate_values = self._get_label_list(ner_tags_labels)
|
||||
config.merge_from_dict(
|
||||
{'dataset.train.labels': label_enumerate_values})
|
||||
if config.train.lr_scheduler.type == 'LinearLR':
|
||||
config.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
|
||||
return config
|
||||
label: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'preprocessor.label',
|
||||
'help': 'The parameters for train dataset',
|
||||
})
|
||||
|
||||
# TODO: Future performance optimization in MsDataset
|
||||
@staticmethod
|
||||
def _get_label_list(labels):
|
||||
sequence_length: int = field(
|
||||
default=128,
|
||||
metadata={
|
||||
'cfg_node': 'preprocessor.sequence_length',
|
||||
'help': 'The parameters for train dataset',
|
||||
})
|
||||
|
||||
|
||||
training_args = TokenClassificationArguments().parse_cli()
|
||||
config, args = training_args.to_config()
|
||||
print(args)
|
||||
|
||||
|
||||
def get_label_list(labels):
|
||||
unique_labels = set()
|
||||
for label in labels:
|
||||
unique_labels = unique_labels | set(label)
|
||||
@@ -62,27 +74,56 @@ class TokenClassificationArguments(TrainingArgs):
|
||||
return label_list
|
||||
|
||||
|
||||
args = TokenClassificationArguments.from_cli(task='token-classification')
|
||||
print(args)
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
labels = train_dataset[training_args.label] + validation_dataset[
|
||||
training_args.label]
|
||||
label_enumerate_values = get_label_list(labels)
|
||||
cfg.merge_from_dict({
|
||||
'preprocessor.label2id':
|
||||
{label: id
|
||||
for id, label in enumerate(label_enumerate_values)}
|
||||
})
|
||||
cfg.merge_from_dict({'model.num_labels': len(label_enumerate_values)})
|
||||
cfg.merge_from_dict({'preprocessor.use_fast': True})
|
||||
cfg.merge_from_dict({
|
||||
'evaluation.metrics': {
|
||||
'type': 'token-cls-metric',
|
||||
'label2id':
|
||||
{label: id
|
||||
for id, label in enumerate(label_enumerate_values)}
|
||||
}
|
||||
})
|
||||
if cfg.train.lr_scheduler.type == 'LinearLR':
|
||||
cfg.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
|
||||
return cfg
|
||||
|
||||
# load dataset
|
||||
train_dataset = MsDataset.load(
|
||||
args.dataset_name,
|
||||
subset_name=args.subset_name,
|
||||
|
||||
if args.dataset_json_file is None:
|
||||
train_dataset = MsDataset.load(
|
||||
args.train_dataset_name,
|
||||
subset_name=args.train_subset_name,
|
||||
split='train',
|
||||
namespace='damo')['train']
|
||||
eval_dataset = MsDataset.load(
|
||||
args.dataset_name,
|
||||
subset_name=args.subset_name,
|
||||
namespace=args.train_dataset_namespace)['train']
|
||||
validation_dataset = MsDataset.load(
|
||||
args.train_dataset_name,
|
||||
subset_name=args.train_subset_name,
|
||||
split='validation',
|
||||
namespace='damo')['validation']
|
||||
namespace=args.train_dataset_namespace)['validation']
|
||||
else:
|
||||
train_dataset, validation_dataset = build_dataset_from_file(
|
||||
args.dataset_json_file)
|
||||
|
||||
kwargs = dict(
|
||||
model=args.model,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
eval_dataset=validation_dataset,
|
||||
work_dir=args.work_dir,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
trainer = build_trainer(name=args.trainer, default_args=kwargs)
|
||||
trainer = EpochBasedTrainer(**kwargs)
|
||||
trainer.train()
|
||||
|
||||
@@ -1,15 +1,22 @@
|
||||
PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
--task 'token-classification' \
|
||||
--trainer 'nlp-base-trainer' \
|
||||
--work_dir './tmp' \
|
||||
--model 'damo/mgeo_backbone_chinese_base' \
|
||||
--dataset_name 'GeoGLUE' \
|
||||
--subset_name 'GeoETA' \
|
||||
--train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
|
||||
--train_dataset_name 'GeoGLUE' \
|
||||
--train_subset_name 'GeoETA' \
|
||||
--train_dataset_namespace 'damo' \
|
||||
--first_sequence 'tokens' \
|
||||
--eval_strategy by_step \
|
||||
--eval_interval 10 \
|
||||
--label 'ner_tags' \
|
||||
--sequence_length 128 \
|
||||
--preprocessor 'token-cls-tokenizer' \
|
||||
--preprocessor_padding 'max_length' \
|
||||
--max_epochs 1 \
|
||||
--mode 'inference' \
|
||||
--use_model_config True \
|
||||
--per_device_train_batch_size 32 \
|
||||
--train_data_worker 0 \
|
||||
--eval_data_worker 0 \
|
||||
--lr 3e-5 \
|
||||
--save_ckpt_strategy 'by_epoch' \
|
||||
--logging_interval 100 \
|
||||
--eval_strategy 'by_epoch' \
|
||||
|
||||
@@ -1,16 +1,22 @@
|
||||
PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
--task 'token-classification' \
|
||||
--trainer 'nlp-base-trainer' \
|
||||
--work_dir './tmp' \
|
||||
--model 'damo/nlp_structbert_backbone_base_std' \
|
||||
--dataset_name 'GeoGLUE' \
|
||||
--subset_name 'GeoETA' \
|
||||
--train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
|
||||
--train_dataset_name 'GeoGLUE' \
|
||||
--train_subset_name 'GeoETA' \
|
||||
--train_dataset_namespace 'damo' \
|
||||
--first_sequence 'tokens' \
|
||||
--eval_strategy by_step \
|
||||
--eval_interval 20 \
|
||||
--label 'ner_tags' \
|
||||
--sequence_length 128 \
|
||||
--preprocessor 'token-cls-tokenizer' \
|
||||
--preprocessor_padding 'max_length' \
|
||||
--max_epochs 2 \
|
||||
--mode 'inference' \
|
||||
--use_model_config True \
|
||||
--per_device_train_batch_size 32 \
|
||||
--train_data_worker 0 \
|
||||
--eval_data_worker 0 \
|
||||
--lr 3e-5 \
|
||||
--save_ckpt_strategy 'by_epoch' \
|
||||
--logging_interval 1 \
|
||||
--eval_strategy 'by_step' \
|
||||
--eval_interval 20 \
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
{"framework":"pytorch","train":{"work_dir":"/tmp","max_epochs":10,"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0},"optimizer":{"type":"SGD","lr":0.001},"lr_scheduler":{"type":"StepLR","step_size":2},"hooks":[{"type":"CheckpointHook","interval":1}]},"evaluation":{"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0,"shuffle":false}}}
|
||||
@@ -5,11 +5,11 @@ from datasets import load_dataset
|
||||
from transformers import (BertForSequenceClassification, BertTokenizerFast,
|
||||
default_data_collator)
|
||||
|
||||
from modelscope import TrainingArgs
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.default_config import DEFAULT_CONFIG, TrainingArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TransformersArguments(TrainingArgs):
|
||||
|
||||
num_labels: int = field(
|
||||
@@ -17,13 +17,27 @@ class TransformersArguments(TrainingArgs):
|
||||
'help': 'The number of labels',
|
||||
})
|
||||
|
||||
sentence: str = field(
|
||||
default=None, metadata={
|
||||
'help': 'The sentence key',
|
||||
})
|
||||
|
||||
args = TransformersArguments.from_cli(
|
||||
task='text-classification', eval_metrics='seq-cls-metric')
|
||||
label: str = field(
|
||||
default=None, metadata={
|
||||
'help': 'The label key',
|
||||
})
|
||||
|
||||
print(args)
|
||||
|
||||
dataset = load_dataset(args.dataset_name, args.subset_name)
|
||||
training_args = TransformersArguments(
|
||||
task='text-classification', eval_metrics='seq-cls-metric').parse_cli()
|
||||
config, args = training_args.to_config()
|
||||
|
||||
print(config, args)
|
||||
|
||||
train_dataset = load_dataset(
|
||||
args.train_dataset_name, args.train_subset_name, split=args.train_split)
|
||||
val_dataset = load_dataset(
|
||||
args.val_dataset_name, args.val_subset_name, split=args.val_split)
|
||||
|
||||
model = BertForSequenceClassification.from_pretrained(
|
||||
args.model, num_labels=args.num_labels)
|
||||
@@ -31,26 +45,30 @@ tokenizer = BertTokenizerFast.from_pretrained(args.model)
|
||||
|
||||
|
||||
def tokenize_sentence(row):
|
||||
return tokenizer(row['sentence'], padding='max_length', max_length=128)
|
||||
return tokenizer(
|
||||
row[training_args.sentence], padding='max_length', max_length=128)
|
||||
|
||||
|
||||
# Extra columns, Rename columns
|
||||
dataset = dataset.map(tokenize_sentence).remove_columns(['sentence',
|
||||
'idx']).rename_column(
|
||||
'label', 'labels')
|
||||
train_dataset = train_dataset.map(tokenize_sentence)
|
||||
val_dataset = val_dataset.map(tokenize_sentence)
|
||||
if training_args.label != 'labels':
|
||||
train_dataset = train_dataset.rename_columns(
|
||||
{training_args.label: 'labels'})
|
||||
val_dataset = val_dataset.rename_columns({training_args.label: 'labels'})
|
||||
|
||||
cfg_file = os.path.join(args.work_dir or './', 'configuration.json')
|
||||
DEFAULT_CONFIG.dump(cfg_file)
|
||||
config.dump(cfg_file)
|
||||
|
||||
kwargs = dict(
|
||||
model=model,
|
||||
cfg_file=cfg_file,
|
||||
# data_collator
|
||||
data_collator=default_data_collator,
|
||||
train_dataset=dataset['train'],
|
||||
eval_dataset=dataset['validation'],
|
||||
seed=args.seed,
|
||||
cfg_modify_fn=args)
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=val_dataset,
|
||||
remove_unused_data=True,
|
||||
seed=args.seed)
|
||||
|
||||
os.environ['LOCAL_RANK'] = str(args.local_rank)
|
||||
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
|
||||
|
||||
@@ -1,5 +1,14 @@
|
||||
PYTHONPATH=. python examples/pytorch/transformers/finetune_transformers_model.py \
|
||||
--model bert-base-uncased \
|
||||
--num_labels 15 \
|
||||
--dataset_name clue \
|
||||
--subset_name tnews
|
||||
--train_dataset_name clue \
|
||||
--train_subset_name tnews \
|
||||
--train_split train \
|
||||
--val_dataset_name clue \
|
||||
--val_subset_name tnews \
|
||||
--train_split train \
|
||||
--val_split validation \
|
||||
--sentence sentence \
|
||||
--label label \
|
||||
--eval_strategy by_step \
|
||||
--eval_interval 100
|
||||
|
||||
@@ -1,4 +1,79 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from .version import __release_datetime__, __version__
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
__all__ = ['__version__', '__release_datetime__']
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .version import __release_datetime__, __version__
|
||||
from .trainers import EpochBasedTrainer, TrainingArgs, build_dataset_from_file
|
||||
from .trainers import Hook, Priority
|
||||
from .exporters import Exporter
|
||||
from .exporters import TfModelExporter
|
||||
from .exporters import TorchModelExporter
|
||||
from .hub.api import HubApi
|
||||
from .hub.snapshot_download import snapshot_download
|
||||
from .hub.push_to_hub import push_to_hub, push_to_hub_async
|
||||
from .hub.check_model import check_model_is_id, check_local_model_is_latest
|
||||
from .metrics import AudioNoiseMetric, Metric, task_default_metrics, ImageColorEnhanceMetric, ImageDenoiseMetric, \
|
||||
ImageInstanceSegmentationCOCOMetric, ImagePortraitEnhancementMetric, SequenceClassificationMetric, \
|
||||
TextGenerationMetric, TokenClassificationMetric, VideoSummarizationMetric, MovieSceneSegmentationMetric, \
|
||||
AccuracyMetric, BleuMetric, ImageInpaintingMetric, ReferringVideoObjectSegmentationMetric, \
|
||||
VideoFrameInterpolationMetric, VideoStabilizationMetric, VideoSuperResolutionMetric, PplMetric, \
|
||||
ImageQualityAssessmentDegradationMetric, ImageQualityAssessmentMosMetric, TextRankingMetric, \
|
||||
LossMetric, ImageColorizationMetric, OCRRecognitionMetric
|
||||
from .models import Model, TorchModel
|
||||
from .preprocessors import Preprocessor
|
||||
from .pipelines import Pipeline, pipeline
|
||||
from .utils.hub import read_config, create_model_if_not_exist
|
||||
from .utils.logger import get_logger
|
||||
from .msdatasets import MsDataset
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
'version': ['__release_datetime__', '__version__'],
|
||||
'trainers': [
|
||||
'EpochBasedTrainer', 'TrainingArgs', 'Hook', 'Priority',
|
||||
'build_dataset_from_file'
|
||||
],
|
||||
'exporters': [
|
||||
'Exporter',
|
||||
'TfModelExporter',
|
||||
'TorchModelExporter',
|
||||
],
|
||||
'hub.api': ['HubApi'],
|
||||
'hub.snapshot_download': ['snapshot_download'],
|
||||
'hub.push_to_hub': ['push_to_hub', 'push_to_hub_async'],
|
||||
'hub.check_model':
|
||||
['check_model_is_id', 'check_local_model_is_latest'],
|
||||
'metrics': [
|
||||
'AudioNoiseMetric', 'Metric', 'task_default_metrics',
|
||||
'ImageColorEnhanceMetric', 'ImageDenoiseMetric',
|
||||
'ImageInstanceSegmentationCOCOMetric',
|
||||
'ImagePortraitEnhancementMetric', 'SequenceClassificationMetric',
|
||||
'TextGenerationMetric', 'TokenClassificationMetric',
|
||||
'VideoSummarizationMetric', 'MovieSceneSegmentationMetric',
|
||||
'AccuracyMetric', 'BleuMetric', 'ImageInpaintingMetric',
|
||||
'ReferringVideoObjectSegmentationMetric',
|
||||
'VideoFrameInterpolationMetric', 'VideoStabilizationMetric',
|
||||
'VideoSuperResolutionMetric', 'PplMetric',
|
||||
'ImageQualityAssessmentDegradationMetric',
|
||||
'ImageQualityAssessmentMosMetric', 'TextRankingMetric',
|
||||
'LossMetric', 'ImageColorizationMetric', 'OCRRecognitionMetric'
|
||||
],
|
||||
'models': ['Model', 'TorchModel'],
|
||||
'preprocessors': ['Preprocessor'],
|
||||
'pipelines': ['Pipeline', 'pipeline'],
|
||||
'utils.hub': ['read_config', 'create_model_if_not_exist'],
|
||||
'utils.logger': ['get_logger'],
|
||||
'msdatasets': ['MsDataset']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyImportModule(
|
||||
__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
extra_objects={},
|
||||
)
|
||||
|
||||
@@ -122,10 +122,11 @@ class ${pipeline_name}(Pipeline):
|
||||
# Tips: usr_config_path is the temporary save configuration location, after upload modelscope hub, it is the model_id
|
||||
usr_config_path = '${configuration_path}'
|
||||
config = Config({
|
||||
'framework': 'pytorch',
|
||||
'task': '${task_name}',
|
||||
'model': {'type': 'my-custom-model'},
|
||||
"pipeline": {"type": "my-custom-pipeline"}
|
||||
"framework": 'pytorch',
|
||||
"task": '${task_name}',
|
||||
"model": {'type': 'my-custom-model'},
|
||||
"pipeline": {"type": "my-custom-pipeline"},
|
||||
"allow_remote": True
|
||||
})
|
||||
config.dump('${configuration_path}' + 'configuration.json')
|
||||
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .human_wholebody_keypoint import HumanWholeBodyKeypoint
|
||||
|
||||
from .ans_dfsmn_exporter import ANSDFSMNExporter
|
||||
else:
|
||||
_import_structure = {
|
||||
'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
|
||||
'ans_dfsmn_exporter': ['ANSDFSMNExporter'],
|
||||
}
|
||||
|
||||
import sys
|
||||
modelscope/exporters/audio/ans_dfsmn_exporter.py (new file, 62 lines)
@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

import torch

from modelscope.exporters.builder import EXPORTERS
from modelscope.exporters.torch_model_exporter import TorchModelExporter
from modelscope.metainfo import Models
from modelscope.utils.constant import ModelFile, Tasks

INPUT_NAME = 'input'
OUTPUT_NAME = 'output'


@EXPORTERS.register_module(
    Tasks.acoustic_noise_suppression, module_name=Models.speech_dfsmn_ans)
class ANSDFSMNExporter(TorchModelExporter):

    def export_onnx(self, output_dir: str, opset=9, **kwargs):
        """Export the model as onnx format files.

        Args:
            output_dir: The output dir.
            opset: The version of the ONNX operator set to use.
            kwargs:
                device: The device used to forward.
        Returns:
            A dict containing the model key - model file path pairs.
        """
        model = self.model if 'model' not in kwargs else kwargs.pop('model')
        device_name = 'cpu' if 'device' not in kwargs else kwargs.pop('device')
        model_bin_file = os.path.join(model.model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        if os.path.exists(model_bin_file):
            checkpoint = torch.load(model_bin_file, map_location='cpu')
            model.load_state_dict(checkpoint)
        onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE)

        with torch.no_grad():
            model.eval()
            device = torch.device(device_name)
            model.to(device)
            model_script = torch.jit.script(model)
            fbank_input = torch.zeros((1, 3, 120), dtype=torch.float32)
            torch.onnx.export(
                model_script,
                fbank_input,
                onnx_file,
                opset_version=opset,
                input_names=[INPUT_NAME],
                output_names=[OUTPUT_NAME],
                dynamic_axes={
                    INPUT_NAME: {
                        0: 'batch_size',
                        1: 'number_of_frame'
                    },
                    OUTPUT_NAME: {
                        0: 'batch_size',
                        1: 'number_of_frame'
                    }
                })
        return {'model': onnx_file}
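A hedged usage sketch for the new exporter: the model id below is an assumption for illustration, and Exporter.from_model is the generic entry point that resolves this class through the EXPORTERS registry rather than anything added by this commit.

import os

from modelscope.exporters import Exporter
from modelscope.models import Model

# Assumed model id of a DFSMN acoustic-noise-suppression checkpoint.
model = Model.from_pretrained('damo/speech_dfsmn_ans_psm_48k_causal')
os.makedirs('./onnx_export', exist_ok=True)
exporter = Exporter.from_model(model)
# Returns {'model': '<output_dir>/<ModelFile.ONNX_MODEL_FILE>'} on success.
print(exporter.export_onnx(output_dir='./onnx_export', opset=9))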
@@ -6,6 +6,7 @@ import functools
|
||||
import os
|
||||
import pickle
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
@@ -15,10 +16,10 @@ from http.cookiejar import CookieJar
|
||||
from os.path import expanduser
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import requests
|
||||
from requests import Session
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
|
||||
from modelscope import __version__
|
||||
from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT,
|
||||
API_RESPONSE_FIELD_DATA,
|
||||
API_RESPONSE_FIELD_EMAIL,
|
||||
@@ -45,7 +46,7 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
|
||||
MASTER_MODEL_BRANCH, DatasetFormations,
|
||||
DatasetMetaFormats,
|
||||
DatasetVisibilityMap, DownloadChannel,
|
||||
ModelFile)
|
||||
ModelFile, VirgoDatasetConfig)
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .utils.utils import (get_endpoint, get_release_datetime,
|
||||
model_id_to_group_owner_name)
|
||||
@@ -160,6 +161,7 @@ class HubApi:
|
||||
'Visibility': visibility, # server check
|
||||
'License': license,
|
||||
'OriginalModelId': original_model_id,
|
||||
'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', ''),
|
||||
}
|
||||
r = self.session.post(
|
||||
path, json=body, cookies=cookies, headers=self.headers)
|
||||
@@ -236,8 +238,10 @@ class HubApi:
|
||||
license: Optional[str] = Licenses.APACHE_V2,
|
||||
chinese_name: Optional[str] = None,
|
||||
commit_message: Optional[str] = 'upload model',
|
||||
tag: Optional[str] = None,
|
||||
revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
|
||||
original_model_id: Optional[str] = None):
|
||||
original_model_id: Optional[str] = None,
|
||||
ignore_file_pattern: Optional[Union[List[str], str]] = None):
|
||||
"""Upload model from a given directory to given repository. A valid model directory
|
||||
must contain a configuration.json file.
|
||||
|
||||
@@ -268,10 +272,13 @@ class HubApi:
|
||||
chinese name of the new created model.
|
||||
commit_message(`str`, *optional*, defaults to `None`):
|
||||
commit message of the push request.
|
||||
tag(`str`, *optional*, defaults to `None`):
|
||||
The tag on this commit
|
||||
revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION):
|
||||
which branch to push. If the branch is not exists, It will create a new
|
||||
branch and push to it.
|
||||
original_model_id (str, optional): The base model id which this model is trained from
|
||||
ignore_file_pattern (`Union[List[str], str]`, optional): The file pattern to ignore uploading
|
||||
|
||||
Raises:
|
||||
InvalidParameter: Parameter invalid.
|
||||
@@ -292,6 +299,10 @@ class HubApi:
|
||||
if cookies is None:
|
||||
raise NotLoginException('Must login before upload!')
|
||||
files_to_save = os.listdir(model_dir)
|
||||
if ignore_file_pattern is None:
|
||||
ignore_file_pattern = []
|
||||
if isinstance(ignore_file_pattern, str):
|
||||
ignore_file_pattern = [ignore_file_pattern]
|
||||
try:
|
||||
self.get_model(model_id=model_id)
|
||||
except Exception:
|
||||
@@ -325,6 +336,8 @@ class HubApi:
|
||||
shutil.rmtree(src, ignore_errors=True)
|
||||
for f in files_to_save:
|
||||
if f[0] != '.':
|
||||
if any([re.search(pattern, f) is not None for pattern in ignore_file_pattern]):
|
||||
continue
|
||||
src = os.path.join(model_dir, f)
|
||||
if os.path.isdir(src):
|
||||
shutil.copytree(src, os.path.join(tmp_dir, f))
|
||||
@@ -338,6 +351,8 @@ class HubApi:
|
||||
commit_message=commit_message,
|
||||
local_branch=revision,
|
||||
remote_branch=revision)
|
||||
if tag is not None:
|
||||
repo.tag_and_push(tag, tag)
|
||||
except Exception:
|
||||
raise
|
||||
finally:
|
||||
@@ -581,6 +596,17 @@ class HubApi:
|
||||
file_list = file_list['Files']
|
||||
return file_list
|
||||
|
||||
@staticmethod
|
||||
def dump_datatype_file(dataset_type: int, meta_cache_dir: str):
|
||||
"""
|
||||
Dump the data_type as a local file, in order to get the dataset formation without calling the datahub.
|
||||
More details, please refer to the class `modelscope.utils.constant.DatasetFormations`.
|
||||
"""
|
||||
dataset_type_file_path = os.path.join(meta_cache_dir,
|
||||
f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
|
||||
with open(dataset_type_file_path, 'w') as fp:
|
||||
fp.write('*** Automatically-generated file, do not modify ***')
|
||||
|
||||
def get_dataset_meta_files_local_paths(self, dataset_name: str,
|
||||
namespace: str,
|
||||
revision: str,
|
||||
@@ -591,10 +617,7 @@ class HubApi:
|
||||
cookies = ModelScopeConfig.get_cookies()
|
||||
|
||||
# Dump the data_type as a local file
|
||||
dataset_type_file_path = os.path.join(meta_cache_dir,
|
||||
f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
|
||||
with open(dataset_type_file_path, 'w') as fp:
|
||||
fp.write('*** Automatically-generated file, do not modify ***')
|
||||
HubApi.dump_datatype_file(dataset_type=dataset_type, meta_cache_dir=meta_cache_dir)
|
||||
|
||||
for file_info in file_list:
|
||||
file_path = file_info['Path']
|
||||
@@ -661,7 +684,6 @@ class HubApi:
|
||||
cookies = self._check_cookie(use_cookies=True)
|
||||
else:
|
||||
cookies = ModelScopeConfig.get_cookies()
|
||||
r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers)
|
||||
|
||||
r = self.session.get(
|
||||
url=datahub_url, cookies=cookies, headers=self.headers)
|
||||
@@ -669,6 +691,31 @@ class HubApi:
|
||||
raise_on_error(resp)
|
||||
return resp['Data']
|
||||
|
||||
def get_virgo_meta(self, dataset_id: str, version: int = 1) -> dict:
|
||||
"""
|
||||
Get virgo dataset meta info.
|
||||
"""
|
||||
virgo_endpoint = os.environ.get(VirgoDatasetConfig.env_virgo_endpoint, '')
|
||||
if not virgo_endpoint:
|
||||
raise RuntimeError(f'Virgo endpoint is not set in env: {VirgoDatasetConfig.env_virgo_endpoint}')
|
||||
|
||||
virgo_dataset_url = f'{virgo_endpoint}/data/set/download'
|
||||
cookies = requests.utils.dict_from_cookiejar(ModelScopeConfig.get_cookies())
|
||||
|
||||
dataset_info = dict(
|
||||
dataSetId=dataset_id,
|
||||
dataSetVersion=version
|
||||
)
|
||||
data = dict(
|
||||
data=dataset_info,
|
||||
)
|
||||
r = self.session.post(url=virgo_dataset_url, json=data, cookies=cookies, headers=self.headers, timeout=900)
|
||||
resp = r.json()
|
||||
if resp['code'] != 0:
|
||||
raise RuntimeError(f'Failed to get virgo dataset: {resp}')
|
||||
|
||||
return resp['data']
|
||||
|
||||
def get_dataset_access_config_for_unzipped(self,
|
||||
dataset_name: str,
|
||||
namespace: str,
|
||||
@@ -895,6 +942,7 @@ class ModelScopeConfig:
|
||||
if MODELSCOPE_CLOUD_USERNAME in os.environ:
|
||||
user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]
|
||||
|
||||
from modelscope import __version__
|
||||
ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
|
||||
__version__,
|
||||
platform.python_version(),
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from http import HTTPStatus
|
||||
|
||||
import requests
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from modelscope.utils.logger import get_logger
|
||||
@@ -57,13 +58,22 @@ def is_ok(rsp):
|
||||
return rsp['Code'] == HTTPStatus.OK and rsp['Success']
|
||||
|
||||
|
||||
def _decode_response_error(response: requests.Response):
|
||||
if 'application/json' in response.headers.get('content-type', ''):
|
||||
message = response.json()
|
||||
else:
|
||||
message = response.content.decode('utf-8')
|
||||
return message
|
||||
|
||||
|
||||
def handle_http_post_error(response, url, request_body):
|
||||
try:
|
||||
response.raise_for_status()
|
||||
except HTTPError as error:
|
||||
logger.error('Request %s with body: %s exception' %
|
||||
(url, request_body))
|
||||
logger.error('Response details: %s' % response.content)
|
||||
message = _decode_response_error(response)
|
||||
logger.error('Response details: %s' % message)
|
||||
raise error
|
||||
|
||||
|
||||
@@ -75,7 +85,8 @@ def handle_http_response(response, logger, cookies, model_id):
|
||||
logger.error(
|
||||
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
|
||||
private. Please login first.')
|
||||
logger.error('Response details: %s' % response.content)
|
||||
message = _decode_response_error(response)
|
||||
logger.error('Response details: %s' % message)
|
||||
raise error
|
||||
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@ import requests
|
||||
from requests.adapters import Retry
|
||||
from tqdm import tqdm
|
||||
|
||||
from modelscope import __version__
|
||||
from modelscope.hub.api import HubApi, ModelScopeConfig
|
||||
from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE,
|
||||
API_FILE_DOWNLOAD_RETRY_TIMES,
|
||||
|
||||
@@ -55,16 +55,10 @@ class GitCommandWrapper(metaclass=Singleton):
|
||||
response.check_returncode()
|
||||
return response
|
||||
except subprocess.CalledProcessError as error:
|
||||
if response.returncode == 1:
|
||||
logger.info('Nothing to commit.')
|
||||
return response
|
||||
else:
|
||||
logger.error(
|
||||
'There are error run git command, you may need to login first.'
|
||||
)
|
||||
raise GitError('stdout: %s, stderr: %s' %
|
||||
(response.stdout.decode('utf8'),
|
||||
error.stderr.decode('utf8')))
|
||||
logger.error('There are error run git command.')
|
||||
raise GitError(
|
||||
'stdout: %s, stderr: %s' %
|
||||
(response.stdout.decode('utf8'), error.stderr.decode('utf8')))
|
||||
|
||||
def config_auth_token(self, repo_dir, auth_token):
|
||||
url = self.get_repo_remote_url(repo_dir)
|
||||
@@ -199,8 +193,11 @@ class GitCommandWrapper(metaclass=Singleton):
|
||||
else:
|
||||
return ['/'.join(line.split('/')[1:]) for line in info[1:]]
|
||||
|
||||
def pull(self, repo_dir: str):
|
||||
cmds = ['-C', repo_dir, 'pull']
|
||||
def pull(self,
|
||||
repo_dir: str,
|
||||
remote: str = 'origin',
|
||||
branch: str = 'master'):
|
||||
cmds = ['-C', repo_dir, 'pull', remote, branch]
|
||||
return self._run_git_command(*cmds)
|
||||
|
||||
def push(self,
|
||||
|
||||
@@ -4,8 +4,8 @@ import concurrent.futures
|
||||
import os
|
||||
|
||||
from modelscope.hub.api import HubApi
|
||||
from modelscope.hub.constants import Licenses, ModelVisibility
|
||||
from modelscope.hub.errors import NotExistError
|
||||
from modelscope.hub.constants import ModelVisibility
|
||||
from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
@@ -18,7 +18,10 @@ def _api_push_to_hub(repo_name,
|
||||
token,
|
||||
private=True,
|
||||
commit_message='',
|
||||
source_repo=''):
|
||||
tag=None,
|
||||
source_repo='',
|
||||
ignore_file_pattern=None,
|
||||
revision=DEFAULT_REPOSITORY_REVISION):
|
||||
try:
|
||||
api = HubApi()
|
||||
api.login(token)
|
||||
@@ -29,7 +32,10 @@ def _api_push_to_hub(repo_name,
|
||||
if not private else ModelVisibility.PRIVATE,
|
||||
chinese_name=repo_name,
|
||||
commit_message=commit_message,
|
||||
original_model_id=source_repo)
|
||||
tag=tag,
|
||||
original_model_id=source_repo,
|
||||
ignore_file_pattern=ignore_file_pattern,
|
||||
revision=revision)
|
||||
commit_message = commit_message or 'No commit message'
|
||||
logger.info(
|
||||
f'Successfully upload the model to {repo_name} with message: {commit_message}'
|
||||
@@ -48,7 +54,10 @@ def push_to_hub(repo_name,
|
||||
private=True,
|
||||
retry=3,
|
||||
commit_message='',
|
||||
source_repo=''):
|
||||
tag=None,
|
||||
source_repo='',
|
||||
ignore_file_pattern=None,
|
||||
revision=DEFAULT_REPOSITORY_REVISION):
|
||||
"""
|
||||
Args:
|
||||
repo_name: The repo name for the modelhub repo
|
||||
@@ -57,13 +66,18 @@ def push_to_hub(repo_name,
|
||||
private: If is a private repo, default True
|
||||
retry: Retry times if something error in uploading, default 3
|
||||
commit_message: The commit message
|
||||
tag: The tag of this commit
|
||||
source_repo: The source repo (model id) which this model comes from
|
||||
|
||||
ignore_file_pattern: The file pattern to be ignored in uploading.
|
||||
revision: The branch to commit to
|
||||
Returns:
|
||||
The boolean value to represent whether the model is uploaded.
|
||||
"""
|
||||
if token is None:
|
||||
token = os.environ.get('MODELSCOPE_API_TOKEN')
|
||||
if ignore_file_pattern is None:
|
||||
ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
|
||||
assert repo_name is not None
|
||||
assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
|
||||
assert os.path.isdir(output_dir)
|
||||
assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
|
||||
@@ -73,7 +87,8 @@ def push_to_hub(repo_name,
|
||||
f'Uploading {output_dir} to {repo_name} with message {commit_message}')
|
||||
for i in range(retry):
|
||||
if _api_push_to_hub(repo_name, output_dir, token, private,
|
||||
commit_message, source_repo):
|
||||
commit_message, tag, source_repo,
|
||||
ignore_file_pattern, revision):
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -83,7 +98,10 @@ def push_to_hub_async(repo_name,
|
||||
token=None,
|
||||
private=True,
|
||||
commit_message='',
|
||||
source_repo=''):
|
||||
tag=None,
|
||||
source_repo='',
|
||||
ignore_file_pattern=None,
|
||||
revision=DEFAULT_REPOSITORY_REVISION):
|
||||
"""
|
||||
Args:
|
||||
repo_name: The repo name for the modelhub repo
|
||||
@@ -91,13 +109,18 @@ def push_to_hub_async(repo_name,
|
||||
token: The user api token, function will check the `MODELSCOPE_API_TOKEN` variable if this argument is None
|
||||
private: If is a private repo, default True
|
||||
commit_message: The commit message
|
||||
tag: The tag of this commit
|
||||
source_repo: The source repo (model id) which this model comes from
|
||||
|
||||
ignore_file_pattern: The file pattern to be ignored in uploading
|
||||
revision: The branch to commit to
|
||||
Returns:
|
||||
A handler to check the result and the status
|
||||
"""
|
||||
if token is None:
|
||||
token = os.environ.get('MODELSCOPE_API_TOKEN')
|
||||
if ignore_file_pattern is None:
|
||||
ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
|
||||
assert repo_name is not None
|
||||
assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
|
||||
assert os.path.isdir(output_dir)
|
||||
assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
|
||||
@@ -106,4 +129,5 @@ def push_to_hub_async(repo_name,
|
||||
logger.info(
|
||||
f'Uploading {output_dir} to {repo_name} with message {commit_message}')
|
||||
return _executor.submit(_api_push_to_hub, repo_name, output_dir, token,
|
||||
private, commit_message, source_repo)
|
||||
private, commit_message, tag, source_repo,
|
||||
ignore_file_pattern, revision)
|
||||
|
||||
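For reference, a sketch of calling push_to_hub with the newly added parameters; all values below are illustrative and the defaults come from the signature in this hunk.

from modelscope.hub.push_to_hub import push_to_hub

push_to_hub(
    repo_name='my-namespace/my-model',  # illustrative repo id
    output_dir='./work_dir/output',  # must contain configuration.json or configuration.yaml
    token=None,  # falls back to the MODELSCOPE_API_TOKEN environment variable
    tag='v1.0',  # new: tag pushed after the commit
    ignore_file_pattern=[r'.*\.safetensors'],  # new: regex patterns skipped during upload
    revision='master')  # new: target branch (defaults to DEFAULT_REPOSITORY_REVISION)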
@@ -88,6 +88,26 @@ class Repository:
|
||||
remote = None
|
||||
return remote
|
||||
|
||||
def pull(self, remote: str = 'origin', branch: str = 'master'):
|
||||
"""Pull remote branch
|
||||
|
||||
Args:
|
||||
remote (str, optional): The remote name. Defaults to 'origin'.
|
||||
branch (str, optional): The remote branch. Defaults to 'master'.
|
||||
"""
|
||||
self.git_wrapper.pull(self.model_dir, remote=remote, branch=branch)
|
||||
|
||||
def add_lfs_type(self, file_name_suffix: str):
|
||||
"""Add file suffix to lfs list.
|
||||
|
||||
Args:
|
||||
file_name_suffix (str): The file name suffix.
|
||||
examples '*.safetensors'
|
||||
"""
|
||||
os.system(
|
||||
"printf '%s filter=lfs diff=lfs merge=lfs -text\n'>>%s" %
|
||||
(file_name_suffix, os.path.join(self.model_dir, '.gitattributes')))
|
||||
|
||||
def push(self,
|
||||
commit_message: str,
|
||||
local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION,
|
||||
@@ -120,7 +140,6 @@ class Repository:
|
||||
self.model_repo_name)
|
||||
|
||||
url = self.git_wrapper.get_repo_remote_url(self.model_dir)
|
||||
self.git_wrapper.pull(self.model_dir)
|
||||
|
||||
self.git_wrapper.add(self.model_dir, all_files=True)
|
||||
self.git_wrapper.commit(self.model_dir, commit_message)
|
||||
|
||||
@@ -116,15 +116,9 @@ class Models(object):
|
||||
bad_image_detecting = 'bad-image-detecting'
|
||||
controllable_image_generation = 'controllable-image-generation'
|
||||
longshortnet = 'longshortnet'
|
||||
fastinst = 'fastinst'
|
||||
pedestrian_attribute_recognition = 'pedestrian-attribute-recognition'
|
||||
|
||||
# EasyCV models
|
||||
yolox = 'YOLOX'
|
||||
segformer = 'Segformer'
|
||||
hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
|
||||
image_object_detection_auto = 'image-object-detection-auto'
|
||||
dino = 'DINO'
|
||||
|
||||
# nlp models
|
||||
bert = 'bert'
|
||||
palm = 'palm-v2'
|
||||
@@ -177,6 +171,7 @@ class Models(object):
|
||||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
|
||||
speech_dfsmn_ans = 'speech_dfsmn_ans'
|
||||
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
|
||||
speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
|
||||
speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield'
|
||||
speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k'
|
||||
kws_kwsbp = 'kws-kwsbp'
|
||||
@@ -187,6 +182,9 @@ class Models(object):
|
||||
generic_sv = 'generic-sv'
|
||||
ecapa_tdnn_sv = 'ecapa-tdnn-sv'
|
||||
campplus_sv = 'cam++-sv'
|
||||
eres2net_sv = 'eres2net-sv'
|
||||
scl_sd = 'scl-sd'
|
||||
rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv'
|
||||
generic_lm = 'generic-lm'
|
||||
|
||||
# multi-modal models
|
||||
@@ -205,6 +203,8 @@ class Models(object):
|
||||
hitea = 'hitea'
|
||||
soonet = 'soonet'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
mplug_owl = 'mplug-owl'
|
||||
clip_interrogator = 'clip-interrogator'
|
||||
|
||||
# science models
|
||||
unifold = 'unifold'
|
||||
@@ -255,6 +255,7 @@ class Pipelines(object):
|
||||
should use task name for this pipeline.
|
||||
For pipeline which suuport only one model, we should use ${Model}-${Task} as its name.
|
||||
"""
|
||||
pipeline_template = 'pipeline-template'
|
||||
# vision tasks
|
||||
portrait_matting = 'unet-image-matting'
|
||||
universal_matting = 'unet-universal-matting'
|
||||
@@ -277,8 +278,6 @@ class Pipelines(object):
|
||||
tbs_detection = 'tbs-detection'
|
||||
object_detection = 'vit-object-detection'
|
||||
abnormal_object_detection = 'abnormal-object-detection'
|
||||
easycv_detection = 'easycv-detection'
|
||||
easycv_segmentation = 'easycv-segmentation'
|
||||
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
|
||||
salient_detection = 'u2net-salient-detection'
|
||||
salient_boudary_detection = 'res2net-salient-detection'
|
||||
@@ -347,7 +346,6 @@ class Pipelines(object):
|
||||
video_single_object_tracking_procontext = 'procontext-vitb-video-single-object-tracking'
|
||||
video_multi_object_tracking = 'video-multi-object-tracking'
|
||||
image_panoptic_segmentation = 'image-panoptic-segmentation'
|
||||
image_panoptic_segmentation_easycv = 'image-panoptic-segmentation-easycv'
|
||||
video_summarization = 'googlenet_pgl_video_summarization'
|
||||
language_guided_video_summarization = 'clip-it-video-summarization'
|
||||
image_semantic_segmentation = 'image-semantic-segmentation'
|
||||
@@ -402,7 +400,7 @@ class Pipelines(object):
|
||||
nerf_recon_acc = 'nerf-recon-acc'
|
||||
bad_image_detecting = 'bad-image-detecting'
|
||||
controllable_image_generation = 'controllable-image-generation'
|
||||
|
||||
fast_instance_segmentation = 'fast-instance-segmentation'
|
||||
image_quality_assessment_mos = 'image-quality-assessment-mos'
|
||||
image_quality_assessment_man = 'image-quality-assessment-man'
|
||||
image_quality_assessment_degradation = 'image-quality-assessment-degradation'
|
||||
@@ -485,6 +483,9 @@ class Pipelines(object):
|
||||
speaker_diarization_inference = 'speaker-diarization-inference'
|
||||
vad_inference = 'vad-inference'
|
||||
speaker_verification = 'speaker-verification'
|
||||
speaker_verification_rdino = 'speaker-verification-rdino'
|
||||
speaker_verification_eres2net = 'speaker-verification-eres2net'
|
||||
speaker_change_locating = 'speaker-change-locating'
|
||||
lm_inference = 'language-score-prediction'
|
||||
speech_timestamp_inference = 'speech-timestamp-inference'
|
||||
|
||||
@@ -514,6 +515,7 @@ class Pipelines(object):
|
||||
gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
|
||||
soonet_video_temporal_grounding = 'soonet-video-temporal-grounding'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
multimodal_dialogue = 'multimodal-dialogue'
|
||||
|
||||
# science tasks
|
||||
protein_structure = 'unifold-protein-structure'
|
||||
@@ -881,6 +883,7 @@ class NLPTrainers(object):
|
||||
document_grounded_dialog_rerank_trainer = 'document-grounded-dialog-rerank-trainer'
|
||||
document_grounded_dialog_retrieval_trainer = 'document-grounded-dialog-retrieval-trainer'
|
||||
siamese_uie_trainer = 'siamese-uie-trainer'
|
||||
translation_evaluation_trainer = 'translation-evaluation-trainer'
|
||||
|
||||
|
||||
class MultiModalTrainers(object):
|
||||
@@ -911,7 +914,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
|
||||
"""
|
||||
|
||||
default = 'trainer'
|
||||
easycv = 'easycv'
|
||||
tinynas_damoyolo = 'tinynas-damoyolo'
|
||||
|
||||
@staticmethod
|
||||
@@ -933,8 +935,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
|
||||
return Fields.multi_modal
|
||||
elif attribute_or_value == Trainers.default:
|
||||
return Trainers.default
|
||||
elif attribute_or_value == Trainers.easycv:
|
||||
return Trainers.easycv
|
||||
else:
|
||||
return 'unknown'
|
||||
|
||||
@@ -1034,6 +1034,8 @@ class Preprocessors(object):
|
||||
vldoc_preprocessor = 'vldoc-preprocessor'
|
||||
hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
|
||||
diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor'
|
||||
mplug_owl_preprocessor = 'mplug-owl-preprocessor'
|
||||
image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor'
|
||||
|
||||
# science preprocessor
|
||||
unifold_preprocessor = 'unifold-preprocessor'
|
||||
@@ -1098,6 +1100,8 @@ class Metrics(object):
|
||||
# metric for image-colorization task
|
||||
image_colorization_metric = 'image-colorization-metric'
|
||||
ocr_recognition_metric = 'ocr-recognition-metric'
|
||||
# metric for translation evaluation
|
||||
translation_evaluation_metric = 'translation-evaluation-metric'
|
||||
|
||||
|
||||
class Optimizers(object):
|
||||
@@ -1165,14 +1169,6 @@ class LR_Schedulers(object):
|
||||
class CustomDatasets(object):
|
||||
""" Names for different datasets.
|
||||
"""
|
||||
ClsDataset = 'ClsDataset'
|
||||
Face2dKeypointsDataset = 'FaceKeypointDataset'
|
||||
HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
|
||||
HumanWholeBodyKeypointDataset = 'WholeBodyCocoTopDownDataset'
|
||||
SegDataset = 'SegDataset'
|
||||
DetDataset = 'DetDataset'
|
||||
DetImagesMixDataset = 'DetImagesMixDataset'
|
||||
PanopticDataset = 'PanopticDataset'
|
||||
PairedDataset = 'PairedDataset'
|
||||
SiddDataset = 'SiddDataset'
|
||||
GoproDataset = 'GoproDataset'
|
||||
|
||||
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
|
||||
from .loss_metric import LossMetric
|
||||
from .image_colorization_metric import ImageColorizationMetric
|
||||
from .ocr_recognition_metric import OCRRecognitionMetric
|
||||
from .translation_evaluation_metric import TranslationEvaluationMetric
|
||||
else:
|
||||
_import_structure = {
|
||||
'audio_noise_metric': ['AudioNoiseMetric'],
|
||||
@@ -62,7 +63,8 @@ else:
|
||||
'text_ranking_metric': ['TextRankingMetric'],
|
||||
'loss_metric': ['LossMetric'],
|
||||
'image_colorization_metric': ['ImageColorizationMetric'],
-        'ocr_recognition_metric': ['OCRRecognitionMetric']
+        'ocr_recognition_metric': ['OCRRecognitionMetric'],
+        'translation_evaluation_metric': ['TranslationEvaluationMetric']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -42,6 +42,7 @@ class MetricKeys(object):
|
||||
NDCG = 'ndcg'
|
||||
AR = 'AR'
|
||||
Colorfulness = 'colorfulness'
|
||||
Kendall_Tau_Correlation = 'kendall_tau_correlation'
|
||||
|
||||
|
||||
task_default_metrics = {
|
||||
@@ -76,6 +77,7 @@ task_default_metrics = {
|
||||
Tasks.bad_image_detecting: [Metrics.accuracy],
|
||||
Tasks.ocr_recognition: [Metrics.ocr_recognition_metric],
|
||||
Tasks.efficient_diffusion_tuning: [Metrics.loss_metric],
|
||||
Tasks.translation_evaluation: [Metrics.translation_evaluation_metric]
|
||||
}
|
||||
|
||||
|
||||
|
||||
174
modelscope/metrics/translation_evaluation_metric.py
Normal file
@@ -0,0 +1,174 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import importlib
|
||||
from typing import Dict, List, Union
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
from modelscope.metainfo import Metrics
|
||||
from modelscope.metrics.base import Metric
|
||||
from modelscope.metrics.builder import METRICS, MetricKeys
|
||||
from modelscope.models.nlp.unite.configuration import InputFormat
|
||||
from modelscope.utils.logger import get_logger
|
||||
from modelscope.utils.registry import default_group
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
@METRICS.register_module(
|
||||
group_key=default_group, module_name=Metrics.translation_evaluation_metric)
|
||||
class TranslationEvaluationMetric(Metric):
|
||||
r"""The metric class for translation evaluation.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, gap_threshold: float = 25.0):
|
||||
r"""Build a translation evaluation metric, following the designed
|
||||
Kendall's tau correlation from WMT Metrics Shared Task competitions.
|
||||
|
||||
Args:
|
||||
gap_threshold: The score gap denoting the available hypothesis pair.
|
||||
|
||||
Returns:
|
||||
A metric for translation evaluation.
|
||||
"""
|
||||
self.gap_threshold = gap_threshold
|
||||
|
||||
self.lp = list()
|
||||
self.segment_id = list()
|
||||
self.raw_score = list()
|
||||
self.score = list()
|
||||
self.input_format = list()
|
||||
|
||||
def clear(self) -> None:
|
||||
r"""Clear all the stored variables.
|
||||
"""
|
||||
self.lp.clear()
|
||||
self.segment_id.clear()
|
||||
self.raw_score.clear()
|
||||
self.input_format.clear()
|
||||
|
||||
self.score.clear()
|
||||
|
||||
return
|
||||
|
||||
def add(self, outputs: Dict[str, List[float]],
|
||||
inputs: Dict[str, List[Union[float, int]]]) -> None:
|
||||
r"""Collect the related results for processing.
|
||||
|
||||
Args:
|
||||
            outputs: Dict containing 'score', the predicted scores.
            inputs: Dict containing 'lp', 'segment_id', 'raw_score' and 'input_format'.
|
||||
|
||||
"""
|
||||
|
||||
self.lp += inputs['lp']
|
||||
self.segment_id += inputs['segment_id']
|
||||
self.raw_score += inputs['raw_score']
|
||||
self.input_format += inputs['input_format']
|
||||
|
||||
self.score += outputs['score']
|
||||
|
||||
return
|
||||
|
||||
def evaluate(self) -> Dict[str, Dict[str, float]]:
|
||||
r"""Compute the Kendall's tau correlation.
|
||||
|
||||
Returns:
|
||||
A dict denoting Kendall's tau correlation.
|
||||
|
||||
"""
|
||||
|
||||
data = {
|
||||
'lp': self.lp,
|
||||
'segment_id': self.segment_id,
|
||||
'raw_score': self.raw_score,
|
||||
'input_format': self.input_format,
|
||||
'score': self.score
|
||||
}
|
||||
data = DataFrame(data=data)
|
||||
correlation = dict()
|
||||
|
||||
for input_format in data.input_format.unique():
|
||||
logger.info('Evaluation results for %s input format'
|
||||
% input_format.value)
|
||||
input_format_data = data[data.input_format == input_format]
|
||||
|
||||
temp_correlation = dict()
|
||||
|
||||
for lp in sorted(input_format_data.lp.unique()):
|
||||
sub_data = input_format_data[input_format_data.lp == lp]
|
||||
temp_correlation[input_format.value + '_'
|
||||
+ lp] = self.compute_kendall_tau(sub_data)
|
||||
logger.info(
|
||||
'\t%s: %f' %
|
||||
(lp,
|
||||
temp_correlation[input_format.value + '_' + lp] * 100))
|
||||
|
||||
avg_correlation = sum(
|
||||
temp_correlation.values()) / len(temp_correlation)
|
||||
correlation[input_format.value + '_avg'] = avg_correlation
|
||||
logger.info('Average evaluation result for %s input format: %f' %
|
||||
(input_format.value, avg_correlation))
|
||||
logger.info('')
|
||||
correlation.update(temp_correlation)
|
||||
|
||||
return correlation
|
||||
|
||||
def merge(self, other: 'TranslationEvaluationMetric') -> None:
|
||||
r"""Merge the predictions from other TranslationEvaluationMetric objects.
|
||||
|
||||
Args:
|
||||
other: Another TranslationEvaluationMetric object.
|
||||
|
||||
"""
|
||||
|
||||
self.lp += other.lp
|
||||
        self.segment_id += other.segment_id
|
||||
self.raw_score += other.raw_score
|
||||
self.input_format += other.input_format
|
||||
|
||||
self.score += other.score
|
||||
|
||||
return
|
||||
|
||||
def compute_kendall_tau(self, csv_data: DataFrame) -> float:
|
||||
r"""Compute kendall's tau correlation.
|
||||
|
||||
Args:
|
||||
csv_data: The pandas dataframe.
|
||||
|
||||
Returns:
|
||||
            float: The Kendall's tau correlation.
|
||||
|
||||
"""
|
||||
concor = discor = 0
|
||||
|
||||
for segment_id in sorted(csv_data.segment_id.unique()):
|
||||
group_csv_data = csv_data[csv_data.segment_id == segment_id]
|
||||
|
||||
examples = group_csv_data.to_dict('records')
|
||||
|
||||
            for i in range(0, len(examples)):
                for j in range(i + 1, len(examples)):
                    # compare hypotheses within the current segment group
                    if examples[i]['raw_score'] - examples[j][
                            'raw_score'] >= self.gap_threshold:
                        if examples[i]['score'] > examples[j]['score']:
                            concor += 1
                        elif examples[i]['score'] < examples[j]['score']:
                            discor += 1
                    elif examples[i]['raw_score'] - examples[j][
                            'raw_score'] <= -self.gap_threshold:
                        if examples[i]['score'] < examples[j]['score']:
                            concor += 1
                        elif examples[i]['score'] > examples[j]['score']:
                            discor += 1
|
||||
|
||||
if concor + discor == 0:
|
||||
logger.warning(
|
||||
                'No available hypothesis pairs were found during evaluation. '
|
||||
'Marking the kendall tau correlation as the lowest value (-1.0).'
|
||||
)
|
||||
return -1.0
|
||||
else:
|
||||
return (concor - discor) / (concor + discor)
|
||||
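# Illustrative sketch, not from the original commit: a standalone toy run of the
# pairwise counting that compute_kendall_tau performs for one segment group.
# Numbers are made up; gap_threshold follows the default of 25.0.
gap_threshold = 25.0
group = [
    {'raw_score': 90.0, 'score': 0.82},
    {'raw_score': 60.0, 'score': 0.55},
    {'raw_score': 30.0, 'score': 0.61},
]
concor = discor = 0
for i in range(len(group)):
    for j in range(i + 1, len(group)):
        gap = group[i]['raw_score'] - group[j]['raw_score']
        if gap >= gap_threshold:
            if group[i]['score'] > group[j]['score']:
                concor += 1
            elif group[i]['score'] < group[j]['score']:
                discor += 1
        elif gap <= -gap_threshold:
            if group[i]['score'] < group[j]['score']:
                concor += 1
            elif group[i]['score'] > group[j]['score']:
                discor += 1
print((concor - discor) / (concor + discor))  # 0.333... for this toy data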
@@ -39,7 +39,7 @@ class ConvSTFT(nn.Module):
|
||||
super(ConvSTFT, self).__init__()
|
||||
|
||||
if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
        else:
            self.fft_len = fft_len
|
||||
|
||||
@@ -78,7 +78,7 @@ class ConviSTFT(nn.Module):
|
||||
fix=True):
|
||||
super(ConviSTFT, self).__init__()
|
||||
if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
        else:
            self.fft_len = fft_len
|
||||
kernel, window = init_kernels(
|
||||
|
||||
@@ -45,27 +45,5 @@ class GenericAutomaticSpeechRecognition(Model):
|
||||
def forward(self) -> Dict[str, Any]:
|
||||
"""preload model and return the info of the model
|
||||
"""
|
||||
if self.model_cfg['model_config']['type'] == Frameworks.tf:
|
||||
from easyasr import asr_inference_paraformer_tf
|
||||
if hasattr(asr_inference_paraformer_tf, 'preload'):
|
||||
model_workspace = self.model_cfg['model_workspace']
|
||||
model_path = os.path.join(model_workspace,
|
||||
self.model_cfg['am_model'])
|
||||
vocab_path = os.path.join(
|
||||
model_workspace,
|
||||
self.model_cfg['model_config']['vocab_file'])
|
||||
sampled_ids = 'seq2seq/sampled_ids'
|
||||
sampled_lengths = 'seq2seq/sampled_lengths'
|
||||
if 'sampled_ids' in self.model_cfg['model_config']:
|
||||
sampled_ids = self.model_cfg['model_config']['sampled_ids']
|
||||
if 'sampled_lengths' in self.model_cfg['model_config']:
|
||||
sampled_lengths = self.model_cfg['model_config'][
|
||||
'sampled_lengths']
|
||||
asr_inference_paraformer_tf.preload(
|
||||
ngpu=1,
|
||||
asr_model_file=model_path,
|
||||
vocab_file=vocab_path,
|
||||
sampled_ids=sampled_ids,
|
||||
sampled_lengths=sampled_lengths)
|
||||
|
||||
return self.model_cfg
|
||||
|
||||
233
modelscope/models/audio/kws/farfield/fsmn_sele_v3.py
Normal file
@@ -0,0 +1,233 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear
|
||||
from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32
|
||||
|
||||
|
||||
class DFSMNUnit(nn.Module):
|
||||
""" one multi-channel deep fsmn unit
|
||||
Args:
|
||||
dimin: input dimension
|
||||
dimexpand: feature expansion dimension
|
||||
dimout: output dimension
|
||||
        lorder: left order
|
||||
rorder: right order
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
dimin=64,
|
||||
dimexpand=128,
|
||||
dimout=64,
|
||||
lorder=10,
|
||||
rorder=1):
|
||||
super(DFSMNUnit, self).__init__()
|
||||
|
||||
self.expand = AffineTransform(dimin, dimexpand)
|
||||
self.shrink = LinearTransform(dimexpand, dimout)
|
||||
self.fsmn = Fsmn(dimout, dimout, lorder, rorder, 1, 1)
|
||||
|
||||
self.debug = False
|
||||
self.dataout = None
|
||||
|
||||
def forward(self, input):
|
||||
"""
|
||||
Args:
|
||||
input: [batch, time, feature]
|
||||
"""
|
||||
out1 = F.relu(self.expand(input))
|
||||
out2 = self.shrink(out1)
|
||||
out3 = self.fsmn(out2)
|
||||
|
||||
# add skip connection for matched data
|
||||
if input.shape[-1] == out3.shape[-1]:
|
||||
out3 = input + out3
|
||||
if self.debug:
|
||||
self.dataout = out3
|
||||
return out3
|
||||
|
||||
def print_model(self):
|
||||
self.expand.printModel()
|
||||
self.shrink.printModel()
|
||||
self.fsmn.printModel()
|
||||
|
||||
def to_kaldi_nnet(self):
|
||||
re_str = self.expand.toKaldiNNet()
|
||||
relu = RectifiedLinear(self.expand.linear.out_features,
|
||||
self.expand.linear.out_features)
|
||||
re_str += relu.toKaldiNNet()
|
||||
        re_str += self.shrink.toKaldiNNet()
|
||||
re_str += self.fsmn.toKaldiNNet()
|
||||
return re_str
|
||||
|
||||
|
||||
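def _dfsmn_unit_shape_check():
    # Illustrative sketch, not from the original commit. It assumes the Fsmn layer
    # imported above preserves the time dimension; then a unit whose input and
    # output dims match keeps its input shape and the skip connection in
    # DFSMNUnit.forward applies.
    unit = DFSMNUnit(dimin=64, dimexpand=128, dimout=64, lorder=10, rorder=1)
    out = unit(torch.randn(4, 100, 64))  # (batch, time, feature)
    print(out.shape)  # expected: torch.Size([4, 100, 64])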
class FSMNSeleNetV3(nn.Module):
|
||||
""" Deep FSMN model with channel selection performs multi-channel kws.
|
||||
Zhang, Shiliang, et al. "Deep-FSMN for large vocabulary continuous speech
|
||||
recognition." 2018 IEEE International Conference on Acoustics, Speech and
|
||||
Signal Processing (ICASSP). IEEE, 2018.
|
||||
|
||||
Args:
|
||||
input_dim: input dimension
|
||||
linear_dim: fsmn input dimension
|
||||
proj_dim: fsmn projection dimension
|
||||
lorder: fsmn left order
|
||||
rorder: fsmn right order
|
||||
num_syn: output dimension
|
||||
fsmn_layers: no. of fsmn units
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
input_dim=120,
|
||||
linear_dim=128,
|
||||
proj_dim=64,
|
||||
lorder=10,
|
||||
rorder=1,
|
||||
num_syn=5,
|
||||
fsmn_layers=5):
|
||||
super(FSMNSeleNetV3, self).__init__()
|
||||
|
||||
self.mem = []
|
||||
# the first unit, mapping input dim to proj dim
|
||||
unit = DFSMNUnit(input_dim, linear_dim, proj_dim, lorder, rorder)
|
||||
self.mem.append(unit)
|
||||
self.add_module('mem_{:d}'.format(0), unit)
|
||||
|
||||
# deep fsmn layers with skip connection
|
||||
for i in range(1, fsmn_layers):
|
||||
unit = DFSMNUnit(proj_dim, linear_dim, proj_dim, lorder, rorder)
|
||||
self.mem.append(unit)
|
||||
self.add_module('mem_{:d}'.format(i), unit)
|
||||
|
||||
self.expand2 = AffineTransform(proj_dim, linear_dim)
|
||||
self.decision = AffineTransform(linear_dim, num_syn)
|
||||
|
||||
def forward(self, input):
|
||||
# multi-channel temp space, [batch, time, channel, feature]
|
||||
if torch.cuda.is_available():
|
||||
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
|
||||
self.expand2.linear.out_features).cuda()
|
||||
else:
|
||||
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
|
||||
self.expand2.linear.out_features)
|
||||
|
||||
for n in range(input.shape[2]):
|
||||
chin = input[:, :, n, :]
|
||||
|
||||
for unit in self.mem:
|
||||
chout = unit(chin)
|
||||
chin = chout
|
||||
|
||||
x[:, :, n, :] = F.relu(self.expand2(chout))
|
||||
|
||||
# perform max pooling
|
||||
pool = nn.MaxPool2d((x.shape[2], 1), stride=(x.shape[2], 1))
|
||||
y = pool(x)
|
||||
|
||||
# remove channel dimension
|
||||
y = torch.squeeze(y, -2)
|
||||
z = self.decision(y)
|
||||
|
||||
return z
|
||||
|
||||
def print_model(self):
|
||||
for unit in self.mem:
|
||||
unit.print_model()
|
||||
|
||||
self.expand2.printModel()
|
||||
self.decision.printModel()
|
||||
|
||||
def print_header(self):
|
||||
""" get DFSMN params
|
||||
"""
|
||||
input_dim = self.mem[0].expand.linear.in_features
|
||||
linear_dim = self.mem[0].expand.linear.out_features
|
||||
proj_dim = self.mem[0].shrink.linear.out_features
|
||||
lorder = self.mem[0].fsmn.conv_left.kernel_size[0]
|
||||
rorder = 0
|
||||
if self.mem[0].fsmn.conv_right is not None:
|
||||
rorder = self.mem[0].fsmn.conv_right.kernel_size[0]
|
||||
|
||||
num_syn = self.decision.linear.out_features
|
||||
fsmn_layers = len(self.mem)
|
||||
|
||||
# no. of output channels, 0.0 means the same as numins
|
||||
numouts = 1.0
|
||||
|
||||
#
|
||||
# write total header
|
||||
#
|
||||
header = [0.0] * HEADER_BLOCK_SIZE * 5
|
||||
# numins
|
||||
header[0] = 0.0
|
||||
# numouts
|
||||
header[1] = numouts
|
||||
# dimins
|
||||
header[2] = input_dim
|
||||
# dimouts
|
||||
header[3] = num_syn
|
||||
# numlayers
|
||||
header[4] = 4
|
||||
|
||||
#
|
||||
# write each layer's header
|
||||
#
|
||||
hidx = 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_DFSMN.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 4] = proj_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 5] = lorder
|
||||
header[HEADER_BLOCK_SIZE * hidx + 6] = rorder
|
||||
header[HEADER_BLOCK_SIZE * hidx + 7] = fsmn_layers
|
||||
hidx += 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_DENSE.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = proj_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
|
||||
ActivationType.ACTIVATION_RELU.value)
|
||||
hidx += 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_MAX_POOLING.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
|
||||
hidx += 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_DENSE.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = numouts
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn
|
||||
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
|
||||
ActivationType.ACTIVATION_SOFTMAX.value)
|
||||
|
||||
for h in header:
|
||||
print(f32ToI32(h))
|
||||
|
||||
def to_kaldi_nnet(self):
|
||||
re_str = '<Nnet>\n'
|
||||
for unit in self.mem:
|
||||
re_str += unit.to_kaldi_nnet()
|
||||
        re_str += self.expand2.toKaldiNNet()
|
||||
relu = RectifiedLinear(self.expand2.linear.out_features,
|
||||
self.expand2.linear.out_features)
|
||||
re_str += relu.toKaldiNNet()
|
||||
re_str += self.decision.toKaldiNNet()
|
||||
re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features,
|
||||
self.decision.linear.out_features)
|
||||
re_str += '<!EndOfComponent>\n'
|
||||
re_str += '</Nnet>\n'
|
||||
|
||||
return re_str
|
||||
@@ -11,6 +11,7 @@ from modelscope.models.builder import MODELS
|
||||
from modelscope.utils.audio.audio_utils import update_conf
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .fsmn_sele_v2 import FSMNSeleNetV2
|
||||
from .fsmn_sele_v3 import FSMNSeleNetV3
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
@@ -18,6 +19,7 @@ from .fsmn_sele_v2 import FSMNSeleNetV2
|
||||
class FSMNSeleNetV2Decorator(TorchModel):
|
||||
r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """
|
||||
|
||||
MODEL_CLASS = FSMNSeleNetV2
|
||||
MODEL_TXT = 'model.txt'
|
||||
SC_CONFIG = 'sound_connect.conf'
|
||||
|
||||
@@ -33,7 +35,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
"""
|
||||
super().__init__(model_dir, *args, **kwargs)
|
||||
if training:
-            self.model = FSMNSeleNetV2(*args, **kwargs)
+            self.model = self.MODEL_CLASS(*args, **kwargs)
|
||||
else:
|
||||
sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
|
||||
model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
|
||||
@@ -42,7 +44,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
|
||||
self._sc = None
|
||||
if os.path.exists(model_txt_file):
-                conf_dict = dict(mode=56542, kws_model=model_txt_file)
+                conf_dict = dict(kws_model=model_txt_file)
|
||||
update_conf(sc_config_file, new_config_file, conf_dict)
|
||||
import py_sound_connect
|
||||
self._sc = py_sound_connect.SoundConnect(new_config_file)
|
||||
@@ -50,8 +52,8 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
self.size_out = self._sc.bytesPerBlockOut()
|
||||
else:
|
||||
raise Exception(
-                    f'Invalid model directory! Failed to load model file: {model_txt_file}.'
-                )
+                    f'Invalid model directory! Failed to load model file:'
+                    f' {model_txt_file}.')
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'tmp_dir'):
|
||||
@@ -73,3 +75,24 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
'confidence': self._sc.kwsConfidence()
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.keyword_spotting,
|
||||
module_name=Models.speech_dfsmn_kws_char_farfield_iot)
|
||||
class FSMNSeleNetV3Decorator(FSMNSeleNetV2Decorator):
|
||||
r""" A decorator of FSMNSeleNetV3 for integrating into modelscope framework """
|
||||
|
||||
MODEL_CLASS = FSMNSeleNetV3
|
||||
|
||||
def __init__(self,
|
||||
model_dir: str,
|
||||
training: Optional[bool] = False,
|
||||
*args,
|
||||
**kwargs):
|
||||
"""initialize the dfsmn model from the `model_dir` path.
|
||||
|
||||
Args:
|
||||
model_dir (str): the model path.
|
||||
"""
|
||||
super().__init__(model_dir, training, *args, **kwargs)
|
||||
|
||||
@@ -76,11 +76,13 @@ class CAMPPlus(nn.Module):
|
||||
bn_size=4,
|
||||
init_channels=128,
|
||||
config_str='batchnorm-relu',
|
||||
-                 memory_efficient=True):
+                 memory_efficient=True,
+                 output_level='segment'):
|
||||
super(CAMPPlus, self).__init__()
|
||||
|
||||
self.head = FCM(feat_dim=feat_dim)
|
||||
channels = self.head.out_channels
|
||||
self.output_level = output_level
|
||||
|
||||
self.xvector = nn.Sequential(
|
||||
OrderedDict([
|
||||
@@ -118,10 +120,14 @@ class CAMPPlus(nn.Module):
|
||||
self.xvector.add_module('out_nonlinear',
|
||||
get_nonlinear(config_str, channels))
|
||||
|
||||
if self.output_level == 'segment':
|
||||
self.xvector.add_module('stats', StatsPool())
|
||||
self.xvector.add_module(
|
||||
'dense',
|
||||
-            DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
+            DenseLayer(
+                channels * 2, embedding_size, config_str='batchnorm_'))
|
||||
else:
|
||||
assert self.output_level == 'frame', '`output_level` should be set to \'segment\' or \'frame\'. '
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, (nn.Conv1d, nn.Linear)):
|
||||
@@ -133,6 +139,8 @@ class CAMPPlus(nn.Module):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
x = self.head(x)
|
||||
x = self.xvector(x)
|
||||
if self.output_level == 'frame':
|
||||
x = x.transpose(1, 2)
|
||||
return x
|
||||
|
||||
|
||||
|
||||
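# Illustrative sketch of the new `output_level` switch above, not from the
# original commit. Constructor defaults other than the keywords passed here are
# assumed; the import path matches the one used elsewhere in this commit.
import torch
from modelscope.models.audio.sv.DTDNN import CAMPPlus

feats = torch.randn(2, 200, 80)                             # (batch, frames, fbank bins)
segment_model = CAMPPlus(feat_dim=80)                       # default: one embedding per utterance
frame_model = CAMPPlus(feat_dim=80, output_level='frame')   # per-frame features, (B, T', C)
print(segment_model(feats).shape, frame_model(feats).shape)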
344
modelscope/models/audio/sv/ERes2Net.py
Normal file
@@ -0,0 +1,344 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||
ERes2Net incorporates both local and global feature fusion techniques to improve the performance. The local feature
|
||||
fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
||||
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
||||
"""
|
||||
import math
|
||||
import os
|
||||
from typing import Any, Dict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchaudio.compliance.kaldi as Kaldi
|
||||
|
||||
import modelscope.models.audio.sv.pooling_layers as pooling_layers
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import MODELS, TorchModel
|
||||
from modelscope.models.audio.sv.fusion import AFF
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
class ReLU(nn.Hardtanh):
|
||||
|
||||
def __init__(self, inplace=False):
|
||||
super(ReLU, self).__init__(0, 20, inplace)
|
||||
|
||||
def __repr__(self):
|
||||
inplace_str = 'inplace' if self.inplace else ''
|
||||
return self.__class__.__name__ + ' (' \
|
||||
+ inplace_str + ')'
|
||||
|
||||
|
||||
def conv1x1(in_planes, out_planes, stride=1):
|
||||
'1x1 convolution without padding'
|
||||
return nn.Conv2d(
|
||||
in_planes,
|
||||
out_planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
padding=0,
|
||||
bias=False)
|
||||
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1):
|
||||
'3x3 convolution with padding'
|
||||
return nn.Conv2d(
|
||||
in_planes,
|
||||
out_planes,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
padding=1,
|
||||
bias=False)
|
||||
|
||||
|
||||
class BasicBlockRes2Net(nn.Module):
|
||||
expansion = 2
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||
super(BasicBlockRes2Net, self).__init__()
|
||||
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||
self.conv1 = conv1x1(in_planes, width * scale, stride)
|
||||
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||
self.nums = scale
|
||||
|
||||
convs = []
|
||||
bns = []
|
||||
for i in range(self.nums):
|
||||
convs.append(conv3x3(width, width))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = conv1x1(width * scale, planes * self.expansion)
|
||||
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_planes,
|
||||
self.expansion * planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias=False), nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out, self.width, 1)
|
||||
for i in range(self.nums):
|
||||
if i == 0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = sp + spx[i]
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i == 0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out, sp), 1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class BasicBlockRes2Net_diff_AFF(nn.Module):
|
||||
expansion = 2
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||
super(BasicBlockRes2Net_diff_AFF, self).__init__()
|
||||
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||
self.conv1 = conv1x1(in_planes, width * scale, stride)
|
||||
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||
self.nums = scale
|
||||
|
||||
convs = []
|
||||
fuse_models = []
|
||||
bns = []
|
||||
for i in range(self.nums):
|
||||
convs.append(conv3x3(width, width))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
for j in range(self.nums - 1):
|
||||
fuse_models.append(AFF(channels=width))
|
||||
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.fuse_models = nn.ModuleList(fuse_models)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = conv1x1(width * scale, planes * self.expansion)
|
||||
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_planes,
|
||||
self.expansion * planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias=False), nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out, self.width, 1)
|
||||
for i in range(self.nums):
|
||||
if i == 0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = self.fuse_models[i - 1](sp, spx[i])
|
||||
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i == 0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out, sp), 1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ERes2Net(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
block=BasicBlockRes2Net,
|
||||
block_fuse=BasicBlockRes2Net_diff_AFF,
|
||||
num_blocks=[3, 4, 6, 3],
|
||||
m_channels=32,
|
||||
feat_dim=80,
|
||||
embed_dim=192,
|
||||
pooling_func='TSTP',
|
||||
two_emb_layer=False):
|
||||
super(ERes2Net, self).__init__()
|
||||
self.in_planes = m_channels
|
||||
self.feat_dim = feat_dim
|
||||
self.embed_dim = embed_dim
|
||||
self.stats_dim = int(feat_dim / 8) * m_channels * 8
|
||||
self.two_emb_layer = two_emb_layer
|
||||
|
||||
self.conv1 = nn.Conv2d(
|
||||
1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(m_channels)
|
||||
self.layer1 = self._make_layer(
|
||||
block, m_channels, num_blocks[0], stride=1)
|
||||
self.layer2 = self._make_layer(
|
||||
block, m_channels * 2, num_blocks[1], stride=2)
|
||||
self.layer3 = self._make_layer(
|
||||
block_fuse, m_channels * 4, num_blocks[2], stride=2)
|
||||
self.layer4 = self._make_layer(
|
||||
block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
||||
|
||||
# downsampling
|
||||
self.layer1_downsample = nn.Conv2d(
|
||||
m_channels * 2,
|
||||
m_channels * 4,
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
bias=False)
|
||||
self.layer2_downsample = nn.Conv2d(
|
||||
m_channels * 4,
|
||||
m_channels * 8,
|
||||
kernel_size=3,
|
||||
padding=1,
|
||||
stride=2,
|
||||
bias=False)
|
||||
self.layer3_downsample = nn.Conv2d(
|
||||
m_channels * 8,
|
||||
m_channels * 16,
|
||||
kernel_size=3,
|
||||
padding=1,
|
||||
stride=2,
|
||||
bias=False)
|
||||
|
||||
# bottom-up fusion
|
||||
self.fuse_mode12 = AFF(channels=m_channels * 4)
|
||||
self.fuse_mode123 = AFF(channels=m_channels * 8)
|
||||
self.fuse_mode1234 = AFF(channels=m_channels * 16)
|
||||
|
||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2
|
||||
self.pool = getattr(pooling_layers, pooling_func)(
|
||||
in_dim=self.stats_dim * block.expansion)
|
||||
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
|
||||
embed_dim)
|
||||
if self.two_emb_layer:
|
||||
self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False)
|
||||
self.seg_2 = nn.Linear(embed_dim, embed_dim)
|
||||
else:
|
||||
self.seg_bn_1 = nn.Identity()
|
||||
self.seg_2 = nn.Identity()
|
||||
|
||||
def _make_layer(self, block, planes, num_blocks, stride):
|
||||
strides = [stride] + [1] * (num_blocks - 1)
|
||||
layers = []
|
||||
for stride in strides:
|
||||
layers.append(block(self.in_planes, planes, stride))
|
||||
self.in_planes = planes * block.expansion
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.permute(0, 2, 1)
|
||||
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
|
||||
# bottom-up fusion
|
||||
out2 = self.layer2(out1)
|
||||
out1_downsample = self.layer1_downsample(out1)
|
||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||
|
||||
out3 = self.layer3(out2)
|
||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||
|
||||
out4 = self.layer4(out3)
|
||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
|
||||
stats = self.pool(fuse_out1234)
|
||||
|
||||
embed_a = self.seg_1(stats)
|
||||
if self.two_emb_layer:
|
||||
out = F.relu(embed_a)
|
||||
out = self.seg_bn_1(out)
|
||||
embed_b = self.seg_2(out)
|
||||
return embed_b
|
||||
else:
|
||||
return embed_a
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.speaker_verification, module_name=Models.eres2net_sv)
|
||||
class SpeakerVerificationERes2Net(TorchModel):
|
||||
r"""Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
|
||||
    of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthens the local information
    interaction. The GFF fuses multi-scale feature maps in a bottom-up pathway to obtain global information.
|
||||
Args:
|
||||
model_dir: A model dir.
|
||||
model_config: The model config.
|
||||
"""
|
||||
|
||||
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
|
||||
**kwargs):
|
||||
super().__init__(model_dir, model_config, *args, **kwargs)
|
||||
self.model_config = model_config
|
||||
self.other_config = kwargs
|
||||
self.feature_dim = 80
|
||||
|
||||
self.embedding_model = ERes2Net()
|
||||
|
||||
pretrained_model_name = kwargs['pretrained_model']
|
||||
self.__load_check_point(pretrained_model_name)
|
||||
|
||||
self.embedding_model.eval()
|
||||
|
||||
def forward(self, audio):
|
||||
assert len(audio.shape) == 2 and audio.shape[
|
||||
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
|
||||
# audio shape: [1, T]
|
||||
feature = self.__extract_feature(audio)
|
||||
embedding = self.embedding_model(feature)
|
||||
|
||||
return embedding
|
||||
|
||||
def __extract_feature(self, audio):
|
||||
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
|
||||
feature = feature - feature.mean(dim=0, keepdim=True)
|
||||
feature = feature.unsqueeze(0)
|
||||
return feature
|
||||
|
||||
def __load_check_point(self, pretrained_model_name, device=None):
|
||||
if not device:
|
||||
device = torch.device('cpu')
|
||||
self.embedding_model.load_state_dict(
|
||||
torch.load(
|
||||
os.path.join(self.model_dir, pretrained_model_name),
|
||||
map_location=device),
|
||||
strict=True)
|
||||
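# Illustrative sketch, not from the original commit. With the defaults above
# (feat_dim=80, embed_dim=192, two_emb_layer=False) the backbone maps fbank
# features of shape (B, T, 80) to a (B, 192) utterance embedding.
_net = ERes2Net()
_emb = _net(torch.randn(1, 200, 80))   # (batch, frames, mel bins)
print(_emb.shape)                      # expected: torch.Size([1, 192])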
32
modelscope/models/audio/sv/fusion.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class AFF(nn.Module):
|
||||
|
||||
def __init__(self, channels=64, r=4):
|
||||
super(AFF, self).__init__()
|
||||
inter_channels = int(channels // r)
|
||||
|
||||
self.local_att = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
channels * 2,
|
||||
inter_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0),
|
||||
nn.BatchNorm2d(inter_channels),
|
||||
nn.SiLU(inplace=True),
|
||||
nn.Conv2d(
|
||||
inter_channels, channels, kernel_size=1, stride=1, padding=0),
|
||||
nn.BatchNorm2d(channels),
|
||||
)
|
||||
|
||||
def forward(self, x, ds_y):
|
||||
xa = torch.cat((x, ds_y), dim=1)
|
||||
x_att = self.local_att(xa)
|
||||
x_att = 1.0 + torch.tanh(x_att)
|
||||
xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
|
||||
|
||||
return xo
|
||||
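# Illustrative sketch, not from the original commit. AFF blends two feature maps
# of identical shape with a learned, tanh-bounded attention weight.
_aff = AFF(channels=64)
_x = torch.randn(2, 64, 40, 50)      # (B, C, F, T) features from the current stage
_y = torch.randn(2, 64, 40, 50)      # downsampled features projected to the same shape
print(_aff(_x, _y).shape)            # same shape as the inputs: torch.Size([2, 64, 40, 50])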
107
modelscope/models/audio/sv/pooling_layers.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class TAP(nn.Module):
|
||||
"""
|
||||
Temporal average pooling, only first-order mean is considered
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(TAP, self).__init__()
|
||||
|
||||
def forward(self, x):
|
||||
pooling_mean = x.mean(dim=-1)
|
||||
        # To be compatible with 2D input
|
||||
pooling_mean = pooling_mean.flatten(start_dim=1)
|
||||
return pooling_mean
|
||||
|
||||
|
||||
class TSDP(nn.Module):
|
||||
"""
|
||||
Temporal standard deviation pooling, only second-order std is considered
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(TSDP, self).__init__()
|
||||
|
||||
def forward(self, x):
|
||||
# The last dimension is the temporal axis
|
||||
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
|
||||
pooling_std = pooling_std.flatten(start_dim=1)
|
||||
return pooling_std
|
||||
|
||||
|
||||
class TSTP(nn.Module):
|
||||
"""
|
||||
Temporal statistics pooling, concatenate mean and std, which is used in
|
||||
x-vector
|
||||
Comment: simple concatenation can not make full use of both statistics
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(TSTP, self).__init__()
|
||||
|
||||
def forward(self, x):
|
||||
# The last dimension is the temporal axis
|
||||
pooling_mean = x.mean(dim=-1)
|
||||
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
|
||||
pooling_mean = pooling_mean.flatten(start_dim=1)
|
||||
pooling_std = pooling_std.flatten(start_dim=1)
|
||||
|
||||
stats = torch.cat((pooling_mean, pooling_std), 1)
|
||||
return stats
|
||||
|
||||
|
||||
class ASTP(nn.Module):
|
||||
""" Attentive statistics pooling: Channel- and context-dependent
|
||||
statistics pooling, first used in ECAPA_TDNN.
|
||||
"""
|
||||
|
||||
def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
|
||||
super(ASTP, self).__init__()
|
||||
self.global_context_att = global_context_att
|
||||
|
||||
# Use Conv1d with stride == 1 rather than Linear, then we don't
|
||||
# need to transpose inputs.
|
||||
if global_context_att:
|
||||
self.linear1 = nn.Conv1d(
|
||||
in_dim * 3, bottleneck_dim,
|
||||
kernel_size=1) # equals W and b in the paper
|
||||
else:
|
||||
self.linear1 = nn.Conv1d(
|
||||
in_dim, bottleneck_dim,
|
||||
kernel_size=1) # equals W and b in the paper
|
||||
self.linear2 = nn.Conv1d(
|
||||
bottleneck_dim, in_dim,
|
||||
kernel_size=1) # equals V and k in the paper
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
|
||||
or a 4-dimensional tensor in resnet architecture (B,C,F,T)
|
||||
0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
|
||||
"""
|
||||
if len(x.shape) == 4:
|
||||
x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
|
||||
assert len(x.shape) == 3
|
||||
|
||||
if self.global_context_att:
|
||||
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
|
||||
context_std = torch.sqrt(
|
||||
torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
|
||||
x_in = torch.cat((x, context_mean, context_std), dim=1)
|
||||
else:
|
||||
x_in = x
|
||||
|
||||
# DON'T use ReLU here! ReLU may be hard to converge.
|
||||
alpha = torch.tanh(
|
||||
self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
|
||||
alpha = torch.softmax(self.linear2(alpha), dim=2)
|
||||
mean = torch.sum(alpha * x, dim=2)
|
||||
var = torch.sum(alpha * (x**2), dim=2) - mean**2
|
||||
std = torch.sqrt(var.clamp(min=1e-10))
|
||||
return torch.cat([mean, std], dim=1)
|
||||
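# Illustrative shape summary, not from the original commit. For a (B, F, T)
# input with F=256 feature channels, the pooling layers above return:
#   TAP  -> (B, 256)   mean only
#   TSDP -> (B, 256)   std only
#   TSTP -> (B, 512)   mean and std concatenated
#   ASTP -> (B, 512)   attention-weighted mean and std
_x = torch.randn(4, 256, 100)
print(TSTP()(_x).shape, ASTP(in_dim=256)(_x).shape)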
573
modelscope/models/audio/sv/rdino.py
Normal file
@@ -0,0 +1,573 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
|
||||
RDINOHead implementation is adapted from DINO framework.
|
||||
"""
|
||||
import math
|
||||
import os
|
||||
from typing import Any, Dict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchaudio.compliance.kaldi as Kaldi
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import MODELS, TorchModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
def length_to_mask(length, max_len=None, dtype=None, device=None):
|
||||
assert len(length.shape) == 1
|
||||
|
||||
if max_len is None:
|
||||
max_len = length.max().long().item()
|
||||
mask = torch.arange(
|
||||
max_len, device=length.device, dtype=length.dtype).expand(
|
||||
len(length), max_len) < length.unsqueeze(1)
|
||||
|
||||
if dtype is None:
|
||||
dtype = length.dtype
|
||||
|
||||
if device is None:
|
||||
device = length.device
|
||||
|
||||
mask = torch.as_tensor(mask, dtype=dtype, device=device)
|
||||
return mask
|
||||
|
||||
|
||||
def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
|
||||
if stride > 1:
|
||||
n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
|
||||
L_out = stride * (n_steps - 1) + kernel_size * dilation
|
||||
padding = [kernel_size // 2, kernel_size // 2]
|
||||
|
||||
else:
|
||||
L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
|
||||
|
||||
padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
|
||||
return padding
|
||||
|
||||
|
||||
class Conv1d(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
in_channels,
|
||||
stride=1,
|
||||
dilation=1,
|
||||
padding='same',
|
||||
groups=1,
|
||||
bias=True,
|
||||
padding_mode='reflect',
|
||||
):
|
||||
super().__init__()
|
||||
self.kernel_size = kernel_size
|
||||
self.stride = stride
|
||||
self.dilation = dilation
|
||||
self.padding = padding
|
||||
self.padding_mode = padding_mode
|
||||
|
||||
self.conv = nn.Conv1d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
self.kernel_size,
|
||||
stride=self.stride,
|
||||
dilation=self.dilation,
|
||||
padding=0,
|
||||
groups=groups,
|
||||
bias=bias,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
if self.padding == 'same':
|
||||
x = self._manage_padding(x, self.kernel_size, self.dilation,
|
||||
self.stride)
|
||||
|
||||
elif self.padding == 'causal':
|
||||
num_pad = (self.kernel_size - 1) * self.dilation
|
||||
x = F.pad(x, (num_pad, 0))
|
||||
|
||||
elif self.padding == 'valid':
|
||||
pass
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"Padding must be 'same', 'valid' or 'causal'. Got "
|
||||
+ self.padding)
|
||||
|
||||
wx = self.conv(x)
|
||||
|
||||
return wx
|
||||
|
||||
def _manage_padding(
|
||||
self,
|
||||
x,
|
||||
kernel_size: int,
|
||||
dilation: int,
|
||||
stride: int,
|
||||
):
|
||||
L_in = x.shape[-1]
|
||||
padding = get_padding_elem(L_in, stride, kernel_size, dilation)
|
||||
x = F.pad(x, padding, mode=self.padding_mode)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
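def _conv1d_same_padding_check():
    # Illustrative sketch, not from the original commit. With padding='same' and
    # stride=1, the Conv1d wrapper above pads (via get_padding_elem) so that the
    # time dimension is preserved, even with dilation > 1.
    conv = Conv1d(out_channels=8, kernel_size=5, in_channels=4, dilation=2)
    x = torch.randn(2, 4, 100)   # (batch, channels, time)
    print(conv(x).shape)         # expected: torch.Size([2, 8, 100])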
class BatchNorm1d(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size,
|
||||
eps=1e-05,
|
||||
momentum=0.1,
|
||||
):
|
||||
super().__init__()
|
||||
self.norm = nn.BatchNorm1d(
|
||||
input_size,
|
||||
eps=eps,
|
||||
momentum=momentum,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.norm(x)
|
||||
|
||||
|
||||
class TDNNBlock(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
dilation,
|
||||
activation=nn.ReLU,
|
||||
groups=1,
|
||||
):
|
||||
super(TDNNBlock, self).__init__()
|
||||
self.conv = Conv1d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
)
|
||||
self.activation = activation()
|
||||
self.norm = BatchNorm1d(input_size=out_channels)
|
||||
|
||||
def forward(self, x):
|
||||
return self.norm(self.activation(self.conv(x)))
|
||||
|
||||
|
||||
class Res2NetBlock(torch.nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
scale=8,
|
||||
kernel_size=3,
|
||||
dilation=1):
|
||||
super(Res2NetBlock, self).__init__()
|
||||
assert in_channels % scale == 0
|
||||
assert out_channels % scale == 0
|
||||
|
||||
in_channel = in_channels // scale
|
||||
hidden_channel = out_channels // scale
|
||||
|
||||
self.blocks = nn.ModuleList([
|
||||
TDNNBlock(
|
||||
in_channel,
|
||||
hidden_channel,
|
||||
kernel_size=kernel_size,
|
||||
dilation=dilation,
|
||||
) for i in range(scale - 1)
|
||||
])
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
y = []
|
||||
for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
|
||||
if i == 0:
|
||||
y_i = x_i
|
||||
elif i == 1:
|
||||
y_i = self.blocks[i - 1](x_i)
|
||||
else:
|
||||
y_i = self.blocks[i - 1](x_i + y_i)
|
||||
y.append(y_i)
|
||||
y = torch.cat(y, dim=1)
|
||||
return y
|
||||
|
||||
|
||||
class SEBlock(nn.Module):
|
||||
|
||||
def __init__(self, in_channels, se_channels, out_channels):
|
||||
super(SEBlock, self).__init__()
|
||||
|
||||
self.conv1 = Conv1d(
|
||||
in_channels=in_channels, out_channels=se_channels, kernel_size=1)
|
||||
self.relu = torch.nn.ReLU(inplace=True)
|
||||
self.conv2 = Conv1d(
|
||||
in_channels=se_channels, out_channels=out_channels, kernel_size=1)
|
||||
self.sigmoid = torch.nn.Sigmoid()
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
L = x.shape[-1]
|
||||
if lengths is not None:
|
||||
mask = length_to_mask(lengths * L, max_len=L, device=x.device)
|
||||
mask = mask.unsqueeze(1)
|
||||
total = mask.sum(dim=2, keepdim=True)
|
||||
s = (x * mask).sum(dim=2, keepdim=True) / total
|
||||
else:
|
||||
s = x.mean(dim=2, keepdim=True)
|
||||
|
||||
s = self.relu(self.conv1(s))
|
||||
s = self.sigmoid(self.conv2(s))
|
||||
|
||||
return s * x
|
||||
|
||||
|
||||
class AttentiveStatisticsPooling(nn.Module):
|
||||
|
||||
def __init__(self, channels, attention_channels=128, global_context=True):
|
||||
super().__init__()
|
||||
|
||||
self.eps = 1e-12
|
||||
self.global_context = global_context
|
||||
if global_context:
|
||||
self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
|
||||
else:
|
||||
self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
|
||||
self.tanh = nn.Tanh()
|
||||
self.conv = Conv1d(
|
||||
in_channels=attention_channels,
|
||||
out_channels=channels,
|
||||
kernel_size=1)
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
L = x.shape[-1]
|
||||
|
||||
def _compute_statistics(x, m, dim=2, eps=self.eps):
|
||||
mean = (m * x).sum(dim)
|
||||
std = torch.sqrt(
|
||||
(m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
|
||||
return mean, std
|
||||
|
||||
if lengths is None:
|
||||
lengths = torch.ones(x.shape[0], device=x.device)
|
||||
|
||||
# Make binary mask of shape [N, 1, L]
|
||||
mask = length_to_mask(lengths * L, max_len=L, device=x.device)
|
||||
mask = mask.unsqueeze(1)
|
||||
|
||||
# Expand the temporal context of the pooling layer by allowing the
|
||||
# self-attention to look at global properties of the utterance.
|
||||
if self.global_context:
|
||||
# torch.std is unstable for backward computation
|
||||
# https://github.com/pytorch/pytorch/issues/4320
|
||||
total = mask.sum(dim=2, keepdim=True).float()
|
||||
mean, std = _compute_statistics(x, mask / total)
|
||||
mean = mean.unsqueeze(2).repeat(1, 1, L)
|
||||
std = std.unsqueeze(2).repeat(1, 1, L)
|
||||
attn = torch.cat([x, mean, std], dim=1)
|
||||
else:
|
||||
attn = x
|
||||
|
||||
# Apply layers
|
||||
attn = self.conv(self.tanh(self.tdnn(attn)))
|
||||
|
||||
# Filter out zero-paddings
|
||||
attn = attn.masked_fill(mask == 0, float('-inf'))
|
||||
|
||||
attn = F.softmax(attn, dim=2)
|
||||
mean, std = _compute_statistics(x, attn)
|
||||
# Append mean and std of the batch
|
||||
pooled_stats = torch.cat((mean, std), dim=1)
|
||||
pooled_stats = pooled_stats.unsqueeze(2)
|
||||
|
||||
return pooled_stats
|
||||
|
||||
|
||||
class SERes2NetBlock(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
res2net_scale=8,
|
||||
se_channels=128,
|
||||
kernel_size=1,
|
||||
dilation=1,
|
||||
activation=torch.nn.ReLU,
|
||||
groups=1,
|
||||
):
|
||||
super().__init__()
|
||||
self.out_channels = out_channels
|
||||
self.tdnn1 = TDNNBlock(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=1,
|
||||
dilation=1,
|
||||
activation=activation,
|
||||
groups=groups,
|
||||
)
|
||||
self.res2net_block = Res2NetBlock(out_channels, out_channels,
|
||||
res2net_scale, kernel_size, dilation)
|
||||
self.tdnn2 = TDNNBlock(
|
||||
out_channels,
|
||||
out_channels,
|
||||
kernel_size=1,
|
||||
dilation=1,
|
||||
activation=activation,
|
||||
groups=groups,
|
||||
)
|
||||
self.se_block = SEBlock(out_channels, se_channels, out_channels)
|
||||
|
||||
self.shortcut = None
|
||||
if in_channels != out_channels:
|
||||
self.shortcut = Conv1d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
)
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
residual = x
|
||||
if self.shortcut:
|
||||
residual = self.shortcut(x)
|
||||
|
||||
x = self.tdnn1(x)
|
||||
x = self.res2net_block(x)
|
||||
x = self.tdnn2(x)
|
||||
x = self.se_block(x, lengths)
|
||||
|
||||
return x + residual
|
||||
|
||||
|
||||
class ECAPA_TDNN(nn.Module):
|
||||
"""An implementation of the speaker embedding model in a paper.
|
||||
"ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
|
||||
TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size,
|
||||
device='cpu',
|
||||
lin_neurons=512,
|
||||
activation=torch.nn.ReLU,
|
||||
channels=[512, 512, 512, 512, 1536],
|
||||
kernel_sizes=[5, 3, 3, 3, 1],
|
||||
dilations=[1, 2, 3, 4, 1],
|
||||
attention_channels=128,
|
||||
res2net_scale=8,
|
||||
se_channels=128,
|
||||
global_context=True,
|
||||
groups=[1, 1, 1, 1, 1],
|
||||
):
|
||||
|
||||
super().__init__()
|
||||
assert len(channels) == len(kernel_sizes)
|
||||
assert len(channels) == len(dilations)
|
||||
self.channels = channels
|
||||
self.blocks = nn.ModuleList()
|
||||
|
||||
# The initial TDNN layer
|
||||
self.blocks.append(
|
||||
TDNNBlock(
|
||||
input_size,
|
||||
channels[0],
|
||||
kernel_sizes[0],
|
||||
dilations[0],
|
||||
activation,
|
||||
groups[0],
|
||||
))
|
||||
|
||||
# SE-Res2Net layers
|
||||
for i in range(1, len(channels) - 1):
|
||||
self.blocks.append(
|
||||
SERes2NetBlock(
|
||||
channels[i - 1],
|
||||
channels[i],
|
||||
res2net_scale=res2net_scale,
|
||||
se_channels=se_channels,
|
||||
kernel_size=kernel_sizes[i],
|
||||
dilation=dilations[i],
|
||||
activation=activation,
|
||||
groups=groups[i],
|
||||
))
|
||||
|
||||
# Multi-layer feature aggregation
|
||||
self.mfa = TDNNBlock(
|
||||
channels[-1],
|
||||
channels[-1],
|
||||
kernel_sizes[-1],
|
||||
dilations[-1],
|
||||
activation,
|
||||
groups=groups[-1],
|
||||
)
|
||||
|
||||
# Attentive Statistical Pooling
|
||||
self.asp = AttentiveStatisticsPooling(
|
||||
channels[-1],
|
||||
attention_channels=attention_channels,
|
||||
global_context=global_context,
|
||||
)
|
||||
self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
|
||||
|
||||
# Final linear transformation
|
||||
self.fc = Conv1d(
|
||||
in_channels=channels[-1] * 2,
|
||||
out_channels=lin_neurons,
|
||||
kernel_size=1,
|
||||
)
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
"""Returns the embedding vector.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
x : torch.Tensor
|
||||
Tensor of shape (batch, time, channel).
|
||||
"""
|
||||
x = x.transpose(1, 2)
|
||||
|
||||
xl = []
|
||||
for layer in self.blocks:
|
||||
try:
|
||||
x = layer(x, lengths=lengths)
|
||||
except TypeError:
|
||||
x = layer(x)
|
||||
xl.append(x)
|
||||
|
||||
# Multi-layer feature aggregation
|
||||
x = torch.cat(xl[1:], dim=1)
|
||||
x = self.mfa(x)
|
||||
|
||||
# Attentive Statistical Pooling
|
||||
x = self.asp(x, lengths=lengths)
|
||||
x = self.asp_bn(x)
|
||||
|
||||
# Final linear transformation
|
||||
x = self.fc(x)
|
||||
|
||||
x = x.transpose(1, 2).squeeze(1)
|
||||
return x
|
||||
|
||||
|
||||
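def _ecapa_tdnn_shape_check():
    # Illustrative sketch, not from the original commit. With the default
    # lin_neurons=512, the backbone maps (batch, time, feature) fbank input to a
    # (batch, 512) utterance embedding.
    model = ECAPA_TDNN(input_size=80)
    emb = model(torch.randn(2, 200, 80))
    print(emb.shape)   # expected: torch.Size([2, 512])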
class RDINOHead(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
in_dim,
|
||||
out_dim,
|
||||
use_bn=False,
|
||||
norm_last_layer=True,
|
||||
nlayers=3,
|
||||
hidden_dim=2048,
|
||||
bottleneck_dim=256,
|
||||
add_dim=8192):
|
||||
super().__init__()
|
||||
nlayers = max(nlayers, 1)
|
||||
if nlayers == 1:
|
||||
self.mlp = nn.Linear(in_dim, bottleneck_dim)
|
||||
else:
|
||||
layers = [nn.Linear(in_dim, hidden_dim)]
|
||||
if use_bn:
|
||||
layers.append(nn.BatchNorm1d(hidden_dim))
|
||||
layers.append(nn.GELU())
|
||||
for _ in range(nlayers - 2):
|
||||
layers.append(nn.Linear(hidden_dim, hidden_dim))
|
||||
if use_bn:
|
||||
layers.append(nn.BatchNorm1d(hidden_dim))
|
||||
layers.append(nn.GELU())
|
||||
|
||||
layers.append(nn.Linear(hidden_dim, add_dim))
|
||||
self.mlp = nn.Sequential(*layers)
|
||||
self.add_layer = nn.Linear(add_dim, bottleneck_dim)
|
||||
self.apply(self._init_weights)
|
||||
self.last_layer = nn.utils.weight_norm(
|
||||
nn.Linear(bottleneck_dim, out_dim, bias=False))
|
||||
self.last_layer.weight_g.data.fill_(1)
|
||||
if norm_last_layer:
|
||||
self.last_layer.weight_g.requires_grad = False
|
||||
|
||||
def _init_weights(self, m):
|
||||
if isinstance(m, nn.Linear):
|
||||
torch.nn.init.trunc_normal_(m.weight, std=.02)
|
||||
if isinstance(m, nn.Linear) and m.bias is not None:
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
def forward(self, x):
|
||||
vicr_out = self.mlp(x)
|
||||
x = self.add_layer(vicr_out)
|
||||
x = nn.functional.normalize(x, dim=-1, p=2)
|
||||
x = self.last_layer(x)
|
||||
return vicr_out, x
|
||||
|
||||
|
||||
class Combine(nn.Module):
|
||||
|
||||
def __init__(self, backbone, head):
|
||||
super(Combine, self).__init__()
|
||||
self.backbone = backbone
|
||||
self.head = head
|
||||
|
||||
def forward(self, x):
|
||||
x = self.backbone(x)
|
||||
output = self.head(x)
|
||||
return output
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.speaker_verification, module_name=Models.rdino_tdnn_sv)
|
||||
class SpeakerVerification_RDINO(TorchModel):
|
||||
|
||||
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
|
||||
**kwargs):
|
||||
super().__init__(model_dir, model_config, *args, **kwargs)
|
||||
self.model_config = model_config
|
||||
self.other_config = kwargs
|
||||
if self.model_config['channel'] != 1024:
|
||||
raise ValueError(
|
||||
'modelscope error: Currently only 1024-channel ecapa tdnn is supported.'
|
||||
)
|
||||
|
||||
self.feature_dim = 80
|
||||
channels_config = [1024, 1024, 1024, 1024, 3072]
|
||||
|
||||
self.embedding_model = ECAPA_TDNN(
|
||||
self.feature_dim, channels=channels_config)
|
||||
self.embedding_model = Combine(self.embedding_model,
|
||||
RDINOHead(512, 65536, True))
|
||||
|
||||
pretrained_model_name = kwargs['pretrained_model']
|
||||
self.__load_check_point(pretrained_model_name)
|
||||
|
||||
self.embedding_model.eval()
|
||||
|
||||
def forward(self, audio):
|
||||
assert len(audio.shape) == 2 and audio.shape[
|
||||
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
|
||||
# audio shape: [1, T]
|
||||
feature = self.__extract_feature(audio)
|
||||
embedding = self.embedding_model.backbone(feature)
|
||||
|
||||
return embedding
|
||||
|
||||
def __extract_feature(self, audio):
|
||||
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
|
||||
feature = feature - feature.mean(dim=0, keepdim=True)
|
||||
feature = feature.unsqueeze(0)
|
||||
return feature
|
||||
|
||||
def __load_check_point(self, pretrained_model_name, device=None):
|
||||
if not device:
|
||||
device = torch.device('cpu')
|
||||
state_dict = torch.load(
|
||||
os.path.join(self.model_dir, pretrained_model_name),
|
||||
map_location=device)
|
||||
state_dict_tea = {
|
||||
k.replace('module.', ''): v
|
||||
for k, v in state_dict['teacher'].items()
|
||||
}
|
||||
self.embedding_model.load_state_dict(state_dict_tea, strict=True)
|
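The private __extract_feature above amounts to a Kaldi fbank plus per-utterance mean normalization; a standalone sketch with a dummy 16 kHz mono waveform:

import torch
import torchaudio.compliance.kaldi as Kaldi

audio = torch.randn(1, 16000)                  # [1, T] waveform, as required by forward()
feat = Kaldi.fbank(audio, num_mel_bins=80)     # (num_frames, 80) filterbank features
feat = feat - feat.mean(dim=0, keepdim=True)   # per-utterance mean normalization
feat = feat.unsqueeze(0)                       # (1, num_frames, 80), fed to the encoder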
||||
modelscope/models/audio/sv/speaker_change_locator.py  (new file, 319 lines)
@@ -0,0 +1,319 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import Any, Dict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchaudio.compliance.kaldi as Kaldi
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import MODELS, TorchModel
|
||||
from modelscope.models.audio.sv.DTDNN import CAMPPlus
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
class MultiHeadSelfAttention(nn.Module):
|
||||
|
||||
def __init__(self, n_units, h=8, dropout=0.1):
|
||||
super(MultiHeadSelfAttention, self).__init__()
|
||||
self.linearQ = nn.Linear(n_units, n_units)
|
||||
self.linearK = nn.Linear(n_units, n_units)
|
||||
self.linearV = nn.Linear(n_units, n_units)
|
||||
self.linearO = nn.Linear(n_units, n_units)
|
||||
self.d_k = n_units // h
|
||||
self.h = h
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
self.att = None
|
||||
|
||||
def forward(self, x, batch_size):
|
||||
# x: (BT, F)
|
||||
q = self.linearQ(x).reshape(batch_size, -1, self.h, self.d_k)
|
||||
k = self.linearK(x).reshape(batch_size, -1, self.h, self.d_k)
|
||||
v = self.linearV(x).reshape(batch_size, -1, self.h, self.d_k)
|
||||
scores = torch.matmul(q.transpose(1, 2), k.permute(
|
||||
0, 2, 3, 1)) / np.sqrt(self.d_k)
|
||||
# scores: (B, h, T, T)
|
||||
self.att = F.softmax(scores, dim=3)
|
||||
p_att = self.dropout(self.att)
|
||||
# v : (B, T, h, d_k)
|
||||
# p_att : (B, h, T, T)
|
||||
x = torch.matmul(p_att, v.transpose(1, 2))
|
||||
# x : (B, h, T, d_k)
|
||||
x = x.transpose(1, 2).reshape(-1, self.h * self.d_k)
|
||||
return self.linearO(x)
|
||||
|
||||
|
||||
class PositionwiseFeedForward(nn.Module):
|
||||
|
||||
def __init__(self, n_units, d_units, dropout):
|
||||
super(PositionwiseFeedForward, self).__init__()
|
||||
self.linear1 = nn.Linear(n_units, d_units)
|
||||
self.linear2 = nn.Linear(d_units, n_units)
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear2(self.dropout(F.relu(self.linear1(x))))
|
||||
|
||||
|
||||
class PosEncoding(nn.Module):
|
||||
|
||||
def __init__(self, max_seq_len, d_word_vec):
|
||||
super(PosEncoding, self).__init__()
|
||||
pos_enc = np.array([[
|
||||
pos / np.power(10000, 2.0 * (j // 2) / d_word_vec)
|
||||
for j in range(d_word_vec)
|
||||
] for pos in range(max_seq_len)])
|
||||
pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2])
|
||||
pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2])
|
||||
pad_row = np.zeros([1, d_word_vec])
|
||||
pos_enc = np.concatenate([pad_row, pos_enc]).astype(np.float32)
|
||||
|
||||
self.pos_enc = torch.nn.Embedding(max_seq_len + 1, d_word_vec)
|
||||
self.pos_enc.weight = torch.nn.Parameter(
|
||||
torch.from_numpy(pos_enc), requires_grad=False)
|
||||
|
||||
def forward(self, input_len):
|
||||
max_len = torch.max(input_len)
|
||||
input_pos = torch.LongTensor([
|
||||
list(range(1, len + 1)) + [0] * (max_len - len)
|
||||
for len in input_len
|
||||
])
|
||||
|
||||
return self.pos_enc(input_pos)
|
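A quick sketch of the positional table lookup; index 0 doubles as the padding position, so shorter sequences map to zero vectors beyond their length:

import torch

pos_enc = PosEncoding(max_seq_len=1000, d_word_vec=256)
emb = pos_enc(torch.tensor([5, 3]))   # -> (2, 5, 256); positions past length 3 in the
                                      #    second item hit the zero pad embedding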
||||
|
||||
|
||||
class TransformerEncoder(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
idim,
|
||||
n_units=256,
|
||||
n_layers=2,
|
||||
e_units=512,
|
||||
h=4,
|
||||
dropout=0.1):
|
||||
super(TransformerEncoder, self).__init__()
|
||||
self.linear_in = nn.Linear(idim, n_units)
|
||||
self.lnorm_in = nn.LayerNorm(n_units)
|
||||
|
||||
self.n_layers = n_layers
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
for i in range(n_layers):
|
||||
setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('self_att_', i),
|
||||
MultiHeadSelfAttention(n_units, h))
|
||||
setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('ff_', i),
|
||||
PositionwiseFeedForward(n_units, e_units, dropout))
|
||||
self.lnorm_out = nn.LayerNorm(n_units)
|
||||
|
||||
def forward(self, x):
|
||||
# x: [B, num_anchors, T, n_in]
|
||||
bs, num, tframe, dim = x.size()
|
||||
x = x.reshape(bs * num, tframe, -1) # [B*num_anchors, T, dim]
|
||||
# x: (B, T, F) ... batch, time, (mel)freq
|
||||
B_size, T_size, _ = x.shape
|
||||
# e: (BT, F)
|
||||
e = self.linear_in(x.reshape(B_size * T_size, -1))
|
||||
# Encoder stack
|
||||
for i in range(self.n_layers):
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
|
||||
# self-attention
|
||||
s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
|
||||
# positionwise feed-forward
|
||||
s = getattr(self, '{}{:d}'.format('ff_', i))(e)
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# final layer normalization
|
||||
# output: (BT, F)
|
||||
# output: (B, F, T)
|
||||
output = self.lnorm_out(e).reshape(B_size, T_size, -1)
|
||||
output = output.reshape(bs, num, tframe,
|
||||
-1) # [B, num_anchors, T, dim]
|
||||
return output
|
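Shape sketch for the anchor-conditioned encoder; idim matches the frame_dim + anchor_dim concatenation used by TransformerDetector below, and all sizes here are illustrative:

import torch

enc = TransformerEncoder(idim=512 + 192, n_units=256)
x = torch.randn(2, 2, 100, 512 + 192)   # [B, num_anchors, T, frame_dim + anchor_dim]
y = enc(x)                              # -> [2, 2, 100, 256]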
||||
|
||||
|
||||
class TransformerEncoder_out(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
idim,
|
||||
n_units=256,
|
||||
n_layers=2,
|
||||
e_units=512,
|
||||
h=4,
|
||||
dropout=0.1):
|
||||
super(TransformerEncoder_out, self).__init__()
|
||||
self.linear_in = nn.Linear(idim, n_units)
|
||||
self.lnorm_in = nn.LayerNorm(n_units)
|
||||
|
||||
self.n_layers = n_layers
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
for i in range(n_layers):
|
||||
setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('self_att_', i),
|
||||
MultiHeadSelfAttention(n_units, h))
|
||||
setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('ff_', i),
|
||||
PositionwiseFeedForward(n_units, e_units, dropout))
|
||||
self.lnorm_out = nn.LayerNorm(n_units)
|
||||
|
||||
def forward(self, x):
|
||||
# x: (B, T, F)
|
||||
B_size, T_size, _ = x.shape
|
||||
# e: (BT, F)
|
||||
e = self.linear_in(x.reshape(B_size * T_size, -1))
|
||||
# Encoder stack
|
||||
for i in range(self.n_layers):
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
|
||||
# self-attention
|
||||
s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
|
||||
# positionwise feed-forward
|
||||
s = getattr(self, '{}{:d}'.format('ff_', i))(e)
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# final layer normalization
|
||||
# output: (BT, F)
|
||||
# output: (B, T, F)
|
||||
output = self.lnorm_out(e).reshape(B_size, T_size, -1)
|
||||
return output
|
||||
|
||||
|
||||
class OutLayer(nn.Module):
|
||||
|
||||
def __init__(self, n_units=256, num_anchors=2):
|
||||
super(OutLayer, self).__init__()
|
||||
self.combine = TransformerEncoder_out(num_anchors * n_units, n_units)
|
||||
self.out_linear = nn.Linear(n_units // num_anchors, 1)
|
||||
|
||||
def forward(self, input):
|
||||
# input: [B, num_anchors, T, dim]
|
||||
bs, num, tframe, dim = input.size()
|
||||
output = input.permute(0, 2, 1,
|
||||
3).reshape(bs, tframe,
|
||||
-1) # [Bs, t, num_anchors*dim]
|
||||
output = self.combine(output) # [Bs, t, n_units]
|
||||
output = output.reshape(
|
||||
bs, tframe, num, -1) # [Bs, t, num_anchors, n_units//num_anchors]
|
||||
output = self.out_linear(output).squeeze(-1) # [Bs, t, num_anchors]
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class TransformerDetector(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
frame_dim=512,
|
||||
anchor_dim=192,
|
||||
hidden_dim=256,
|
||||
max_seq_len=1000):
|
||||
super(TransformerDetector, self).__init__()
|
||||
self.detection = TransformerEncoder(
|
||||
idim=frame_dim + anchor_dim, n_units=hidden_dim)
|
||||
self.output = OutLayer(n_units=hidden_dim)
|
||||
self.pos_enc = PosEncoding(max_seq_len, hidden_dim)
|
||||
|
||||
def forward(self, feats, anchors):
|
||||
# feats: [1, t, fdim]
|
||||
num_frames = feats.shape[1]
|
||||
num_anchors = anchors.shape[1]
|
||||
bs = feats.shape[0]
|
||||
feats = feats.unsqueeze(1).repeat(
|
||||
1, num_anchors, 1, 1) # shape: [Bs, num_anchors, t, fdim]
|
||||
anchors = anchors.unsqueeze(2).repeat(
|
||||
1, 1, num_frames, 1) # shape: [Bs, num_anchors, t, xdim]
|
||||
sd_in = torch.cat((feats, anchors),
|
||||
dim=-1) # shape: [Bs, num_anchors, t, fdim+xdim]
|
||||
sd_out = self.detection(sd_in) # shape: [Bs, num_anchors, t, sd_dim]
|
||||
|
||||
# pos
|
||||
pos_emb = self.pos_enc(torch.tensor([num_frames] * (bs * num_anchors)))
|
||||
pos_emb = pos_emb.reshape(bs, num_anchors, num_frames, -1)
|
||||
sd_out += pos_emb
|
||||
|
||||
# output
|
||||
output = self.output(sd_out) # shape: [Bs, t, num_anchors]
|
||||
|
||||
return output
|
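End-to-end shape sketch for the detector backend with random tensors; in the model class below the frame features come from the CAMPPlus encoder instead:

import torch

detector = TransformerDetector(frame_dim=512, anchor_dim=192)
feats = torch.randn(1, 100, 512)     # [Bs, t, frame_dim] frame-level features
anchors = torch.randn(1, 2, 192)     # [Bs, num_anchors, anchor_dim] speaker anchors
scores = detector(feats, anchors)    # -> [1, 100, 2] per-frame, per-anchor logits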
||||
|
||||
|
||||
@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.scl_sd)
|
||||
class SpeakerChangeLocatorTransformer(TorchModel):
|
||||
r"""A speaekr change locator using the transformer architecture as the backbone.
|
||||
Args:
|
||||
model_dir: A model dir.
|
||||
model_config: The model config.
|
||||
"""
|
||||
|
||||
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
|
||||
**kwargs):
|
||||
super().__init__(model_dir, model_config, *args, **kwargs)
|
||||
self.model_config = model_config
|
||||
|
||||
self.feature_dim = self.model_config['fbank_dim']
|
||||
frame_size = self.model_config['frame_size']
|
||||
anchor_size = self.model_config['anchor_size']
|
||||
|
||||
self.encoder = CAMPPlus(self.feature_dim, output_level='frame')
|
||||
self.backend = TransformerDetector(
|
||||
frame_dim=frame_size, anchor_dim=anchor_size)
|
||||
|
||||
pretrained_encoder = kwargs['pretrained_encoder']
|
||||
pretrained_backend = kwargs['pretrained_backend']
|
||||
|
||||
self.__load_check_point(pretrained_encoder, pretrained_backend)
|
||||
|
||||
self.encoder.eval()
|
||||
self.backend.eval()
|
||||
|
||||
def forward(self, audio, anchors):
|
||||
assert len(audio.shape) == 2 and audio.shape[
|
||||
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
|
||||
assert len(
|
||||
anchors.shape
|
||||
) == 3 and anchors.shape[0] == 1 and anchors.shape[
|
||||
1] == 2, 'modelscope error: the shape of input anchors to model needs to be [1, 2, D]'
|
||||
# audio shape: [1, T]
|
||||
feature = self.__extract_feature(audio)
|
||||
frame_state = self.encoder(feature)
|
||||
output = self.backend(frame_state, anchors)
|
||||
output = output.squeeze(0).detach().cpu().sigmoid()
|
||||
|
||||
time_scale_factor = int(np.ceil(feature.shape[1] / output.shape[0]))
|
||||
output = output.unsqueeze(1).expand(-1, time_scale_factor,
|
||||
-1).reshape(-1, output.shape[-1])
|
||||
return output
|
||||
|
||||
def __extract_feature(self, audio):
|
||||
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
|
||||
feature = feature - feature.mean(dim=0, keepdim=True)
|
||||
feature = feature.unsqueeze(0)
|
||||
return feature
|
||||
|
||||
def __load_check_point(self,
|
||||
pretrained_encoder,
|
||||
pretrained_backend,
|
||||
device=None):
|
||||
if not device:
|
||||
device = torch.device('cpu')
|
||||
self.encoder.load_state_dict(
|
||||
torch.load(
|
||||
os.path.join(self.model_dir, pretrained_encoder),
|
||||
map_location=device))
|
||||
|
||||
self.backend.load_state_dict(
|
||||
torch.load(
|
||||
os.path.join(self.model_dir, pretrained_backend),
|
||||
map_location=device))
|
||||
@@ -17,11 +17,9 @@ from kantts.train.trainer import GAN_Trainer, Sambert_Trainer, distributed_init
|
||||
from kantts.utils.ling_unit.ling_unit import KanTtsLinguisticUnit
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from modelscope import __version__
|
||||
from modelscope.utils.audio.audio_utils import TtsCustomParams
|
||||
from modelscope.utils.audio.tts_exceptions import (
|
||||
TtsModelConfigurationException, TtsModelNotExistsException)
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
@@ -394,6 +392,7 @@ class Voice:
|
||||
logger.info(f'TRAINING steps: {train_max_steps}')
|
||||
config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
|
||||
time.localtime())
|
||||
from modelscope import __version__
|
||||
config['modelscope_version'] = __version__
|
||||
|
||||
with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
|
||||
@@ -558,6 +557,7 @@ class Voice:
|
||||
logger.info(f'resume from: {resume_from}')
|
||||
config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
|
||||
time.localtime())
|
||||
from modelscope import __version__
|
||||
config['modelscope_version'] = __version__
|
||||
|
||||
with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
|
||||
|
||||
@@ -4,9 +4,8 @@
|
||||
from . import (action_recognition, animal_recognition, bad_image_detecting,
|
||||
body_2d_keypoints, body_3d_keypoints, cartoon,
|
||||
cmdssl_video_embedding, controllable_image_generation,
|
||||
crowd_counting, face_2d_keypoints, face_detection,
|
||||
face_generation, face_reconstruction, human_reconstruction,
|
||||
human_wholebody_keypoint, image_classification,
|
||||
crowd_counting, face_detection, face_generation,
|
||||
face_reconstruction, human_reconstruction, image_classification,
|
||||
image_color_enhance, image_colorization, image_defrcn_fewshot,
|
||||
image_denoise, image_inpainting, image_instance_segmentation,
|
||||
image_matching, image_mvs_depth_estimation,
|
||||
|
||||
@@ -72,7 +72,7 @@ class PoseHighResolutionNetV2(TorchModel):
|
||||
self.stage4, pre_stage_channels = self._make_stage(
|
||||
self.stage4_cfg, num_channels, multi_scale_output=True)
|
||||
"""final four layers"""
|
||||
last_inp_channels = np.int(np.sum(pre_stage_channels))
|
||||
last_inp_channels = int(np.sum(pre_stage_channels))
|
||||
self.final_layer = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_channels=last_inp_channels,
|
||||
|
||||
@@ -81,7 +81,7 @@ class FaceLandmark:
|
||||
bbox[2] = center[0] + one_edge // 2
|
||||
bbox[3] = center[1] + one_edge // 2
|
||||
|
||||
bbox = bbox.astype(np.int)
|
||||
bbox = bbox.astype(int)
|
||||
crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
|
||||
h, w, _ = crop_image.shape
|
||||
crop_image = cv2.resize(
|
||||
|
||||
@@ -356,7 +356,7 @@ class HighResolutionNet(nn.Module):
|
||||
num_channels)
|
||||
self.stage3, pre_stage_channels = self._make_stage(
|
||||
self.stage3_cfg, num_channels)
|
||||
last_inp_channels = np.int(np.sum(pre_stage_channels)) + 256
|
||||
last_inp_channels = int(np.sum(pre_stage_channels)) + 256
|
||||
self.redc_layer = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_channels=last_inp_channels,
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.base import BaseModel
|
||||
from easycv.utils.ms_utils import EasyCVMeta
|
||||
|
||||
from modelscope.models.base import TorchModel
|
||||
|
||||
|
||||
class EasyCVBaseModel(BaseModel, TorchModel):
|
||||
"""Base model for EasyCV."""
|
||||
|
||||
def __init__(self, model_dir=None, args=(), kwargs={}):
|
||||
kwargs.pop(EasyCVMeta.ARCH, None) # pop useless keys
|
||||
BaseModel.__init__(self)
|
||||
TorchModel.__init__(self, model_dir=model_dir)
|
||||
|
||||
def forward(self, img, mode='train', **kwargs):
|
||||
if self.training:
|
||||
losses = self.forward_train(img, **kwargs)
|
||||
loss, log_vars = self._parse_losses(losses)
|
||||
return dict(loss=loss, log_vars=log_vars)
|
||||
else:
|
||||
return self.forward_test(img, **kwargs)
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.forward(*args, **kwargs)
|
||||
@@ -1,20 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .face_2d_keypoints_align import Face2DKeypoints
|
||||
|
||||
else:
|
||||
_import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}
|
||||
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyImportModule(
|
||||
__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
extra_objects={},
|
||||
)
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.face.face_keypoint import FaceKeypoint
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
|
||||
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
FaceKeypoint.__init__(self, *args, **kwargs)
|
||||
@@ -82,7 +82,7 @@ class FaceLandmark:
|
||||
bbox[2] = center[0] + one_edge // 2
|
||||
bbox[3] = center[1] + one_edge // 2
|
||||
|
||||
bbox = bbox.astype(np.int)
|
||||
bbox = bbox.astype(int)
|
||||
crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
|
||||
h, w, _ = crop_image.shape
|
||||
crop_image = cv2.resize(crop_image,
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .hand_2d_keypoints import Hand2dKeyPoints
|
||||
|
||||
else:
|
||||
_import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}
|
||||
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyImportModule(
|
||||
__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
extra_objects={},
|
||||
)
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.pose import TopDown
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
|
||||
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
TopDown.__init__(self, *args, **kwargs)
|
||||
@@ -31,7 +31,7 @@ class human_segmenter(object):
|
||||
img = np.dstack((img, img, img))
|
||||
elif img.shape[2] == 4:
|
||||
img = img[:, :, :3]
|
||||
img = img.astype(np.float)
|
||||
img = img.astype(float)
|
||||
return img
|
||||
|
||||
def run(self, img):
|
||||
|
||||
@@ -69,8 +69,8 @@ def eval_grid(coords,
|
||||
num_samples=512 * 512 * 512):
|
||||
resolution = coords.shape[1:4]
|
||||
sdf = np.zeros(resolution)
|
||||
dirty = np.ones(resolution, dtype=np.bool)
|
||||
grid_mask = np.zeros(resolution, dtype=np.bool)
|
||||
dirty = np.ones(resolution, dtype=bool)
|
||||
grid_mask = np.zeros(resolution, dtype=bool)
|
||||
reso = resolution[0] // init_resolution
|
||||
|
||||
while reso > 0:
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.pose.top_down import TopDown
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.human_wholebody_keypoint,
|
||||
module_name=Models.human_wholebody_keypoint)
|
||||
class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
TopDown.__init__(self, *args, **kwargs)
|
||||
@@ -163,7 +163,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
|
||||
os.path.join(split_dir,
|
||||
'box_{}shot_{}_train.txt'.format(shot,
|
||||
cls))) as f:
|
||||
fileids_ = np.loadtxt(f, dtype=np.str).tolist()
|
||||
fileids_ = np.loadtxt(f, dtype=np.str_).tolist()
|
||||
if isinstance(fileids_, str):
|
||||
fileids_ = [fileids_]
|
||||
fileids_ = [
|
||||
@@ -219,7 +219,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
|
||||
with PathManager.open(
|
||||
os.path.join(root, dirname, 'ImageSets', 'Main',
|
||||
split + '.txt')) as f:
|
||||
fileids = np.loadtxt(f, dtype=np.str)
|
||||
fileids = np.loadtxt(f, dtype=np.str_)
|
||||
|
||||
for fileid in fileids:
|
||||
anno_file = os.path.join(root, dirname, 'Annotations',
|
||||
|
||||
@@ -8,10 +8,12 @@ if TYPE_CHECKING:
|
||||
from .maskdino_swin import MaskDINOSwin
|
||||
from .model import CascadeMaskRCNNSwinModel
|
||||
from .maskdino_model import MaskDINOSwinModel
|
||||
from .fastinst_model import FastInst
|
||||
from .postprocess_utils import get_img_ins_seg_result, get_maskdino_ins_seg_result
|
||||
else:
|
||||
_import_structure = {
|
||||
'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
|
||||
'fastinst_model': ['FastInst'],
|
||||
'maskdino_swin': ['MaskDINOSwin'],
|
||||
'model': ['CascadeMaskRCNNSwinModel'],
|
||||
'maskdino_model': ['MaskDINOSwinModel'],
|
||||
|
||||
@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
|
||||
if TYPE_CHECKING:
|
||||
from .swin_transformer import SwinTransformer
|
||||
from .swin_transformer import D2SwinTransformer
|
||||
from .resnet import build_resnet_backbone
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
'swin_transformer': ['SwinTransformer', 'D2SwinTransformer'],
|
||||
'resnet': ['build_resnet_backbone']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
# Part of the implementation is borrowed and modified from Detectron2, publicly available at
|
||||
# https://github.com/facebookresearch/detectron2/blob/main/projects/DeepLab/deeplab/resnet.py
|
||||
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from modelscope.models.cv.image_human_parsing.backbone.deeplab_resnet import (
|
||||
BottleneckBlock, DeeplabResNet, get_norm)
|
||||
from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
|
||||
Conv2d
|
||||
|
||||
|
||||
class BasicStem(nn.Module):
|
||||
"""
|
||||
The standard ResNet stem (layers before the first residual block),
|
||||
with a conv, relu and max_pool.
|
||||
"""
|
||||
|
||||
def __init__(self, in_channels=3, out_channels=64, norm='BN'):
|
||||
"""
|
||||
Args:
|
||||
norm (str or callable): norm after the first conv layer.
|
||||
See :func:`layers.get_norm` for supported format.
|
||||
"""
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.stride = 4
|
||||
self.conv1 = Conv2d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=7,
|
||||
stride=2,
|
||||
padding=3,
|
||||
bias=False,
|
||||
norm=get_norm(norm, out_channels),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = F.relu_(x)
|
||||
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
|
||||
return x
|
||||
|
||||
|
||||
def build_resnet_backbone(out_features, depth, num_groups, width_per_group,
|
||||
norm, stem_out_channels, res2_out_channels,
|
||||
stride_in_1x1, res4_dilation, res5_dilation,
|
||||
res5_multi_grid, input_shape):
|
||||
stem = BasicStem(
|
||||
in_channels=input_shape['channels'],
|
||||
out_channels=stem_out_channels,
|
||||
norm=norm)
|
||||
bottleneck_channels = num_groups * width_per_group
|
||||
in_channels = stem_out_channels
|
||||
out_channels = res2_out_channels
|
||||
|
||||
assert res4_dilation in {
|
||||
1, 2
|
||||
}, 'res4_dilation cannot be {}.'.format(res4_dilation)
|
||||
assert res5_dilation in {
|
||||
1, 2, 4
|
||||
}, 'res5_dilation cannot be {}.'.format(res5_dilation)
|
||||
if res4_dilation == 2:
|
||||
# Always dilate res5 if res4 is dilated.
|
||||
assert res5_dilation == 4
|
||||
|
||||
num_blocks_per_stage = {
|
||||
50: [3, 4, 6, 3],
|
||||
101: [3, 4, 23, 3],
|
||||
152: [3, 8, 36, 3]
|
||||
}[depth]
|
||||
|
||||
stages = []
|
||||
out_stage_idx = [{
|
||||
'res2': 2,
|
||||
'res3': 3,
|
||||
'res4': 4,
|
||||
'res5': 5
|
||||
}[f] for f in out_features]
|
||||
max_stage_idx = max(out_stage_idx)
|
||||
for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
|
||||
if stage_idx == 4:
|
||||
dilation = res4_dilation
|
||||
elif stage_idx == 5:
|
||||
dilation = res5_dilation
|
||||
else:
|
||||
dilation = 1
|
||||
first_stride = 1 if idx == 0 or dilation > 1 else 2
|
||||
stride_per_block = [first_stride]
|
||||
stride_per_block += [1] * (num_blocks_per_stage[idx] - 1)
|
||||
stage_kargs = {
|
||||
'num_blocks': num_blocks_per_stage[idx],
|
||||
'stride_per_block': stride_per_block,
|
||||
'in_channels': in_channels,
|
||||
'out_channels': out_channels,
|
||||
'norm': norm,
|
||||
'bottleneck_channels': bottleneck_channels,
|
||||
'stride_in_1x1': stride_in_1x1,
|
||||
'dilation': dilation,
|
||||
'num_groups': num_groups,
|
||||
'block_class': BottleneckBlock
|
||||
}
|
||||
if stage_idx == 5:
|
||||
stage_kargs.pop('dilation')
|
||||
stage_kargs['dilation_per_block'] = [
|
||||
dilation * mg for mg in res5_multi_grid
|
||||
]
|
||||
blocks = DeeplabResNet.make_stage(**stage_kargs)
|
||||
in_channels = out_channels
|
||||
out_channels *= 2
|
||||
bottleneck_channels *= 2
|
||||
stages.append(blocks)
|
||||
return DeeplabResNet(stem, stages, out_features=out_features)
|
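Illustrative call only: every argument value below is made up for the sketch; the real values come from the FastInst model configuration, not from this file:

backbone = build_resnet_backbone(
    out_features=['res3', 'res4', 'res5'],   # hypothetical choice of output stages
    depth=50,
    num_groups=1,
    width_per_group=64,
    norm='BN',
    stem_out_channels=64,
    res2_out_channels=256,
    stride_in_1x1=False,
    res4_dilation=1,
    res5_dilation=1,
    res5_multi_grid=[1, 2, 4],
    input_shape={'channels': 3})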
||||
@@ -0,0 +1 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
@@ -0,0 +1,351 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import math
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from modelscope.models.cv.image_colorization.ddcolor.utils.transformer_utils import (
|
||||
MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer)
|
||||
|
||||
|
||||
class QueryProposal(nn.Module):
|
||||
|
||||
def __init__(self, num_features, num_queries, num_classes):
|
||||
super().__init__()
|
||||
self.topk = num_queries
|
||||
self.num_classes = num_classes
|
||||
|
||||
self.conv_proposal_cls_logits = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
num_features, num_features, kernel_size=3, stride=1,
|
||||
padding=1),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(
|
||||
num_features,
|
||||
num_classes + 1,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0),
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def compute_coordinates(self, x):
|
||||
h, w = x.size(2), x.size(3)
|
||||
y_loc = torch.linspace(0, 1, h, device=x.device)
|
||||
x_loc = torch.linspace(0, 1, w, device=x.device)
|
||||
y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
|
||||
locations = torch.stack([x_loc, y_loc], 0).unsqueeze(0)
|
||||
return locations
|
||||
|
||||
def seek_local_maximum(self, x, epsilon=1e-6):
|
||||
"""
|
||||
inputs:
|
||||
x: torch.tensor, shape [b, c, h, w]
|
||||
return:
|
||||
torch.tensor, shape [b, c, h, w]
|
||||
"""
|
||||
x_pad = F.pad(x, (1, 1, 1, 1), 'constant', 0)
|
||||
# top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
|
||||
maximum = (x >= x_pad[:, :, :-2, 1:-1]) & \
|
||||
(x >= x_pad[:, :, 2:, 1:-1]) & \
|
||||
(x >= x_pad[:, :, 1:-1, :-2]) & \
|
||||
(x >= x_pad[:, :, 1:-1, 2:]) & \
|
||||
(x >= x_pad[:, :, :-2, :-2]) & \
|
||||
(x >= x_pad[:, :, :-2, 2:]) & \
|
||||
(x >= x_pad[:, :, 2:, :-2]) & \
|
||||
(x >= x_pad[:, :, 2:, 2:]) & \
|
||||
(x >= epsilon)
|
||||
return maximum.to(x)
|
||||
|
||||
def forward(self, x, pos_embeddings):
|
||||
|
||||
proposal_cls_logits = self.conv_proposal_cls_logits(x) # b, c, h, w
|
||||
proposal_cls_probs = proposal_cls_logits.softmax(dim=1) # b, c, h, w
|
||||
proposal_cls_one_hot = F.one_hot(
|
||||
proposal_cls_probs[:, :-1, :, :].max(1)[1],
|
||||
num_classes=self.num_classes + 1).permute(0, 3, 1, 2) # b, c, h, w
|
||||
proposal_cls_probs = proposal_cls_probs.mul(proposal_cls_one_hot)
|
||||
proposal_local_maximum_map = self.seek_local_maximum(
|
||||
proposal_cls_probs) # b, c, h, w
|
||||
proposal_cls_probs = proposal_cls_probs + proposal_local_maximum_map # b, c, h, w
|
||||
|
||||
# top-k indices
|
||||
topk_indices = torch.topk(
|
||||
proposal_cls_probs[:, :-1, :, :].flatten(2).max(1)[0],
|
||||
self.topk,
|
||||
dim=1)[1] # b, q
|
||||
topk_indices = topk_indices.unsqueeze(1) # b, 1, q
|
||||
|
||||
# topk queries
|
||||
topk_proposals = torch.gather(
|
||||
x.flatten(2), dim=2, index=topk_indices.repeat(1, x.shape[1],
|
||||
1)) # b, c, q
|
||||
pos_embeddings = pos_embeddings.repeat(x.shape[0], 1, 1, 1).flatten(2)
|
||||
topk_pos_embeddings = torch.gather(
|
||||
pos_embeddings,
|
||||
dim=2,
|
||||
index=topk_indices.repeat(1, pos_embeddings.shape[1],
|
||||
1)) # b, c, q
|
||||
if self.training:
|
||||
locations = self.compute_coordinates(x).repeat(x.shape[0], 1, 1, 1)
|
||||
topk_locations = torch.gather(
|
||||
locations.flatten(2),
|
||||
dim=2,
|
||||
index=topk_indices.repeat(1, locations.shape[1], 1))
|
||||
topk_locations = topk_locations.transpose(-1, -2) # b, q, 2
|
||||
else:
|
||||
topk_locations = None
|
||||
return topk_proposals, topk_pos_embeddings, topk_locations, proposal_cls_logits
|
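Toy illustration of seek_local_maximum; the constructor sizes are arbitrary and only the local-maximum mask is exercised:

import torch

qp = QueryProposal(num_features=8, num_queries=4, num_classes=3)
prob = torch.zeros(1, 1, 5, 5)
prob[0, 0, 2, 2] = 0.9
mask = qp.seek_local_maximum(prob)   # 1.0 only where a value is >= its 8 neighbours and >= epsilon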
||||
|
||||
|
||||
class FastInstDecoder(nn.Module):
|
||||
|
||||
def __init__(self, in_channels, *, num_classes: int, hidden_dim: int,
|
||||
num_queries: int, num_aux_queries: int, nheads: int,
|
||||
dim_feedforward: int, dec_layers: int, pre_norm: bool,
|
||||
mask_dim: int):
|
||||
"""
|
||||
Args:
|
||||
in_channels: channels of the input features
|
||||
num_classes: number of classes
|
||||
hidden_dim: Transformer feature dimension
|
||||
num_queries: number of queries
|
||||
num_aux_queries: number of auxiliary queries
|
||||
nheads: number of heads
|
||||
dim_feedforward: feature dimension in feedforward network
|
||||
dec_layers: number of Transformer decoder layers
|
||||
pre_norm: whether to use pre-LayerNorm or not
|
||||
mask_dim: mask feature dimension
|
||||
"""
|
||||
super().__init__()
|
||||
self.num_heads = nheads
|
||||
self.num_layers = dec_layers
|
||||
self.num_queries = num_queries
|
||||
self.num_aux_queries = num_aux_queries
|
||||
self.num_classes = num_classes
|
||||
|
||||
meta_pos_size = int(round(math.sqrt(self.num_queries)))
|
||||
self.meta_pos_embed = nn.Parameter(
|
||||
torch.empty(1, hidden_dim, meta_pos_size, meta_pos_size))
|
||||
if num_aux_queries > 0:
|
||||
self.empty_query_features = nn.Embedding(num_aux_queries,
|
||||
hidden_dim)
|
||||
self.empty_query_pos_embed = nn.Embedding(num_aux_queries,
|
||||
hidden_dim)
|
||||
|
||||
self.query_proposal = QueryProposal(hidden_dim, num_queries,
|
||||
num_classes)
|
||||
|
||||
self.transformer_query_cross_attention_layers = nn.ModuleList()
|
||||
self.transformer_query_self_attention_layers = nn.ModuleList()
|
||||
self.transformer_query_ffn_layers = nn.ModuleList()
|
||||
self.transformer_mask_cross_attention_layers = nn.ModuleList()
|
||||
self.transformer_mask_ffn_layers = nn.ModuleList()
|
||||
for idx in range(self.num_layers):
|
||||
self.transformer_query_cross_attention_layers.append(
|
||||
CrossAttentionLayer(
|
||||
d_model=hidden_dim,
|
||||
nhead=nheads,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_query_self_attention_layers.append(
|
||||
SelfAttentionLayer(
|
||||
d_model=hidden_dim,
|
||||
nhead=nheads,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_query_ffn_layers.append(
|
||||
FFNLayer(
|
||||
d_model=hidden_dim,
|
||||
dim_feedforward=dim_feedforward,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_mask_cross_attention_layers.append(
|
||||
CrossAttentionLayer(
|
||||
d_model=hidden_dim,
|
||||
nhead=nheads,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_mask_ffn_layers.append(
|
||||
FFNLayer(
|
||||
d_model=hidden_dim,
|
||||
dim_feedforward=dim_feedforward,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
|
||||
self.decoder_query_norm_layers = nn.ModuleList()
|
||||
self.class_embed_layers = nn.ModuleList()
|
||||
self.mask_embed_layers = nn.ModuleList()
|
||||
self.mask_features_layers = nn.ModuleList()
|
||||
for idx in range(self.num_layers + 1):
|
||||
self.decoder_query_norm_layers.append(nn.LayerNorm(hidden_dim))
|
||||
self.class_embed_layers.append(
|
||||
MLP(hidden_dim, hidden_dim, num_classes + 1, 3))
|
||||
self.mask_embed_layers.append(
|
||||
MLP(hidden_dim, hidden_dim, mask_dim, 3))
|
||||
self.mask_features_layers.append(nn.Linear(hidden_dim, mask_dim))
|
||||
|
||||
def forward(self, x, mask_features, targets=None):
|
||||
bs = x[0].shape[0]
|
||||
proposal_size = x[1].shape[-2:]
|
||||
pixel_feature_size = x[2].shape[-2:]
|
||||
|
||||
pixel_pos_embeds = F.interpolate(
|
||||
self.meta_pos_embed,
|
||||
size=pixel_feature_size,
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
proposal_pos_embeds = F.interpolate(
|
||||
self.meta_pos_embed,
|
||||
size=proposal_size,
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
|
||||
pixel_features = x[2].flatten(2).permute(2, 0, 1)
|
||||
pixel_pos_embeds = pixel_pos_embeds.flatten(2).permute(2, 0, 1)
|
||||
|
||||
query_features, query_pos_embeds, query_locations, proposal_cls_logits = self.query_proposal(
|
||||
x[1], proposal_pos_embeds)
|
||||
query_features = query_features.permute(2, 0, 1)
|
||||
query_pos_embeds = query_pos_embeds.permute(2, 0, 1)
|
||||
if self.num_aux_queries > 0:
|
||||
aux_query_features = self.empty_query_features.weight.unsqueeze(
|
||||
1).repeat(1, bs, 1)
|
||||
aux_query_pos_embed = self.empty_query_pos_embed.weight.unsqueeze(
|
||||
1).repeat(1, bs, 1)
|
||||
query_features = torch.cat([query_features, aux_query_features],
|
||||
dim=0)
|
||||
query_pos_embeds = torch.cat(
|
||||
[query_pos_embeds, aux_query_pos_embed], dim=0)
|
||||
|
||||
outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
-1,
|
||||
return_attn_mask=True)
|
||||
predictions_class = [outputs_class]
|
||||
predictions_mask = [outputs_mask]
|
||||
predictions_matching_index = [None]
|
||||
query_feature_memory = [query_features]
|
||||
pixel_feature_memory = [pixel_features]
|
||||
|
||||
for i in range(self.num_layers):
|
||||
query_features, pixel_features = self.forward_one_layer(
|
||||
query_features, pixel_features, query_pos_embeds,
|
||||
pixel_pos_embeds, attn_mask, i)
|
||||
if i < self.num_layers - 1:
|
||||
outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
i,
|
||||
return_attn_mask=True,
|
||||
)
|
||||
else:
|
||||
outputs_class, outputs_mask, _, matching_indices, gt_attn_mask = self.forward_prediction_heads(
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
i,
|
||||
)
|
||||
predictions_class.append(outputs_class)
|
||||
predictions_mask.append(outputs_mask)
|
||||
predictions_matching_index.append(None)
|
||||
query_feature_memory.append(query_features)
|
||||
pixel_feature_memory.append(pixel_features)
|
||||
|
||||
out = {
|
||||
'proposal_cls_logits':
|
||||
proposal_cls_logits,
|
||||
'query_locations':
|
||||
query_locations,
|
||||
'pred_logits':
|
||||
predictions_class[-1],
|
||||
'pred_masks':
|
||||
predictions_mask[-1],
|
||||
'pred_indices':
|
||||
predictions_matching_index[-1],
|
||||
'aux_outputs':
|
||||
self._set_aux_loss(predictions_class, predictions_mask,
|
||||
predictions_matching_index, query_locations)
|
||||
}
|
||||
return out
|
||||
|
||||
def forward_one_layer(self, query_features, pixel_features,
|
||||
query_pos_embeds, pixel_pos_embeds, attn_mask, i):
|
||||
pixel_features = self.transformer_mask_cross_attention_layers[i](
|
||||
pixel_features,
|
||||
query_features,
|
||||
query_pos=pixel_pos_embeds,
|
||||
pos=query_pos_embeds)
|
||||
pixel_features = self.transformer_mask_ffn_layers[i](pixel_features)
|
||||
|
||||
query_features = self.transformer_query_cross_attention_layers[i](
|
||||
query_features,
|
||||
pixel_features,
|
||||
memory_mask=attn_mask,
|
||||
query_pos=query_pos_embeds,
|
||||
pos=pixel_pos_embeds)
|
||||
query_features = self.transformer_query_self_attention_layers[i](
|
||||
query_features, query_pos=query_pos_embeds)
|
||||
query_features = self.transformer_query_ffn_layers[i](query_features)
|
||||
return query_features, pixel_features
|
||||
|
||||
def forward_prediction_heads(self,
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
idx_layer,
|
||||
return_attn_mask=False,
|
||||
return_gt_attn_mask=False,
|
||||
targets=None,
|
||||
query_locations=None):
|
||||
decoder_query_features = self.decoder_query_norm_layers[idx_layer + 1](
|
||||
query_features[:self.num_queries])
|
||||
decoder_query_features = decoder_query_features.transpose(0, 1)
|
||||
if idx_layer + 1 == self.num_layers:
|
||||
outputs_class = self.class_embed_layers[idx_layer + 1](
|
||||
decoder_query_features)
|
||||
else:
|
||||
outputs_class = None
|
||||
outputs_mask_embed = self.mask_embed_layers[idx_layer + 1](
|
||||
decoder_query_features)
|
||||
outputs_mask_features = self.mask_features_layers[idx_layer + 1](
|
||||
pixel_features.transpose(0, 1))
|
||||
|
||||
outputs_mask = torch.einsum('bqc,blc->bql', outputs_mask_embed,
|
||||
outputs_mask_features)
|
||||
outputs_mask = outputs_mask.reshape(-1, self.num_queries,
|
||||
*pixel_feature_size)
|
||||
|
||||
if return_attn_mask:
|
||||
# outputs_mask.shape: b, q, h, w
|
||||
attn_mask = F.pad(outputs_mask,
|
||||
(0, 0, 0, 0, 0, self.num_aux_queries),
|
||||
'constant', 1)
|
||||
attn_mask = (attn_mask < 0.).flatten(2) # b, q, hw
|
||||
invalid_query = attn_mask.all(-1, keepdim=True) # b, q, 1
|
||||
attn_mask = (~invalid_query) & attn_mask # b, q, hw
|
||||
attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1,
|
||||
1).flatten(0, 1)
|
||||
attn_mask = attn_mask.detach()
|
||||
else:
|
||||
attn_mask = None
|
||||
|
||||
matching_indices = None
|
||||
gt_attn_mask = None
|
||||
|
||||
return outputs_class, outputs_mask, attn_mask, matching_indices, gt_attn_mask
|
||||
|
||||
@torch.jit.unused
|
||||
def _set_aux_loss(self, outputs_class, outputs_seg_masks, output_indices,
|
||||
output_query_locations):
|
||||
return [{
|
||||
'query_locations': output_query_locations,
|
||||
'pred_logits': a,
|
||||
'pred_masks': b,
|
||||
'pred_matching_indices': c
|
||||
} for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1],
|
||||
output_indices[:-1])]
|
||||
@@ -0,0 +1,180 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import logging
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
|
||||
Conv2d
|
||||
|
||||
|
||||
# This is a modified FPN decoder.
|
||||
class BaseFPN(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_shape,
|
||||
*,
|
||||
convs_dim: int,
|
||||
mask_dim: int,
|
||||
norm: Optional[Union[str, Callable]] = None,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
input_shape: shapes (channels and stride) of the input features
|
||||
convs_dim: number of output channels for the intermediate conv layers.
|
||||
mask_dim: number of output channels for the final conv layer.
|
||||
norm (str or callable): normalization for all conv layers
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride'])
|
||||
self.in_features = [k for k, v in input_shape
|
||||
] # starting from "res3" to "res5"
|
||||
feature_channels = [v['channels'] for k, v in input_shape]
|
||||
|
||||
lateral_convs = []
|
||||
output_convs = []
|
||||
|
||||
use_bias = norm == ''
|
||||
for idx, in_channels in enumerate(feature_channels):
|
||||
lateral_norm = nn.GroupNorm(32, convs_dim)
|
||||
output_norm = nn.GroupNorm(32, convs_dim)
|
||||
|
||||
lateral_conv = Conv2d(
|
||||
in_channels,
|
||||
convs_dim,
|
||||
kernel_size=1,
|
||||
bias=use_bias,
|
||||
norm=lateral_norm)
|
||||
output_conv = Conv2d(
|
||||
convs_dim,
|
||||
convs_dim,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
bias=use_bias,
|
||||
norm=output_norm,
|
||||
activation=F.relu,
|
||||
)
|
||||
self.add_module('adapter_{}'.format(idx + 1), lateral_conv)
|
||||
self.add_module('layer_{}'.format(idx + 1), output_conv)
|
||||
|
||||
lateral_convs.append(lateral_conv)
|
||||
output_convs.append(output_conv)
|
||||
# Place convs into top-down order (from low to high resolution)
|
||||
# to make the top-down computation in forward clearer.
|
||||
self.lateral_convs = lateral_convs[::-1]
|
||||
self.output_convs = output_convs[::-1]
|
||||
|
||||
self.convs_dim = convs_dim
|
||||
self.num_feature_levels = 3 # always use 3 scales
|
||||
|
||||
def forward_features(self, features):
|
||||
multi_scale_features = []
|
||||
num_cur_levels = 0
|
||||
# Reverse feature maps into top-down order (from low to high resolution)
|
||||
for idx, f in enumerate(self.in_features[::-1]):
|
||||
x = features[f]
|
||||
lateral_conv = self.lateral_convs[idx]
|
||||
output_conv = self.output_convs[idx]
|
||||
if idx == 0:
|
||||
y = lateral_conv(x)
|
||||
else:
|
||||
cur_fpn = lateral_conv(x)
|
||||
y = cur_fpn + F.interpolate(
|
||||
y,
|
||||
size=cur_fpn.shape[-2:],
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
y = output_conv(y)
|
||||
|
||||
if num_cur_levels < self.num_feature_levels:
|
||||
multi_scale_features.append(y)
|
||||
num_cur_levels += 1
|
||||
return None, multi_scale_features
|
||||
|
||||
def forward(self, features, targets=None):
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
'Calling forward() may cause unpredicted behavior of PixelDecoder module.'
|
||||
)
|
||||
return self.forward_features(features)
|
||||
|
||||
|
||||
class PyramidPoolingModule(nn.Module):
|
||||
|
||||
def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)):
|
||||
super().__init__()
|
||||
self.stages = []
|
||||
self.stages = nn.ModuleList(
|
||||
[self._make_stage(in_channels, channels, size) for size in sizes])
|
||||
self.bottleneck = Conv2d(in_channels + len(sizes) * channels,
|
||||
in_channels, 1)
|
||||
|
||||
def _make_stage(self, features, out_features, size):
|
||||
prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
|
||||
conv = Conv2d(features, out_features, 1)
|
||||
return nn.Sequential(prior, conv)
|
||||
|
||||
def forward(self, feats):
|
||||
h, w = feats.size(2), feats.size(3)
|
||||
priors = [
|
||||
F.interpolate(
|
||||
input=F.relu_(stage(feats)),
|
||||
size=(h, w),
|
||||
mode='bilinear',
|
||||
align_corners=False) for stage in self.stages
|
||||
] + [feats]
|
||||
out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
|
||||
return out
|
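Shape sketch for the PPM block, assuming the Conv2d wrapper imported above behaves like a plain nn.Conv2d with optional norm/activation:

import torch

ppm = PyramidPoolingModule(in_channels=256, channels=64)
feats = torch.randn(1, 256, 32, 32)
out = ppm(feats)                     # pooled at 1x1/2x2/3x3/6x6, projected, upsampled and fused back to (1, 256, 32, 32)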
||||
|
||||
|
||||
class PyramidPoolingModuleFPN(BaseFPN):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_shape,
|
||||
*,
|
||||
convs_dim: int,
|
||||
mask_dim: int,
|
||||
norm: Optional[Union[str, Callable]] = None,
|
||||
):
|
||||
"""
|
||||
NOTE: this interface is experimental.
|
||||
Args:
|
||||
input_shape: shapes (channels and stride) of the input features
|
||||
convs_dim: number of output channels for the intermediate conv layers.
|
||||
mask_dim: number of output channels for the final conv layer.
|
||||
norm (str or callable): normalization for all conv layers
|
||||
"""
|
||||
super().__init__(
|
||||
input_shape, convs_dim=convs_dim, mask_dim=mask_dim, norm=norm)
|
||||
self.ppm = PyramidPoolingModule(convs_dim, convs_dim // 4)
|
||||
|
||||
def forward_features(self, features):
|
||||
multi_scale_features = []
|
||||
num_cur_levels = 0
|
||||
# Reverse feature maps into top-down order (from low to high resolution)
|
||||
for idx, f in enumerate(self.in_features[::-1]):
|
||||
x = features[f]
|
||||
lateral_conv = self.lateral_convs[idx]
|
||||
output_conv = self.output_convs[idx]
|
||||
if idx == 0:
|
||||
y = self.ppm(lateral_conv(x))
|
||||
else:
|
||||
cur_fpn = lateral_conv(x)
|
||||
y = cur_fpn + F.interpolate(
|
||||
y,
|
||||
size=cur_fpn.shape[-2:],
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
y = output_conv(y)
|
||||
|
||||
if num_cur_levels < self.num_feature_levels:
|
||||
multi_scale_features.append(y)
|
||||
num_cur_levels += 1
|
||||
|
||||
return None, multi_scale_features
|
||||
@@ -0,0 +1,221 @@
|
||||
# Part of the implementation is borrowed and modified from Mask2Former, publicly available at
|
||||
# https://github.com/facebookresearch/Mask2Former.
|
||||
import os
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.image_instance_segmentation.maskdino_swin import \
|
||||
ImageList
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .backbones import build_resnet_backbone
|
||||
from .fastinst.fastinst_decoder import FastInstDecoder
|
||||
from .fastinst.fastinst_encoder import PyramidPoolingModuleFPN
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.image_segmentation, module_name=Models.fastinst)
|
||||
class FastInst(TorchModel):
|
||||
|
||||
def __init__(self,
|
||||
model_dir,
|
||||
backbone=None,
|
||||
encoder=None,
|
||||
decoder=None,
|
||||
pretrained=None,
|
||||
classes=None,
|
||||
**kwargs):
|
||||
"""
|
||||
Deep Learning Technique for Human Parsing: A Survey and Outlook. See https://arxiv.org/abs/2301.00394
|
||||
Args:
|
||||
backbone (dict): backbone config.
|
||||
encoder (dict): encoder config.
|
||||
decoder (dict): decoder config.
|
||||
pretrained (bool): whether to use pretrained model
|
||||
classes (list): class names
|
||||
"""
|
||||
super(FastInst, self).__init__(model_dir, **kwargs)
|
||||
|
||||
self.backbone = build_resnet_backbone(
|
||||
**backbone, input_shape={'channels': 3})
|
||||
in_features = encoder.pop('in_features')
|
||||
input_shape = {
|
||||
k: v
|
||||
for k, v in self.backbone.output_shape().items()
|
||||
if k in in_features
|
||||
}
|
||||
encoder = PyramidPoolingModuleFPN(input_shape=input_shape, **encoder)
|
||||
decoder = FastInstDecoder(in_channels=encoder.convs_dim, **decoder)
|
||||
self.sem_seg_head = FastInstHead(
|
||||
pixel_decoder=encoder, transformer_predictor=decoder)
|
||||
|
||||
self.num_classes = decoder.num_classes
|
||||
self.num_queries = decoder.num_queries
|
||||
self.size_divisibility = 32
|
||||
self.register_buffer(
|
||||
'pixel_mean',
|
||||
torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False)
|
||||
self.register_buffer(
|
||||
'pixel_std',
|
||||
torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False)
|
||||
self.classes = classes
|
||||
self.test_topk_per_image = 100
|
||||
|
||||
if pretrained:
|
||||
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
|
||||
logger.info(f'loading model from {model_path}')
|
||||
weight = torch.load(model_path, map_location='cpu')['model']
|
||||
tgt_weight = self.state_dict()
|
||||
for name in list(weight.keys()):
|
||||
if name in tgt_weight:
|
||||
load_size = weight[name].size()
|
||||
tgt_size = tgt_weight[name].size()
|
||||
mis_match = False
|
||||
if len(load_size) != len(tgt_size):
|
||||
mis_match = True
|
||||
else:
|
||||
for n1, n2 in zip(load_size, tgt_size):
|
||||
if n1 != n2:
|
||||
mis_match = True
|
||||
break
|
||||
if mis_match:
|
||||
logger.info(
|
||||
f'size mismatch for {name} '
|
||||
f'({load_size} -> {tgt_size}), skip loading.')
|
||||
del weight[name]
|
||||
else:
|
||||
logger.info(
|
||||
f'{name} doesn\'t exist in current model, skip loading.'
|
||||
)
|
||||
|
||||
self.load_state_dict(weight, strict=False)
|
||||
logger.info('load model done')
|
||||
|
||||
def forward(self, batched_inputs: List[dict]) -> Dict[str, Any]:
|
||||
images = [x['image'].to(self.device) for x in batched_inputs]
|
||||
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
|
||||
images = ImageList.from_tensors(images, self.size_divisibility)
|
||||
|
||||
features = self.backbone(images.tensor)
|
||||
outputs = self.sem_seg_head(features)
|
||||
|
||||
return dict(
|
||||
outputs=outputs, batched_inputs=batched_inputs, images=images)
|
||||
|
||||
def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
|
||||
outputs = input['outputs']
|
||||
batched_inputs = input['batched_inputs']
|
||||
images = input['images']
|
||||
if self.training:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
mask_cls_results = outputs['pred_logits'] # (B, Q, C+1)
|
||||
mask_pred_results = outputs['pred_masks'] # (B, Q, H, W)
|
||||
# upsample masks
|
||||
mask_pred_results = F.interpolate(
|
||||
mask_pred_results,
|
||||
size=(images.tensor.shape[-2], images.tensor.shape[-1]),
|
||||
mode='bilinear',
|
||||
align_corners=False,
|
||||
)
|
||||
|
||||
del outputs
|
||||
|
||||
processed_results = []
|
||||
for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
|
||||
mask_cls_results, mask_pred_results, batched_inputs,
|
||||
images.image_sizes):
|
||||
height = input_per_image.get('height', image_size[0])
|
||||
width = input_per_image.get('width', image_size[1])
|
||||
processed_results.append({}) # for each image
|
||||
|
||||
mask_pred_result = self.sem_seg_postprocess(
|
||||
mask_pred_result, image_size, height, width)
|
||||
mask_cls_result = mask_cls_result.to(mask_pred_result)
|
||||
|
||||
instance_r = self.instance_inference(mask_cls_result,
|
||||
mask_pred_result)
|
||||
processed_results[-1]['instances'] = instance_r
|
||||
|
||||
return dict(eval_result=processed_results)
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return self.pixel_mean.device
|
||||
|
||||
def sem_seg_postprocess(self, result, img_size, output_height,
|
||||
output_width):
|
||||
result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
|
||||
result = F.interpolate(
|
||||
result,
|
||||
size=(output_height, output_width),
|
||||
mode='bilinear',
|
||||
align_corners=False)[0]
|
||||
return result
|
||||
|
||||
def instance_inference(self, mask_cls, mask_pred):
|
||||
# mask_pred is already processed to have the same shape as original input
|
||||
image_size = mask_pred.shape[-2:]
|
||||
|
||||
# [Q, K]
|
||||
scores = F.softmax(mask_cls, dim=-1)[:, :-1]
|
||||
labels = torch.arange(
|
||||
self.num_classes,
|
||||
device=self.device).unsqueeze(0).repeat(self.num_queries,
|
||||
1).flatten(0, 1)
|
||||
scores_per_image, topk_indices = scores.flatten(0, 1).topk(
|
||||
self.test_topk_per_image, sorted=False)
|
||||
labels_per_image = labels[topk_indices]
|
||||
|
||||
topk_indices = topk_indices // self.num_classes
|
||||
mask_pred = mask_pred[topk_indices]
|
||||
|
||||
result = {'image_size': image_size}
|
||||
# mask (before sigmoid)
|
||||
mask_pred_sigmoid = mask_pred.sigmoid()
|
||||
result['pred_masks'] = (mask_pred_sigmoid > 0.5).float()
|
||||
|
||||
# calculate average mask prob
|
||||
mask_scores_per_image = (mask_pred_sigmoid.flatten(1)
|
||||
* result['pred_masks'].flatten(1)).sum(1) / (
|
||||
result['pred_masks'].flatten(1).sum(1)
|
||||
+ 1e-6)
|
||||
result['scores'] = scores_per_image * mask_scores_per_image
|
||||
result['pred_classes'] = labels_per_image
|
||||
return result
|
||||
|
||||
|
||||
class FastInstHead(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
pixel_decoder: nn.Module,
|
||||
# extra parameters
|
||||
transformer_predictor: nn.Module):
|
||||
"""
|
||||
NOTE: this interface is experimental.
|
||||
Args:
|
||||
pixel_decoder: the pixel decoder module
|
||||
transformer_predictor: the transformer decoder that makes prediction
|
||||
"""
|
||||
super().__init__()
|
||||
self.pixel_decoder = pixel_decoder
|
||||
self.predictor = transformer_predictor
|
||||
|
||||
def forward(self, features, targets=None):
|
||||
return self.layers(features, targets)
|
||||
|
||||
def layers(self, features, targets=None):
|
||||
mask_features, multi_scale_features = self.pixel_decoder.forward_features(
|
||||
features)
|
||||
predictions = self.predictor(multi_scale_features, mask_features,
|
||||
targets)
|
||||
return predictions
|
||||
@@ -108,16 +108,16 @@ def get_img_ins_seg_result(img_seg_result=None,
|
||||
for seg_result in img_seg_result:
|
||||
|
||||
box = [
|
||||
np.int(seg_result[0]),
|
||||
np.int(seg_result[1]),
|
||||
np.int(seg_result[2]),
|
||||
np.int(seg_result[3])
|
||||
int(seg_result[0]),
|
||||
int(seg_result[1]),
|
||||
int(seg_result[2]),
|
||||
int(seg_result[3])
|
||||
]
|
||||
score = np.float(seg_result[4])
|
||||
score = float(seg_result[4])
|
||||
category = seg_result[5]
|
||||
|
||||
mask = np.array(seg_result[6], order='F', dtype='uint8')
|
||||
mask = mask.astype(np.float)
|
||||
mask = mask.astype(float)
|
||||
|
||||
results_dict[OutputKeys.BOXES].append(box)
|
||||
results_dict[OutputKeys.MASKS].append(mask)
|
||||
|
||||
@@ -382,7 +382,7 @@ def processing_single_scene(args):
|
||||
points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1],
|
||||
points3d[p3d_id].xyz[2], 1
|
||||
])
|
||||
zs.append(np.asscalar(transformed[2]))
|
||||
zs.append(transformed[2].item())
|
||||
zs_sorted = sorted(zs)
|
||||
# relaxed depth range
|
||||
max_ratio = 0.1
|
||||
|
||||
@@ -40,7 +40,7 @@ def read_mask(filename):
|
||||
|
||||
# save a binary mask
|
||||
def save_mask(filename, mask):
|
||||
assert mask.dtype == np.bool
|
||||
assert mask.dtype == bool
|
||||
mask = mask.astype(np.uint8) * 255
|
||||
Image.fromarray(mask).save(filename)
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .panseg_model import SwinLPanopticSegmentation
|
||||
from .r50_panseg_model import R50PanopticSegmentation
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from easycv.models.segmentation import Mask2Former
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_segmentation,
|
||||
module_name=Models.r50_panoptic_segmentation)
|
||||
class R50PanopticSegmentation(EasyCVBaseModel, Mask2Former):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
Mask2Former.__init__(self, *args, **kwargs)
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.segmentation import EncoderDecoder
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_segmentation, module_name=Models.segformer)
|
||||
class Segformer(EasyCVBaseModel, EncoderDecoder):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
EncoderDecoder.__init__(self, *args, **kwargs)
|
||||
@@ -60,7 +60,7 @@ class SemanticSegmentation(TorchModel):
|
||||
ids = ids[legal_indices]
|
||||
|
||||
segms = (semantic_result[None] == ids[:, None, None])
|
||||
masks = [it.astype(np.int) for it in segms]
|
||||
masks = [it.astype(int) for it in segms]
|
||||
labels_txt = np.array(self.CLASSES)[ids].tolist()
|
||||
|
||||
results = {
|
||||
|
||||
@@ -458,7 +458,7 @@ class HrnetBackBone(nn.Module):
|
||||
self.stage4, pre_stage_channels = self._make_stage(
|
||||
self.stage4_cfg, num_channels, multi_scale_output=True)
|
||||
|
||||
self.backbone_last_inp_channels = np.int(np.sum(pre_stage_channels))
|
||||
self.backbone_last_inp_channels = int(np.sum(pre_stage_channels))
|
||||
|
||||
def _make_transition_layer(self, num_channels_pre_layer,
|
||||
num_channels_cur_layer):
|
||||
|
||||
@@ -259,7 +259,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
|
||||
num_channels = [64, last_inp_channels]
|
||||
self.stage_super, super_stage_channels = self._make_stage(
|
||||
self.super_dict, num_channels)
|
||||
last_inp_channels = np.int(np.sum(super_stage_channels))
|
||||
last_inp_channels = int(np.sum(super_stage_channels))
|
||||
|
||||
if self.is_contain_aspp:
|
||||
aspp_param = kwargs['aspp']
|
||||
@@ -372,7 +372,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
|
||||
num_channels = [64, ocr_mid_channels]
|
||||
self.stage_super, super_stage_channels = self._make_stage(
|
||||
self.super_dict, num_channels)
|
||||
last_inp_channels = np.int(np.sum(super_stage_channels))
|
||||
last_inp_channels = int(np.sum(super_stage_channels))
|
||||
|
||||
self.cls_head = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
|
||||
@@ -13,7 +13,8 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchvision.transforms as TF
|
||||
from PIL import Image
|
||||
from shotdetect_scenedetect_lgss import shot_detect
|
||||
from shotdetect_scenedetect_lgss import shot_detector
|
||||
from tqdm import tqdm
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base.base_torch_model import TorchModel
|
||||
@@ -60,6 +61,9 @@ class MovieSceneSegmentationModel(TorchModel):
|
||||
self.head_sbd = nn.Linear(hdim, 2)
|
||||
load_param_with_prefix('head_sbd', self.head_sbd, params)
|
||||
|
||||
self.shot_detector = shot_detector()
|
||||
self.shot_detector.init(**self.cfg.preprocessor.shot_detect)
|
||||
|
||||
self.test_transform = TF.Compose([
|
||||
TF.Resize(size=256, interpolation=Image.BICUBIC),
|
||||
TF.CenterCrop(224),
|
||||
@@ -98,29 +102,45 @@ class MovieSceneSegmentationModel(TorchModel):
|
||||
def inference(self, batch):
|
||||
logger.info('Begin scene detect ......')
|
||||
bs = self.cfg.pipeline.batch_size_per_gpu
|
||||
sids = batch['sid']
|
||||
inputs = batch['shot_feat']
|
||||
device = self.crn.attention_mask.device
|
||||
|
||||
shot_num = len(sids)
|
||||
shot_timecode_lst = batch['shot_timecode_lst']
|
||||
shot_idx_lst = batch['shot_idx_lst']
|
||||
|
||||
shot_num = len(shot_timecode_lst)
|
||||
cnt = math.ceil(shot_num / bs)
|
||||
|
||||
infer_sid, infer_pred = [], []
|
||||
infer_pred = []
|
||||
infer_result = {}
|
||||
for i in range(cnt):
|
||||
self.shot_detector.start()
|
||||
|
||||
for i in tqdm(range(cnt)):
|
||||
start = i * bs
|
||||
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
|
||||
input_ = inputs[start:end]
|
||||
sid_ = sids[start:end]
|
||||
input_ = torch.stack(input_)
|
||||
|
||||
batch_shot_idx_lst = shot_idx_lst[start:end]
|
||||
|
||||
shot_start_idx = batch_shot_idx_lst[0][0]
|
||||
shot_end_idx = batch_shot_idx_lst[-1][-1]
|
||||
batch_timecode_lst = {
|
||||
i: shot_timecode_lst[i]
|
||||
for i in range(shot_start_idx, shot_end_idx + 1)
|
||||
}
|
||||
batch_shot_keyf_lst = self.shot_detector.get_frame_img(
|
||||
batch_timecode_lst, shot_start_idx, shot_num)
|
||||
inputs = self.get_batch_input(batch_shot_keyf_lst, shot_start_idx,
|
||||
batch_shot_idx_lst)
|
||||
|
||||
input_ = torch.stack(inputs).to(device)
|
||||
outputs = self.shared_step(input_) # shape [b,2]
|
||||
prob = F.softmax(outputs, dim=1)
|
||||
infer_sid.extend(sid_.cpu().detach().numpy())
|
||||
infer_pred.extend(prob[:, 1].cpu().detach().numpy())
|
||||
infer_result.update({'pred': np.stack(infer_pred)})
|
||||
infer_result.update({'sid': infer_sid})
|
||||
|
||||
assert len(infer_result['sid']) == len(sids)
|
||||
assert len(infer_result['pred']) == len(inputs)
|
||||
infer_result.update({'pred': np.stack(infer_pred)})
|
||||
infer_result.update({'sid': np.arange(shot_num)})
|
||||
|
||||
assert len(infer_result['pred']) == shot_num
|
||||
self.shot_detector.release()
|
||||
return infer_result
|
||||
|
||||
def shared_step(self, inputs):
|
||||
@@ -162,38 +182,48 @@ class MovieSceneSegmentationModel(TorchModel):
|
||||
logger.info('Generate scene .......')
|
||||
|
||||
pred_dict = inputs['feat']
|
||||
shot2keyf = inputs['shot2keyf']
|
||||
thres = self.cfg.pipeline.save_threshold
|
||||
|
||||
anno_dict = get_pred_boundary(pred_dict, thres)
|
||||
scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
|
||||
self.shot2keyf, anno_dict)
|
||||
shot2keyf, anno_dict)
|
||||
if self.cfg.pipeline.save_split_scene:
|
||||
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
|
||||
print(f'Split scene video saved to {re_dir}')
|
||||
return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
|
||||
|
||||
def preprocess(self, inputs):
|
||||
logger.info('Begin shot detect......')
|
||||
shot_keyf_lst, anno, shot2keyf = shot_detect(
|
||||
inputs, **self.cfg.preprocessor.shot_detect)
|
||||
logger.info('Shot detect done!')
|
||||
def get_batch_input(self, shot_keyf_lst, shot_start_idx, shot_idx_lst):
|
||||
|
||||
single_shot_feat, sid = [], []
|
||||
single_shot_feat = []
|
||||
for idx, one_shot in enumerate(shot_keyf_lst):
|
||||
one_shot = [
|
||||
self.test_transform(one_frame) for one_frame in one_shot
|
||||
]
|
||||
one_shot = torch.stack(one_shot, dim=0)
|
||||
single_shot_feat.append(one_shot)
|
||||
sid.append(idx)
|
||||
|
||||
single_shot_feat = torch.stack(single_shot_feat, dim=0)
|
||||
|
||||
shot_feat = []
|
||||
for idx, shot_idx in enumerate(shot_idx_lst):
|
||||
shot_idx_ = shot_idx - shot_start_idx
|
||||
_one_shot = single_shot_feat[shot_idx_]
|
||||
shot_feat.append(_one_shot)
|
||||
|
||||
return shot_feat
|
||||
|
||||
def preprocess(self, inputs):
|
||||
logger.info('Begin shot detect......')
|
||||
shot_timecode_lst, anno, shot2keyf = self.shot_detector.shot_detect(
|
||||
inputs, **self.cfg.preprocessor.shot_detect)
|
||||
logger.info('Shot detect done!')
|
||||
|
||||
shot_idx_lst = []
|
||||
for idx, one_shot in enumerate(anno):
|
||||
shot_idx = int(one_shot['shot_id']) + np.arange(
|
||||
-self.neighbor_size, self.neighbor_size + 1)
|
||||
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])
|
||||
_one_shot = single_shot_feat[shot_idx]
|
||||
shot_feat.append(_one_shot)
|
||||
self.shot2keyf = shot2keyf
|
||||
self.anno = anno
|
||||
return shot_feat, sid
|
||||
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'] - 1)
|
||||
shot_idx_lst.append(shot_idx)
|
||||
|
||||
return shot2keyf, anno, shot_timecode_lst, shot_idx_lst
|
||||
|
||||
@@ -10,11 +10,12 @@ from tqdm import tqdm
|
||||
|
||||
|
||||
def get_pred_boundary(pred_dict, threshold=0.5):
|
||||
pred = pred_dict['pred']
|
||||
pred = pred_dict['pred'].cpu().numpy()
|
||||
sid = pred_dict['sid'].cpu().numpy().astype(np.int32)
|
||||
tmp = (pred > threshold).astype(np.int32)
|
||||
anno_dict = {}
|
||||
for idx in range(len(tmp)):
|
||||
anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])})
|
||||
anno_dict.update({str(sid[idx]).zfill(4): int(tmp[idx])})
|
||||
return anno_dict
|
||||
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ class ObjectSegmenter(object):
|
||||
elif img.shape[2] == 4:
|
||||
img = img[:, :, :3]
|
||||
img = img[:, :, ::-1]
|
||||
img = img.astype(np.float)
|
||||
img = img.astype(float)
|
||||
return img
|
||||
|
||||
def run_mask(self, img):
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.detection.detectors import Detection as _Detection
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_object_detection, module_name=Models.dino)
|
||||
class DINO(EasyCVBaseModel, _Detection):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
_Detection.__init__(self, *args, **kwargs)
|
||||
@@ -1,21 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.detection.detectors import YOLOX as _YOLOX
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_object_detection, module_name=Models.yolox)
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_object_detection,
|
||||
module_name=Models.image_object_detection_auto)
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.domain_specific_object_detection, module_name=Models.yolox)
|
||||
class YOLOX(EasyCVBaseModel, _YOLOX):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
_YOLOX.__init__(self, *args, **kwargs)
|
||||
@@ -30,7 +30,7 @@ def depth2color(depth):
|
||||
if gray == 1:
|
||||
return tuple(colors[-1].tolist())
|
||||
num_rank = len(colors) - 1
|
||||
rank = np.floor(gray * num_rank).astype(np.int)
|
||||
rank = np.floor(gray * num_rank).astype(int)
|
||||
diff = (gray - rank / num_rank) * num_rank
|
||||
tmp = colors[rank + 1] - colors[rank]
|
||||
return tuple((colors[rank] + tmp * diff).tolist())
|
||||
@@ -136,7 +136,7 @@ def plot_result(res_path,
|
||||
l2g = get_lidar2global(infos)
|
||||
corners_lidar = corners_global @ np.linalg.inv(l2g).T
|
||||
corners_lidar = corners_lidar[:, :3]
|
||||
pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=np.bool)
|
||||
pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=bool)
|
||||
scores = [
|
||||
pred_res[rid]['detection_score'] for rid in range(len(pred_res))
|
||||
]
|
||||
@@ -151,7 +151,7 @@ def plot_result(res_path,
|
||||
origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
|
||||
corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
|
||||
axis=0)
|
||||
gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=np.bool)
|
||||
gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=bool)
|
||||
pred_flag = np.concatenate(
|
||||
[pred_flag, np.logical_not(gt_flag)], axis=0)
|
||||
scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
|
||||
@@ -169,7 +169,7 @@ def plot_result(res_path,
|
||||
check_point_in_img(corners_img, img.shape[0], img.shape[1]))
|
||||
valid = valid.reshape(
|
||||
-1, 8) # valid means: d>0 and visible in current view
|
||||
corners_img = corners_img.reshape(-1, 8, 2).astype(np.int)
|
||||
corners_img = corners_img.reshape(-1, 8, 2).astype(int)
|
||||
for aid in range(valid.shape[0]):
|
||||
if scores[aid] < vis_thred and pred_flag[aid]:
|
||||
continue
|
||||
|
||||
@@ -90,8 +90,15 @@ class OCRRecognition(TorchModel):
|
||||
f'recognizer should be either ConvNextViT, CRNN, but got {cfgs.model.recognizer}'
|
||||
)
|
||||
if model_path != '':
|
||||
self.recognizer.load_state_dict(
|
||||
torch.load(model_path, map_location='cpu'))
|
||||
params_pretrained = torch.load(model_path, map_location='cpu')
|
||||
model_dict = self.recognizer.state_dict()
|
||||
# remove prefix for finetuned models
|
||||
check_point = {
|
||||
k.replace('recognizer.', ''): v
|
||||
for k, v in params_pretrained.items()
|
||||
}
|
||||
model_dict.update(check_point)
|
||||
self.recognizer.load_state_dict(model_dict)
|
||||
|
||||
dict_path = os.path.join(model_dir, ModelFile.VOCAB_FILE)
|
||||
self.labelMapping = dict()
|
||||
|
||||
@@ -176,8 +176,7 @@ class OpenVocabularyDetectionViLD(Model):
|
||||
# Filter out invalid rois (nmsed rois)
|
||||
valid_indices = np.where(
|
||||
np.logical_and(
|
||||
np.isin(
|
||||
np.arange(len(roi_scores), dtype=np.int), nmsed_indices),
|
||||
np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
|
||||
np.logical_and(
|
||||
np.logical_not(np.all(roi_boxes == 0., axis=-1)),
|
||||
np.logical_and(roi_scores >= min_rpn_score_thresh,
|
||||
|
||||
@@ -72,7 +72,7 @@ class Cube2Equirec(nn.Module):
|
||||
self.equ_h, 0), 3 * self.equ_w // 8, 1)
|
||||
|
||||
# Prepare ceil mask
|
||||
mask = np.zeros((self.equ_h, self.equ_w // 4), np.bool)
|
||||
mask = np.zeros((self.equ_h, self.equ_w // 4), bool)
|
||||
idx = np.linspace(-np.pi, np.pi, self.equ_w // 4) / 4
|
||||
idx = self.equ_h // 2 - np.round(
|
||||
np.arctan(np.cos(idx)) * self.equ_h / np.pi).astype(int)
|
||||
|
||||
@@ -29,7 +29,7 @@ def load_depth(file):
|
||||
elif file.endswith('png'):
|
||||
depth_png = np.array(load_image(file), dtype=int)
|
||||
assert (np.max(depth_png) > 255), 'Wrong .png depth file'
|
||||
return depth_png.astype(np.float) / 256.
|
||||
return depth_png.astype(float) / 256.
|
||||
else:
|
||||
raise NotImplementedError('Depth extension not supported.')
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ def do_scene_detect(F01_tensor, F10_tensor, img0_tensor, img1_tensor):
|
||||
img_diff = ori_img.float() - ref_img.float()
|
||||
img_diff = torch.abs(img_diff)
|
||||
|
||||
kernel = np.ones([8, 8], np.float) / 64
|
||||
kernel = np.ones([8, 8], float) / 64
|
||||
kernel = torch.FloatTensor(kernel).to(device).unsqueeze(0).unsqueeze(0)
|
||||
diff = F.conv2d(img_diff, kernel, padding=4)
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ def linear_assignment(cost_matrix, thresh):
|
||||
|
||||
|
||||
def ious(atlbrs, btlbrs):
|
||||
ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float)
|
||||
ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=float)
|
||||
if ious.size == 0:
|
||||
return ious
|
||||
|
||||
@@ -60,13 +60,13 @@ def embedding_distance(tracks, detections, metric='cosine'):
|
||||
cost_matrix: np.ndarray
|
||||
"""
|
||||
|
||||
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float)
|
||||
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
|
||||
if cost_matrix.size == 0:
|
||||
return cost_matrix
|
||||
det_features = np.asarray([track.curr_feat for track in detections],
|
||||
dtype=np.float)
|
||||
dtype=float)
|
||||
track_features = np.asarray([track.smooth_feat for track in tracks],
|
||||
dtype=np.float)
|
||||
dtype=float)
|
||||
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
|
||||
return cost_matrix
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ class STrack(BaseTrack):
|
||||
def __init__(self, tlwh, score, temp_feat, buffer_size=30):
|
||||
|
||||
# wait activate
|
||||
self._tlwh = np.asarray(tlwh, dtype=np.float)
|
||||
self._tlwh = np.asarray(tlwh, dtype=float)
|
||||
self.kalman_filter = None
|
||||
self.mean, self.covariance = None, None
|
||||
self.is_activated = False
|
||||
|
||||
@@ -20,6 +20,8 @@ if TYPE_CHECKING:
|
||||
from .vldoc import VLDocForDocVLEmbedding
|
||||
from .video_synthesis import TextToVideoSynthesis
|
||||
from .efficient_diffusion_tuning import EfficientStableDiffusion
|
||||
from .mplug_owl import MplugOwlForConditionalGeneration
|
||||
from .clip_interrogator import CLIP_Interrogator
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
@@ -37,7 +39,9 @@ else:
|
||||
['MultiStageDiffusionForTextToImageSynthesis'],
|
||||
'vldoc': ['VLDocForDocVLEmbedding'],
|
||||
'video_synthesis': ['TextToVideoSynthesis'],
|
||||
'efficient_diffusion_tuning': ['EfficientStableDiffusion']
|
||||
'efficient_diffusion_tuning': ['EfficientStableDiffusion'],
|
||||
'mplug_owl': ['MplugOwlForConditionalGeneration'],
|
||||
'clip_interrogator': ['CLIP_Interrogator'],
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
from .model import CLIP_Interrogator
|
||||
modelscope/models/multi_modal/clip_interrogator/model.py (new file, 599 lines)
@@ -0,0 +1,599 @@
|
||||
# This implementation is adapted from CLIP-Interrogator, made publicly available under the MIT License at
|
||||
# https://github.com/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator/clip_interrogator.py
|
||||
|
||||
import hashlib
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import open_clip
|
||||
import requests
|
||||
import torch
|
||||
import torchvision.transforms as transforms
|
||||
from PIL import Image
|
||||
from safetensors.numpy import load_file, save_file
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoProcessor,
|
||||
Blip2ForConditionalGeneration,
|
||||
BlipForConditionalGeneration)
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.preprocessors import LoadImage
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
__all__ = ['CLIP_Interrogator']
|
||||
|
||||
CAPTION_MODELS = {
|
||||
'blip-base': 'blip-image-captioning-base',
|
||||
'blip-large': 'blip-image-captioning-large',
|
||||
'blip2-2.7b': 'blip2-opt-2.7b',
|
||||
'blip2-flan-t5-xl': 'blip2-flan-t5-xl',
|
||||
'git-large-coco': 'git-large-coco',
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
# models can optionally be passed in directly
|
||||
caption_model = None
|
||||
caption_processor = None
|
||||
clip_model = None
|
||||
clip_preprocess = None
|
||||
|
||||
# blip settings
|
||||
caption_max_length: int = 32
|
||||
caption_model_name: Optional[
|
||||
str] = 'blip-large' # use a key from CAPTION_MODELS or None
|
||||
caption_offload: bool = False
|
||||
|
||||
# clip settings
|
||||
clip_model_name: str = 'ViT-L-14/openai'
|
||||
clip_model_path: Optional[str] = None
|
||||
clip_offload: bool = False
|
||||
|
||||
# interrogator settings
|
||||
cache_path: str = 'cache' # path to store cached text embeddings
|
||||
download_cache: bool = False # when true, cached embeds are downloaded from huggingface
|
||||
chunk_size: int = 2048 # batch size for CLIP, use smaller for lower VRAM
|
||||
data_path: str = os.path.join(os.path.dirname(__file__), 'data')
|
||||
device: str = ('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
flavor_intermediate_count: int = 2048
|
||||
quiet: bool = False # when quiet progress bars are not shown
|
||||
|
||||
def apply_low_vram_defaults(self):
|
||||
self.caption_model_name = 'blip-base'
|
||||
self.caption_offload = True
|
||||
self.clip_offload = True
|
||||
self.chunk_size = 1024
|
||||
self.flavor_intermediate_count = 1024
|
||||
|
||||
|
||||
# CLIP-Interrogator utilizes CLIP and BLIP to generate rich captions for images.
# CLIP is a zero-shot image classifier which can be used to generate image and text embeddings.
# BLIP is a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks.
# BLIP effectively utilizes the noisy web data by bootstrapping the captions, where
# a captioner generates synthetic captions and a filter removes the noisy ones.
# Please refer to the papers CLIP: Learning Transferable Visual Models From Natural Language Supervision
# https://arxiv.org/abs/2103.00020
# and BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
# https://arxiv.org/abs/2201.12086
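As a rough sketch of that two-stage flow (illustrative only; it assumes the BLIP and CLIP weights are already available under the configured cache and data paths, and 'example.jpg' is a placeholder):

from PIL import Image

cfg = Config(clip_model_name='ViT-L-14/openai')
cfg.apply_low_vram_defaults()            # optional: smaller BLIP, offloading, smaller CLIP batches
ci = Interrogator(cfg)                   # loads the caption model and the CLIP model
image = Image.open('example.jpg').convert('RGB')
print(ci.interrogate_fast(image))        # BLIP caption + top CLIP-ranked modifiers
print(ci.interrogate_negative(image))    # most dissimilar terms, useful as a negative prompt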
|
||||
|
||||
|
||||
class Interrogator():
|
||||
|
||||
def __init__(self, config: Config):
|
||||
self.config = config
|
||||
self.device = config.device
|
||||
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||
self.caption_offloaded = True
|
||||
self.clip_offloaded = True
|
||||
self.load_caption_model()
|
||||
self.load_clip_model()
|
||||
|
||||
def load_caption_model(self):
|
||||
if self.config.caption_model is None and self.config.caption_model_name:
|
||||
if not self.config.quiet:
|
||||
print(
|
||||
f'Loading caption model {self.config.caption_model_name}...'
|
||||
)
|
||||
|
||||
model_path = CAPTION_MODELS[self.config.caption_model_name]
|
||||
if self.config.caption_model_name.startswith('git-'):
|
||||
caption_model = AutoModelForCausalLM.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path),
|
||||
torch_dtype=torch.float32)
|
||||
elif self.config.caption_model_name.startswith('blip2-'):
|
||||
caption_model = Blip2ForConditionalGeneration.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path),
|
||||
torch_dtype=self.dtype)
|
||||
else:
|
||||
caption_model = BlipForConditionalGeneration.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path),
|
||||
torch_dtype=self.dtype)
|
||||
self.caption_processor = AutoProcessor.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path))
|
||||
|
||||
caption_model.eval()
|
||||
if not self.config.caption_offload:
|
||||
caption_model = caption_model.to(self.config.device)
|
||||
self.caption_model = caption_model
|
||||
else:
|
||||
self.caption_model = self.config.caption_model
|
||||
self.caption_processor = self.config.caption_processor
|
||||
|
||||
def load_clip_model(self):
|
||||
start_time = time.time()
|
||||
config = self.config
|
||||
|
||||
clip_model_name, clip_model_pretrained_name = config.clip_model_name.split(
|
||||
'/', 2)
|
||||
|
||||
if config.clip_model is None:
|
||||
if not config.quiet:
|
||||
print(f'Loading CLIP model {config.clip_model_name}...')
|
||||
|
||||
self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
|
||||
clip_model_name,
|
||||
pretrained=clip_model_pretrained_name,
|
||||
precision='fp16' if config.device == 'cuda' else 'fp32',
|
||||
device=config.device,
|
||||
jit=False,
|
||||
cache_dir=config.clip_model_path)
|
||||
self.clip_model.eval()
|
||||
else:
|
||||
self.clip_model = config.clip_model
|
||||
self.clip_preprocess = config.clip_preprocess
|
||||
self.tokenize = open_clip.get_tokenizer(clip_model_name)
|
||||
|
||||
sites = [
|
||||
'Artstation', 'behance', 'cg society', 'cgsociety', 'deviantart',
|
||||
'dribbble', 'flickr', 'instagram', 'pexels', 'pinterest',
|
||||
'pixabay', 'pixiv', 'polycount', 'reddit', 'shutterstock',
|
||||
'tumblr', 'unsplash', 'zbrush central'
|
||||
]
|
||||
trending_list = [site for site in sites]
|
||||
trending_list.extend(['trending on ' + site for site in sites])
|
||||
trending_list.extend(['featured on ' + site for site in sites])
|
||||
trending_list.extend([site + ' contest winner' for site in sites])
|
||||
|
||||
raw_artists = load_list(config.data_path, 'artists.txt')
|
||||
artists = [f'by {a}' for a in raw_artists]
|
||||
artists.extend([f'inspired by {a}' for a in raw_artists])
|
||||
|
||||
self._prepare_clip()
|
||||
self.artists = LabelTable(artists, 'artists', self)
|
||||
self.flavors = LabelTable(
|
||||
load_list(config.data_path, 'flavors.txt'), 'flavors', self)
|
||||
self.mediums = LabelTable(
|
||||
load_list(config.data_path, 'mediums.txt'), 'mediums', self)
|
||||
self.movements = LabelTable(
|
||||
load_list(config.data_path, 'movements.txt'), 'movements', self)
|
||||
self.trendings = LabelTable(trending_list, 'trendings', self)
|
||||
self.negative = LabelTable(
|
||||
load_list(config.data_path, 'negative.txt'), 'negative', self)
|
||||
|
||||
end_time = time.time()
|
||||
if not config.quiet:
|
||||
print(
|
||||
f'Loaded CLIP model and data in {end_time-start_time:.2f} seconds.'
|
||||
)
|
||||
|
||||
def chain(self,
|
||||
image_features: torch.Tensor,
|
||||
phrases: List[str],
|
||||
best_prompt: str = '',
|
||||
best_sim: float = 0,
|
||||
min_count: int = 8,
|
||||
max_count: int = 32,
|
||||
desc='Chaining',
|
||||
reverse: bool = False) -> str:
|
||||
self._prepare_clip()
|
||||
|
||||
phrases = set(phrases)
|
||||
if not best_prompt:
|
||||
best_prompt = self.rank_top(
|
||||
image_features, [f for f in phrases], reverse=reverse)
|
||||
best_sim = self.similarity(image_features, best_prompt)
|
||||
phrases.remove(best_prompt)
|
||||
curr_prompt, curr_sim = best_prompt, best_sim
|
||||
|
||||
def check(addition: str, idx: int) -> bool:
|
||||
nonlocal best_prompt, best_sim, curr_prompt, curr_sim
|
||||
prompt = curr_prompt + ', ' + addition
|
||||
sim = self.similarity(image_features, prompt)
|
||||
if reverse:
|
||||
sim = -sim
|
||||
|
||||
if sim > best_sim:
|
||||
best_prompt, best_sim = prompt, sim
|
||||
if sim > curr_sim or idx < min_count:
|
||||
curr_prompt, curr_sim = prompt, sim
|
||||
return True
|
||||
return False
|
||||
|
||||
for idx in tqdm(
|
||||
range(max_count), desc=desc, disable=self.config.quiet):
|
||||
best = self.rank_top(
|
||||
image_features, [f'{curr_prompt}, {f}' for f in phrases],
|
||||
reverse=reverse)
|
||||
flave = best[len(curr_prompt) + 2:]
|
||||
if not check(flave, idx):
|
||||
break
|
||||
if _prompt_at_max_len(curr_prompt, self.tokenize):
|
||||
break
|
||||
phrases.remove(flave)
|
||||
|
||||
return best_prompt
|
||||
|
||||
def generate_caption(self, pil_image: Image) -> str:
|
||||
assert self.caption_model is not None, 'No caption model loaded.'
|
||||
self._prepare_caption()
|
||||
inputs = self.caption_processor(
|
||||
images=pil_image, return_tensors='pt').to(self.device)
|
||||
if not self.config.caption_model_name.startswith('git-'):
|
||||
inputs = inputs.to(self.dtype)
|
||||
tokens = self.caption_model.generate(
|
||||
**inputs, max_new_tokens=self.config.caption_max_length)
|
||||
return self.caption_processor.batch_decode(
|
||||
tokens, skip_special_tokens=True)[0].strip()
|
||||
|
||||
def image_to_features(self, image: Image) -> torch.Tensor:
|
||||
self._prepare_clip()
|
||||
images = self.clip_preprocess(image).unsqueeze(0).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
image_features = self.clip_model.encode_image(images)
|
||||
image_features /= image_features.norm(dim=-1, keepdim=True)
|
||||
return image_features
|
||||
|
||||
def interrogate_classic(self,
|
||||
image: Image,
|
||||
max_flavors: int = 3,
|
||||
caption: Optional[str] = None) -> str:
|
||||
"""Classic mode creates a prompt in a standard format first describing the image,
|
||||
then listing the artist, trending, movement, and flavor text modifiers."""
|
||||
caption = caption or self.generate_caption(image)
|
||||
image_features = self.image_to_features(image)
|
||||
|
||||
medium = self.mediums.rank(image_features, 1)[0]
|
||||
artist = self.artists.rank(image_features, 1)[0]
|
||||
trending = self.trendings.rank(image_features, 1)[0]
|
||||
movement = self.movements.rank(image_features, 1)[0]
|
||||
flaves = ', '.join(self.flavors.rank(image_features, max_flavors))
|
||||
|
||||
if caption.startswith(medium):
|
||||
prompt = f'{caption} {artist}, {trending}, {movement}, {flaves}'
|
||||
else:
|
||||
prompt = f'{caption}, {medium} {artist}, {trending}, {movement}, {flaves}'
|
||||
|
||||
return _truncate_to_fit(prompt, self.tokenize)
|
||||
|
||||
def interrogate_fast(self,
|
||||
image: Image,
|
||||
max_flavors: int = 32,
|
||||
caption: Optional[str] = None) -> str:
|
||||
"""Fast mode simply adds the top ranked terms after a caption. It generally results in
|
||||
better similarity between generated prompt and image than classic mode, but the prompts
|
||||
are less readable."""
|
||||
caption = caption or self.generate_caption(image)
|
||||
image_features = self.image_to_features(image)
|
||||
merged = _merge_tables([
|
||||
self.artists, self.flavors, self.mediums, self.movements,
|
||||
self.trendings
|
||||
], self)
|
||||
tops = merged.rank(image_features, max_flavors)
|
||||
return _truncate_to_fit(caption + ', ' + ', '.join(tops),
|
||||
self.tokenize)
|
||||
|
||||
def interrogate_negative(self, image: Image, max_flavors: int = 32) -> str:
|
||||
"""Negative mode chains together the most dissimilar terms to the image. It can be used
|
||||
to help build a negative prompt to pair with the regular positive prompt and often
|
||||
improve the results of generated images particularly with Stable Diffusion 2."""
|
||||
image_features = self.image_to_features(image)
|
||||
flaves = self.flavors.rank(
|
||||
image_features,
|
||||
self.config.flavor_intermediate_count,
|
||||
reverse=True)
|
||||
flaves = flaves + self.negative.labels
|
||||
return self.chain(
|
||||
image_features,
|
||||
flaves,
|
||||
max_count=max_flavors,
|
||||
reverse=True,
|
||||
desc='Negative chain')
|
||||
|
||||
def interrogate(self,
|
||||
image: Image,
|
||||
min_flavors: int = 8,
|
||||
max_flavors: int = 32,
|
||||
caption: Optional[str] = None) -> str:
|
||||
caption = caption or self.generate_caption(image)
|
||||
image_features = self.image_to_features(image)
|
||||
|
||||
merged = _merge_tables([
|
||||
self.artists, self.flavors, self.mediums, self.movements,
|
||||
self.trendings
|
||||
], self)
|
||||
flaves = merged.rank(image_features,
|
||||
self.config.flavor_intermediate_count)
|
||||
best_prompt, best_sim = caption, self.similarity(
|
||||
image_features, caption)
|
||||
best_prompt = self.chain(
|
||||
image_features,
|
||||
flaves,
|
||||
best_prompt,
|
||||
best_sim,
|
||||
min_count=min_flavors,
|
||||
max_count=max_flavors,
|
||||
desc='Flavor chain')
|
||||
|
||||
fast_prompt = self.interrogate_fast(
|
||||
image, max_flavors, caption=caption)
|
||||
classic_prompt = self.interrogate_classic(
|
||||
image, max_flavors, caption=caption)
|
||||
candidates = [caption, classic_prompt, fast_prompt, best_prompt]
|
||||
return candidates[np.argmax(
|
||||
self.similarities(image_features, candidates))]
|
||||
|
||||
def rank_top(self,
|
||||
image_features: torch.Tensor,
|
||||
text_array: List[str],
|
||||
reverse: bool = False) -> str:
|
||||
self._prepare_clip()
|
||||
text_tokens = self.tokenize([text
|
||||
for text in text_array]).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = self.clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
similarity = text_features @ image_features.T
|
||||
if reverse:
|
||||
similarity = -similarity
|
||||
return text_array[similarity.argmax().item()]
|
||||
|
||||
def similarity(self, image_features: torch.Tensor, text: str) -> float:
|
||||
self._prepare_clip()
|
||||
text_tokens = self.tokenize([text]).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = self.clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
similarity = text_features @ image_features.T
|
||||
return similarity[0][0].item()
|
||||
|
||||
def similarities(self, image_features: torch.Tensor,
|
||||
text_array: List[str]) -> List[float]:
|
||||
self._prepare_clip()
|
||||
text_tokens = self.tokenize([text
|
||||
for text in text_array]).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = self.clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
similarity = text_features @ image_features.T
|
||||
return similarity.T[0].tolist()
|
||||
|
||||
def _prepare_caption(self):
|
||||
if self.config.clip_offload and not self.clip_offloaded:
|
||||
self.clip_model = self.clip_model.to('cpu')
|
||||
self.clip_offloaded = True
|
||||
if self.caption_offloaded:
|
||||
self.caption_model = self.caption_model.to(self.device)
|
||||
self.caption_offloaded = False
|
||||
|
||||
def _prepare_clip(self):
|
||||
if self.config.caption_offload and not self.caption_offloaded:
|
||||
self.caption_model = self.caption_model.to('cpu')
|
||||
self.caption_offloaded = True
|
||||
if self.clip_offloaded:
|
||||
self.clip_model = self.clip_model.to(self.device)
|
||||
self.clip_offloaded = False
|
||||
|
||||
|
||||
class LabelTable():
|
||||
|
||||
def __init__(self, labels: List[str], desc: str, ci: Interrogator):
|
||||
clip_model, config = ci.clip_model, ci.config
|
||||
self.chunk_size = config.chunk_size
|
||||
self.config = config
|
||||
self.device = config.device
|
||||
self.embeds = []
|
||||
self.labels = labels
|
||||
self.tokenize = ci.tokenize
|
||||
|
||||
hash = hashlib.sha256(','.join(labels).encode()).hexdigest()
|
||||
sanitized_name = self.config.clip_model_name.replace('/', '_').replace(
|
||||
'@', '_')
|
||||
self._load_cached(desc, hash, sanitized_name)
|
||||
|
||||
if len(self.labels) != len(self.embeds):
|
||||
self.embeds = []
|
||||
chunks = np.array_split(
|
||||
self.labels, max(1,
|
||||
len(self.labels) / config.chunk_size))
|
||||
for chunk in tqdm(
|
||||
chunks,
|
||||
desc=f'Preprocessing {desc}' if desc else None,
|
||||
disable=self.config.quiet):
|
||||
text_tokens = self.tokenize(chunk).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
text_features = text_features.half().cpu().numpy()
|
||||
for i in range(text_features.shape[0]):
|
||||
self.embeds.append(text_features[i])
|
||||
|
||||
if desc and self.config.cache_path:
|
||||
os.makedirs(self.config.cache_path, exist_ok=True)
|
||||
cache_filepath = os.path.join(
|
||||
self.config.cache_path,
|
||||
f'{sanitized_name}_{desc}.safetensors')
|
||||
tensors = {
|
||||
'embeds': np.stack(self.embeds),
|
||||
'hash': np.array([ord(c) for c in hash], dtype=np.int8)
|
||||
}
|
||||
save_file(tensors, cache_filepath)
|
||||
|
||||
if self.device == 'cpu' or self.device == torch.device('cpu'):
|
||||
self.embeds = [e.astype(np.float32) for e in self.embeds]
|
||||
|
||||
def _load_cached(self, desc: str, hash: str, sanitized_name: str) -> bool:
|
||||
if self.config.cache_path is None or desc is None:
|
||||
return False
|
||||
|
||||
cached_safetensors = os.path.join(
|
||||
self.config.cache_path, f'{sanitized_name}_{desc}.safetensors')
|
||||
|
||||
if os.path.exists(cached_safetensors):
|
||||
try:
|
||||
tensors = load_file(cached_safetensors)
|
||||
except Exception as e:
|
||||
print(f'Failed to load {cached_safetensors}')
|
||||
print(e)
|
||||
return False
|
||||
if 'hash' in tensors and 'embeds' in tensors:
|
||||
if np.array_equal(
|
||||
tensors['hash'],
|
||||
np.array([ord(c) for c in hash], dtype=np.int8)):
|
||||
self.embeds = tensors['embeds']
|
||||
if len(self.embeds.shape) == 2:
|
||||
self.embeds = [
|
||||
self.embeds[i] for i in range(self.embeds.shape[0])
|
||||
]
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _rank(self,
|
||||
image_features: torch.Tensor,
|
||||
text_embeds: torch.Tensor,
|
||||
top_count: int = 1,
|
||||
reverse: bool = False) -> str:
|
||||
top_count = min(top_count, len(text_embeds))
|
||||
text_embeds = torch.stack([torch.from_numpy(t)
|
||||
for t in text_embeds]).to(self.device)
|
||||
with torch.cuda.amp.autocast():
|
||||
similarity = image_features @ text_embeds.T
|
||||
if reverse:
|
||||
similarity = -similarity
|
||||
_, top_labels = similarity.float().cpu().topk(top_count, dim=-1)
|
||||
return [top_labels[0][i].numpy() for i in range(top_count)]
|
||||
|
||||
def rank(self,
|
||||
image_features: torch.Tensor,
|
||||
top_count: int = 1,
|
||||
reverse: bool = False) -> List[str]:
|
||||
if len(self.labels) <= self.chunk_size:
|
||||
tops = self._rank(
|
||||
image_features,
|
||||
self.embeds,
|
||||
top_count=top_count,
|
||||
reverse=reverse)
|
||||
return [self.labels[i] for i in tops]
|
||||
|
||||
num_chunks = int(math.ceil(len(self.labels) / self.chunk_size))
|
||||
keep_per_chunk = int(self.chunk_size / num_chunks)
|
||||
|
||||
top_labels, top_embeds = [], []
|
||||
for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet):
|
||||
start = chunk_idx * self.chunk_size
|
||||
stop = min(start + self.chunk_size, len(self.embeds))
|
||||
tops = self._rank(
|
||||
image_features,
|
||||
self.embeds[start:stop],
|
||||
top_count=keep_per_chunk,
|
||||
reverse=reverse)
|
||||
top_labels.extend([self.labels[start + i] for i in tops])
|
||||
top_embeds.extend([self.embeds[start + i] for i in tops])
|
||||
|
||||
tops = self._rank(image_features, top_embeds, top_count=top_count)
|
||||
return [top_labels[i] for i in tops]
|
||||
|
||||
|
||||
def _download_file(url: str,
|
||||
filepath: str,
|
||||
chunk_size: int = 4 * 1024 * 1024,
|
||||
quiet: bool = False):
|
||||
r = requests.get(url, stream=True)
|
||||
if r.status_code != 200:
|
||||
return
|
||||
|
||||
file_size = int(r.headers.get('Content-Length', 0))
|
||||
filename = url.split('/')[-1]
|
||||
progress = tqdm(
|
||||
total=file_size,
|
||||
unit='B',
|
||||
unit_scale=True,
|
||||
desc=filename,
|
||||
disable=quiet)
|
||||
with open(filepath, 'wb') as f:
|
||||
for chunk in r.iter_content(chunk_size=chunk_size):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
progress.update(len(chunk))
|
||||
progress.close()
|
||||
|
||||
|
||||
def _merge_tables(tables: List[LabelTable], ci: Interrogator) -> LabelTable:
|
||||
m = LabelTable([], None, ci)
|
||||
for table in tables:
|
||||
m.labels.extend(table.labels)
|
||||
m.embeds.extend(table.embeds)
|
||||
return m
|
||||
|
||||
|
||||
def _prompt_at_max_len(text: str, tokenize) -> bool:
|
||||
tokens = tokenize([text])
|
||||
return tokens[0][-1] != 0
|
||||
|
||||
|
||||
def _truncate_to_fit(text: str, tokenize) -> str:
|
||||
parts = text.split(', ')
|
||||
new_text = parts[0]
|
||||
for part in parts[1:]:
|
||||
if _prompt_at_max_len(new_text + part, tokenize):
|
||||
break
|
||||
new_text += ', ' + part
|
||||
return new_text
|
||||
|
||||
|
||||
def list_caption_models() -> List[str]:
|
||||
return list(CAPTION_MODELS.keys())
|
||||
|
||||
|
||||
def list_clip_models() -> List[str]:
|
||||
return ['/'.join(x) for x in open_clip.list_pretrained()]
|
||||
|
||||
|
||||
def load_list(data_path: str, filename: Optional[str] = None) -> List[str]:
|
||||
"""Load a list of strings from a file."""
|
||||
if filename is not None:
|
||||
data_path = os.path.join(data_path, filename)
|
||||
with open(data_path, 'r', encoding='utf-8', errors='replace') as f:
|
||||
items = [line.strip() for line in f.readlines()]
|
||||
return items
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.image_captioning, module_name=Models.clip_interrogator)
|
||||
class CLIP_Interrogator(TorchModel):
|
||||
|
||||
def __init__(self, model_dir, device='cuda', device_id=0, *args, **kwargs):
|
||||
super().__init__(
|
||||
model_dir=model_dir, device_id=device_id, *args, **kwargs)
|
||||
self.device = device
|
||||
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||
cf = Config(clip_model_name='ViT-L-14/openai')
|
||||
cf.data_path = os.path.join(model_dir, 'data')
|
||||
cf.clip_model_path = model_dir
|
||||
cf.cache_path = model_dir
|
||||
self.ci = Interrogator(cf)
|
||||
|
||||
def forward(self, inputs):
|
||||
image = transforms.ToPILImage()(inputs)
|
||||
return {'caption': self.ci.interrogate(image)}
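A minimal usage sketch for the registered wrapper above (assumptions: model_dir points to a local copy of the model assets containing the data/ word lists and cached embeddings; 'example.jpg' and the directory name are placeholders):

from PIL import Image
from torchvision.transforms.functional import to_tensor

model = CLIP_Interrogator(model_dir='path/to/clip_interrogator_assets', device='cuda')
img = to_tensor(Image.open('example.jpg').convert('RGB'))   # C x H x W float tensor
print(model.forward(img)['caption'])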
|
||||
@@ -128,13 +128,13 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
|
||||
local_transform,
|
||||
s=None,
|
||||
e=None):
|
||||
video_mask = np.zeros(self.max_frames, dtype=np.long)
|
||||
video_mask = np.zeros(self.max_frames, dtype=int)
|
||||
max_video_length = 0
|
||||
|
||||
# T x 3 x H x W
|
||||
video = np.zeros((self.max_frames, 3, rawVideoExtractor.size,
|
||||
rawVideoExtractor.size),
|
||||
dtype=np.float)
|
||||
dtype=float)
|
||||
|
||||
if s is None:
|
||||
start_time, end_time = None, None
|
||||
|
||||
modelscope/models/multi_modal/mplug_owl/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig,
|
||||
MplugOwlVisualAbstractorConfig)
|
||||
from .modeling_mplug_owl import MplugOwlForConditionalGeneration
|
||||
@@ -0,0 +1,257 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" MPLUG OWL model configuration """
|
||||
import copy
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.auto import CONFIG_MAPPING
|
||||
from transformers.utils import logging
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
logger = logging.get_logger()
|
||||
|
||||
|
||||
class MplugOwlVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
Args:
|
||||
hidden_size (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
intermediate_size (`int`, *optional*, defaults to 3072):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
image_size (`int`, *optional*, defaults to 224):
|
||||
The size (resolution) of each image.
|
||||
patch_size (`int`, *optional*, defaults to 32):
|
||||
The size (resolution) of each patch.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
initializer_factor (`float`, *optional*, defaults to 1):
|
||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||
testing).
|
||||
```"""
|
||||
|
||||
model_type = 'mplug_owl_vision_model'
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=1024,
|
||||
intermediate_size=4096,
|
||||
projection_dim=768,
|
||||
num_hidden_layers=24,
|
||||
num_attention_heads=16,
|
||||
num_channels=3,
|
||||
image_size=224,
|
||||
patch_size=14,
|
||||
hidden_act='quick_gelu',
|
||||
layer_norm_eps=1e-6,
|
||||
attention_dropout=0.0,
|
||||
initializer_range=0.02,
|
||||
initializer_factor=1.0,
|
||||
use_flash_attn=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.projection_dim = projection_dim
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
self.use_flash_attn = use_flash_attn
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||
os.PathLike],
|
||||
**kwargs) -> 'PretrainedConfig':
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from MplugOwlConfig
|
||||
if config_dict.get('model_type') == 'mplug_owl':
|
||||
config_dict = config_dict['vision_config']
|
||||
|
||||
if 'model_type' in config_dict and hasattr(
|
||||
cls,
|
||||
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class MplugOwlVisualAbstractorConfig(PretrainedConfig):
|
||||
|
||||
model_type = 'MPlugOwlVisualAbstractor'
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=1024,
|
||||
num_hidden_layers=6,
|
||||
num_attention_heads=16,
|
||||
intermediate_size=4096,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-6,
|
||||
encoder_hidden_size=1024,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.encoder_hidden_size = encoder_hidden_size
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||
os.PathLike],
|
||||
**kwargs) -> 'PretrainedConfig':
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the qformer config dict if we are loading from MplugOwlConfig
|
||||
if config_dict.get('model_type') == 'mplug_owl':
|
||||
config_dict = config_dict['abstractor_config']
|
||||
|
||||
if 'model_type' in config_dict and hasattr(
|
||||
cls,
|
||||
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class MplugOwlConfig(PretrainedConfig):
|
||||
r"""
|
||||
Args:
|
||||
vision_config (`dict`, *optional*):
|
||||
Dictionary of configuration options used to initialize [`MplugOwlVisionConfig`].
|
||||
qformer_config (`dict`, *optional*):
|
||||
Dictionary of configuration options used to initialize [`MplugOwlVisualAbstractorConfig`].
|
||||
text_config (`dict`, *optional*):
|
||||
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
|
||||
num_query_tokens (`int`, *optional*, defaults to 32):
|
||||
The number of query tokens passed through the Transformer.
|
||||
|
||||
kwargs (*optional*):
|
||||
Dictionary of keyword arguments.
|
||||
"""
|
||||
|
||||
model_type = 'mplug_owl'
|
||||
is_composition = True
|
||||
|
||||
def __init__(self,
|
||||
task=Tasks.multimodal_dialogue,
|
||||
vision_config=None,
|
||||
visual_abstractor_config=None,
|
||||
text_config=None,
|
||||
num_query_tokens=64,
|
||||
**kwargs):
|
||||
|
||||
super().__init__(**kwargs)
|
||||
self.task = task
|
||||
if vision_config is None:
|
||||
vision_config = MplugOwlVisionConfig().to_dict()
|
||||
logger.info('vision_config is None.')
|
||||
|
||||
if visual_abstractor_config is None:
|
||||
visual_abstractor_config = {}
|
||||
logger.info('abstractor_config is None. ')
|
||||
|
||||
if text_config is None:
|
||||
# we use LLAMA 7b by default
|
||||
from transformers.models.llama.configuration_llama import \
|
||||
LlamaConfig
|
||||
text_config = LlamaConfig(pad_token_id=2).to_dict()
|
||||
logger.info('text_config is None.')
|
||||
|
||||
self.vision_config = MplugOwlVisionConfig(**vision_config)
|
||||
self.visual_abstractor_config = MplugOwlVisualAbstractorConfig(
|
||||
**visual_abstractor_config)
|
||||
text_model_type = text_config[
|
||||
'model_type'] if 'model_type' in text_config else 'llama'
|
||||
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
|
||||
|
||||
self.tie_word_embeddings = self.text_config.tie_word_embeddings
|
||||
|
||||
self.num_query_tokens = num_query_tokens
|
||||
self.initializer_factor = 1.0
|
||||
self.initializer_range = 0.02
|
||||
|
||||
@classmethod
|
||||
def from_vision_abstractor_text_configs(
|
||||
cls,
|
||||
vision_config: MplugOwlVisionConfig,
|
||||
visual_abstractor_config: MplugOwlVisualAbstractorConfig,
|
||||
text_config: PretrainedConfig,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Returns:
|
||||
[`MplugOwlConfig`]: An instance of a configuration object
|
||||
"""
|
||||
|
||||
return cls(
|
||||
vision_config=vision_config.to_dict(),
|
||||
visual_abstractor_config=visual_abstractor_config.to_dict(),
|
||||
text_config=text_config.to_dict(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
"""
|
||||
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
|
||||
|
||||
Returns:
|
||||
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
|
||||
"""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
output['vision_config'] = self.vision_config.to_dict()
|
||||
tmp = self.visual_abstractor_config.to_dict()
|
||||
output['visual_abstractor_config'] = tmp
|
||||
output['text_config'] = self.text_config.to_dict()
|
||||
output['model_type'] = self.__class__.model_type
|
||||
return output
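A short sketch of composing the full configuration from the three sub-configurations defined above (a hypothetical illustration; pretrained checkpoints would normally be loaded via from_pretrained instead):

from transformers.models.llama.configuration_llama import LlamaConfig

vision_cfg = MplugOwlVisionConfig(image_size=224, patch_size=14)
abstractor_cfg = MplugOwlVisualAbstractorConfig(num_hidden_layers=6)
text_cfg = LlamaConfig(pad_token_id=2)

config = MplugOwlConfig.from_vision_abstractor_text_configs(
    vision_config=vision_cfg,
    visual_abstractor_config=abstractor_cfg,
    text_config=text_cfg,
    num_query_tokens=64)
print(config.to_dict()['model_type'])    # 'mplug_owl'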
|
||||
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py (new file, 1551 lines; diff suppressed because it is too large)
@@ -212,10 +212,10 @@ class ConstructBlockStrategy:
|
||||
block_spans,
|
||||
rng,
|
||||
task='bert'):
|
||||
position_ids = np.arange(len(tokens), dtype=np.long)
|
||||
position_ids = np.arange(len(tokens), dtype=int)
|
||||
targets = copy.deepcopy(tokens)
|
||||
mask_id = self.tokenizer.get_command('MASK').Id
|
||||
mlm_masks = np.zeros(len(tokens), dtype=np.long)
|
||||
mlm_masks = np.zeros(len(tokens), dtype=int)
|
||||
for start, end in block_spans:
|
||||
for idx in range(start, end):
|
||||
tokens[idx] = mask_id
|
||||
@@ -231,7 +231,7 @@ class ConstructBlockStrategy:
|
||||
rng,
|
||||
task='bert'):
|
||||
text_length = len(tokens)
|
||||
position_ids = np.ones(len(tokens), dtype=np.long)
|
||||
position_ids = np.ones(len(tokens), dtype=int)
|
||||
for start, end in block_spans:
|
||||
position_ids[start + 1:end] = 0
|
||||
position_ids = np.cumsum(position_ids) - 1
|
||||
@@ -270,7 +270,7 @@ class ConstructBlockStrategy:
|
||||
(end - start + 1))
|
||||
if self.block_position_encoding:
|
||||
target_block_position_ids.append(
|
||||
np.arange(1, end - start + 2, dtype=np.long))
|
||||
np.arange(1, end - start + 2, dtype=int))
|
||||
else:
|
||||
target_block_position_ids.append([1] * (end - start + 1))
|
||||
block_spans.sort(key=lambda x: x[0])
|
||||
@@ -307,7 +307,7 @@ class ConstructBlockStrategy:
|
||||
target_tokens = target_tokens + [
|
||||
self.tokenizer.get_command('eop').Id
|
||||
]
|
||||
loss_masks = np.ones(len(target_tokens), dtype=np.long)
|
||||
loss_masks = np.ones(len(target_tokens), dtype=int)
|
||||
return source_tokens, target_tokens, loss_masks
|
||||
else:
|
||||
tokens = np.concatenate(source_tokens + target_tokens)
|
||||
@@ -326,12 +326,12 @@ class ConstructBlockStrategy:
|
||||
for pos in mask_pos:
|
||||
tokens[pos] = self.tokenizer.get_command('dBLOCK').Id
|
||||
targets = np.concatenate(source_tokens + targets)
|
||||
loss_masks = np.ones(len(tokens), dtype=np.long)
|
||||
loss_masks = np.ones(len(tokens), dtype=int)
|
||||
loss_masks[:source_length] = 0
|
||||
position_ids = np.concatenate(source_position_ids
|
||||
+ target_position_ids)
|
||||
block_position_ids = np.concatenate(
|
||||
[np.zeros(source_length, dtype=np.long)]
|
||||
[np.zeros(source_length, dtype=int)]
|
||||
+ target_block_position_ids)
|
||||
position_ids = np.stack([position_ids, block_position_ids], axis=0)
|
||||
if attention_mask is not None:
|
||||
@@ -539,22 +539,21 @@ class ConstructBlockStrategy:
|
||||
(source_tokens, [self.generation_mask], target_tokens))
|
||||
loss_masks = np.concatenate(
|
||||
(np.zeros(len(source_tokens) + 1,
|
||||
dtype=np.long), target_masks))
|
||||
dtype=int), target_masks))
|
||||
token_batch.append(tokens)
|
||||
target_batch.append(targets)
|
||||
loss_mask_batch.append(loss_masks)
|
||||
position_ids = np.arange(
|
||||
len(source_tokens) + len(target_tokens) + 1,
|
||||
dtype=np.long)
|
||||
len(source_tokens) + len(target_tokens) + 1, dtype=int)
|
||||
position_ids[len(source_tokens) + 1:] = len(source_tokens)
|
||||
if self.block_position_encoding:
|
||||
block_position_ids = np.concatenate(
|
||||
(np.zeros(len(source_tokens), dtype=np.long),
|
||||
np.arange(len(target_tokens) + 1, dtype=np.long)))
|
||||
(np.zeros(len(source_tokens), dtype=int),
|
||||
np.arange(len(target_tokens) + 1, dtype=int)))
|
||||
else:
|
||||
block_position_ids = np.concatenate(
|
||||
(np.zeros(len(source_tokens) + 1, dtype=np.long),
|
||||
np.ones(len(target_tokens) + 1, dtype=np.long)))
|
||||
(np.zeros(len(source_tokens) + 1, dtype=int),
|
||||
np.ones(len(target_tokens) + 1, dtype=int)))
|
||||
position_id_batch.append(
|
||||
np.stack([position_ids, block_position_ids], axis=0))
|
||||
else:
|
||||
@@ -597,27 +596,25 @@ class ConstructBlockStrategy:
max_length = max(seq_lengths)
token_batch = [
np.concatenate(
(tokens, np.zeros(max_length - len(tokens),
dtype=np.long)))
(tokens, np.zeros(max_length - len(tokens), dtype=int)))
for tokens in token_batch
]
target_batch = [
np.concatenate(
(targets,
np.zeros(max_length - len(targets), dtype=np.long)))
(targets, np.zeros(max_length - len(targets), dtype=int)))
for targets in target_batch
]
loss_mask_batch = [
np.concatenate(
(loss_masks,
np.zeros(max_length - len(loss_masks), dtype=np.long)))
np.zeros(max_length - len(loss_masks), dtype=int)))
for loss_masks in loss_mask_batch
]
position_id_batch = [
np.concatenate((position_ids,
np.concatenate(
(position_ids,
np.zeros(
(2, max_length - position_ids.shape[1]),
dtype=np.long)),
(2, max_length - position_ids.shape[1]), dtype=int)),
axis=1) for position_ids in position_id_batch
]
return token_batch, target_batch, loss_mask_batch, position_id_batch
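The hunk above right-pads every per-sample array (tokens, targets, loss masks, 2D position ids) to the longest sequence in the batch before collation. A hedged, self-contained sketch of the same padding idea (the helper is hypothetical, not the repository's API):

import numpy as np

def pad_batch(batch, pad_value=0):
    # Right-pad 1-D integer arrays to the length of the longest element.
    max_length = max(len(x) for x in batch)
    return np.stack([
        np.concatenate((x, np.full(max_length - len(x), pad_value, dtype=int)))
        for x in batch
    ])

print(pad_batch([np.array([1, 2, 3]), np.array([4, 5])]))
# [[1 2 3]
#  [4 5 0]]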
@@ -583,8 +583,8 @@ class XLDataset(data.Dataset):
def getidx(self, idx):
tokens, targets, loss_masks = [], [], []
attention_mask = np.concatenate(
(np.zeros((self.max_seq_len, self.mem_len), dtype=np.long),
np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)),
(np.zeros((self.max_seq_len, self.mem_len), dtype=int),
np.ones((self.max_seq_len, self.max_seq_len), dtype=int)),
axis=1)
sample_idx = bisect_right(self.indices, idx * self.max_seq_len)
last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1]
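The attention_mask built here concatenates a zero block over the cached memory segment with a ones block over the current segment, giving a (max_seq_len, mem_len + max_seq_len) matrix in the Transformer-XL layout. A minimal sketch of the resulting shape (values are illustrative):

import numpy as np

max_seq_len, mem_len = 4, 2
attention_mask = np.concatenate(
    (np.zeros((max_seq_len, mem_len), dtype=int),
     np.ones((max_seq_len, max_seq_len), dtype=int)),
    axis=1)
print(attention_mask.shape)  # (4, 6): mem_len columns of zeros, max_seq_len columns of ones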
@@ -28,7 +28,7 @@ def main():
counts = np.array([0] * 10)
for _ in range(10000):
spans = strategy.sample_span_in_document(
np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1],
np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=int), [1, 1],
random.Random())
for start, end in spans:
counts[start:end] += 1
@@ -17,7 +17,7 @@ def main():
num_iters=300000,
decay_style='cosine',
decay_ratio=0.1)
steps = np.arange(0, 400000, 10, dtype=np.long)
steps = np.arange(0, 400000, 10, dtype=int)
rates = []
for step in steps:
lr_scheduler.num_iters = step
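This test sweeps the scheduler's step counter over a grid and records the learning rate at each point. A self-contained sketch of the same probing pattern with a plain warmup-plus-cosine schedule (the formula is a generic stand-in, not the repository's scheduler):

import math

def cosine_lr(step, base_lr=1e-4, warmup=3000, total=300000, decay_ratio=0.1):
    # Linear warmup, then cosine decay down to decay_ratio * base_lr.
    if step < warmup:
        return base_lr * step / warmup
    progress = min(1.0, (step - warmup) / (total - warmup))
    floor = base_lr * decay_ratio
    return floor + 0.5 * (base_lr - floor) * (1 + math.cos(math.pi * progress))

rates = [cosine_lr(step) for step in range(0, 400000, 10)]
print(min(rates), max(rates))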
@@ -5,12 +5,12 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .configuration_unite import UniTEConfig
from .modeling_unite import UniTEForTranslationEvaluation
from .configuration import UniTEConfig
from .translation_evaluation import UniTEForTranslationEvaluation
else:
_import_structure = {
'configuration_unite': ['UniTEConfig'],
'modeling_unite': ['UniTEForTranslationEvaluation'],
'configuration': ['UniTEConfig'],
'translation_evaluation': ['UniTEForTranslationEvaluation'],
}

import sys
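This hunk renames the UniTE submodules (configuration_unite to configuration, modeling_unite to translation_evaluation) in both branches of the lazy-import pattern: real imports for type checkers, and a string map handed to LazyImportModule at runtime. A sketch of what such an __init__.py roughly looks like; the exact LazyImportModule argument list is assumed from other modelscope packages and should be checked against the file itself:

import sys
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Static type checkers resolve the real symbols.
    from .configuration import UniTEConfig
    from .translation_evaluation import UniTEForTranslationEvaluation
else:
    # At runtime the submodules are imported lazily on first attribute access.
    _import_structure = {
        'configuration': ['UniTEConfig'],
        'translation_evaluation': ['UniTEForTranslationEvaluation'],
    }
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )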
@@ -9,7 +9,7 @@ from modelscope.utils.config import Config
logger = logging.get_logger()


class EvaluationMode(Enum):
class InputFormat(Enum):
SRC = 'src'
REF = 'ref'
SRC_REF = 'src-ref'
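The enum is renamed from EvaluationMode to InputFormat; its members say which fields accompany a hypothesis when it is scored (source only, reference only, or both). A small, hedged usage sketch (the mapping below is illustrative, not the model's code):

from enum import Enum

class InputFormat(Enum):
    SRC = 'src'
    REF = 'ref'
    SRC_REF = 'src-ref'

def required_fields(fmt: InputFormat):
    # Illustrative: which inputs a scoring request needs for each format.
    return {
        InputFormat.SRC: ('hyp', 'src'),
        InputFormat.REF: ('hyp', 'ref'),
        InputFormat.SRC_REF: ('hyp', 'src', 'ref'),
    }[fmt]

print(required_fields(InputFormat.SRC_REF))  # ('hyp', 'src', 'ref')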
@@ -20,6 +20,8 @@ from transformers.activations import ACT2FN
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.nlp.unite.configuration import InputFormat
from modelscope.outputs.nlp_outputs import TranslationEvaluationOutput
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
@@ -71,8 +73,16 @@ class LayerwiseAttention(Module):
mask: torch.Tensor = None,
) -> torch.Tensor:
tensors = torch.cat(list(x.unsqueeze(dim=0) for x in tensors), dim=0)

if self.training and self.dropout:
normed_weights = softmax(
self.scalar_parameters, dim=0).view(-1, 1, 1, 1)
torch.where(self.dropout_mask.uniform_() > self.dropout,
self.scalar_parameters, self.dropout_fill),
dim=-1)
else:
normed_weights = softmax(self.scalar_parameters, dim=-1)

normed_weights = normed_weights.view(-1, 1, 1, 1)

mask_float = mask.float()
weighted_sum = (normed_weights
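The new branch drops out individual layer weights during training (replacing them with a fill value before the softmax), and otherwise softmax-normalizes the learned scalars to mix all encoder hidden states into one representation. A hedged, self-contained sketch of the mixing idea (not the repository's exact module):

import torch
from torch.nn.functional import softmax

num_layers, batch, seq_len, dim = 4, 2, 5, 8
hidden_states = [torch.randn(batch, seq_len, dim) for _ in range(num_layers)]
scalar_parameters = torch.zeros(num_layers, requires_grad=True)

# Stack the per-layer states, softmax the scalars, take a weighted sum over layers.
tensors = torch.stack(hidden_states, dim=0)                  # (L, B, T, D)
normed_weights = softmax(scalar_parameters, dim=-1).view(-1, 1, 1, 1)
mixed = (normed_weights * tensors).sum(dim=0)                # (B, T, D)
print(mixed.shape)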
@@ -266,8 +276,11 @@ class UniTEForTranslationEvaluation(TorchModel):

return

def forward(self, input_sentences: List[torch.Tensor]):
input_ids = self.combine_input_sentences(input_sentences)
def forward(self,
input_ids: torch.Tensor,
input_format: Optional[List[InputFormat]] = None,
score: Optional[torch.Tensor] = None,
**kwargs) -> TranslationEvaluationOutput:
attention_mask = input_ids.ne(self.pad_token_id).long()
outputs = self.encoder(
input_ids=input_ids,
@@ -276,32 +289,48 @@ class UniTEForTranslationEvaluation(TorchModel):
return_dict=True)
mix_states = self.layerwise_attention(outputs['hidden_states'],
attention_mask)
pred = self.estimator(mix_states)
return pred.squeeze(dim=-1)
pred = self.estimator(mix_states).squeeze(dim=-1)
output = TranslationEvaluationOutput(
score=pred.cpu().tolist(), input_format=input_format)

def load_checkpoint(self, path: str, device: torch.device):
if score is not None:
loss = (pred - score).pow(2).mean()
output['loss'] = loss

return output

def load_checkpoint(self, path: str, device: torch.device, plm_only: bool):
if plm_only:
self.encoder = self.encoder.from_pretrained(path).to(device)
self.encoder.pooler = None
else:
state_dict = torch.load(path, map_location=device)
self.load_state_dict(state_dict)
logger.info('Loading checkpoint parameters from %s' % path)
return

def combine_input_sentences(self, input_sent_groups: List[torch.Tensor]):
for input_sent_group in input_sent_groups[1:]:
input_sent_group[:, 0] = self.eos_token_id

if len(input_sent_groups) == 3:
cutted_sents = self.cut_long_sequences3(input_sent_groups)
def combine_input_sentences(all_input_concat: List[List[torch.Tensor]],
maximum_length: int = 512,
pad_idx: int = 1,
eos_idx: int = 2):
for group in all_input_concat[1:]:
group[:, 0] = eos_idx

if len(all_input_concat) == 3:
return cut_long_sequences3(all_input_concat, maximum_length, pad_idx)
else:
cutted_sents = self.cut_long_sequences2(input_sent_groups)
return cutted_sents
return cut_long_sequences2(all_input_concat, maximum_length, pad_idx)

@staticmethod
def cut_long_sequences2(all_input_concat: List[List[torch.Tensor]],

def cut_long_sequences2(all_input_concat: List[List[torch.Tensor]],
maximum_length: int = 512,
pad_idx: int = 1):
all_input_concat = list(zip(*all_input_concat))
collected_tuples = list()
for tensor_tuple in all_input_concat:
tensor_tuple = tuple(
x.masked_select(x.ne(pad_idx)) for x in tensor_tuple)
all_lens = tuple(len(x) for x in tensor_tuple)

if sum(all_lens) > maximum_length:
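With the new signature, forward consumes already-combined input_ids plus an optional gold score and returns a TranslationEvaluationOutput; when score is supplied, a mean-squared-error loss is attached for training. A hedged toy stand-in that mirrors this contract without modelscope (everything below is illustrative, not the repository's class):

import torch

class TinyEvaluator(torch.nn.Module):
    # Toy model mirroring the new forward contract: emit a score list and,
    # when a gold score is passed, an MSE loss alongside it.
    def __init__(self, pad_token_id=1, dim=8):
        super().__init__()
        self.pad_token_id = pad_token_id
        self.embed = torch.nn.Embedding(100, dim, padding_idx=pad_token_id)
        self.estimator = torch.nn.Linear(dim, 1)

    def forward(self, input_ids, score=None):
        mask = input_ids.ne(self.pad_token_id).float().unsqueeze(-1)
        pooled = (self.embed(input_ids) * mask).sum(1) / mask.sum(1)
        pred = self.estimator(pooled).squeeze(dim=-1)
        output = {'score': pred.detach().cpu().tolist()}
        if score is not None:
            output['loss'] = (pred - score).pow(2).mean()
        return output

model = TinyEvaluator()
out = model(torch.tensor([[0, 12, 34, 2, 1, 1]]), score=torch.tensor([0.8]))
print(out['score'], out['loss'].item())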
@@ -315,13 +344,12 @@ class UniTEForTranslationEvaluation(TorchModel):
// 2) and min(all_lens) > offset:
lengths = dict((k, v - offset) for k, v in lengths.items())
else:
lengths[lengths_sorted_idxes[
0]] = maximum_length - lengths[lengths_sorted_idxes[1]]
lengths[lengths_sorted_idxes[0]] = maximum_length - lengths[
lengths_sorted_idxes[1]]

new_lens = list(lengths[k]
for k in range(0, len(tensor_tuple)))
new_tensor_tuple = tuple(
x[:y] for x, y in zip(tensor_tuple, new_lens))
new_lens = list(lengths[k] for k in range(0, len(tensor_tuple)))
new_tensor_tuple = tuple(x[:y]
for x, y in zip(tensor_tuple, new_lens))
for x, y in zip(new_tensor_tuple, tensor_tuple):
x[-1] = y[-1]
collected_tuples.append(new_tensor_tuple)
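cut_long_sequences2/3 strip padding from each hypothesis/source(/reference) tuple and, when the combined length exceeds maximum_length, shrink the longest members while writing each original final token (the EOS) back onto the truncated piece. A simplified, hedged sketch of that truncate-but-keep-EOS policy for a two-segment pair (not the repository's exact length-balancing rules):

import torch

def cut_pair(a: torch.Tensor, b: torch.Tensor, maximum_length: int = 12):
    # Shrink the longer segment one token at a time until the pair fits,
    # restoring that segment's original last token (EOS) after every cut.
    segments = [a.clone(), b.clone()]
    while sum(len(s) for s in segments) > maximum_length:
        longest = max(range(2), key=lambda i: len(segments[i]))
        eos = segments[longest][-1].clone()
        segments[longest] = segments[longest][:-1]
        segments[longest][-1] = eos
    return torch.cat(segments)

print(cut_pair(torch.arange(10), torch.arange(8)))  # 12 tokens, both EOS values kept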
@@ -331,16 +359,17 @@ class UniTEForTranslationEvaluation(TorchModel):
concat_tensor = list(torch.cat(x, dim=0) for x in collected_tuples)
all_input_concat_padded = pad_sequence(
concat_tensor, batch_first=True, padding_value=pad_idx)

return all_input_concat_padded

@staticmethod
def cut_long_sequences3(all_input_concat: List[List[torch.Tensor]],

def cut_long_sequences3(all_input_concat: List[List[torch.Tensor]],
maximum_length: int = 512,
pad_idx: int = 1):
all_input_concat = list(zip(*all_input_concat))
collected_tuples = list()
for tensor_tuple in all_input_concat:
tensor_tuple = tuple(
x.masked_select(x.ne(pad_idx)) for x in tensor_tuple)
all_lens = tuple(len(x) for x in tensor_tuple)

if sum(all_lens) > maximum_length:
@@ -357,9 +386,8 @@ class UniTEForTranslationEvaluation(TorchModel):
while sum(lengths.values()) > maximum_length:
if lengths[lengths_sorted_idxes[0]] > lengths[
lengths_sorted_idxes[1]]:
offset = maximum_length - lengths[
lengths_sorted_idxes[1]] - lengths[
lengths_sorted_idxes[2]]
offset = maximum_length - lengths[lengths_sorted_idxes[
1]] - lengths[lengths_sorted_idxes[2]]
if offset > lengths[lengths_sorted_idxes[1]]:
lengths[lengths_sorted_idxes[0]] = offset
else:
@@ -380,12 +408,11 @@ class UniTEForTranslationEvaluation(TorchModel):
else:
lengths[lengths_sorted_idxes[0]] = lengths[
lengths_sorted_idxes[1]] = lengths[
lengths_sorted_idxes[
2]] = maximum_length // 3
lengths_sorted_idxes[2]] = maximum_length // 3

new_lens = list(lengths[k] for k in range(0, len(lengths)))
new_tensor_tuple = tuple(
x[:y] for x, y in zip(tensor_tuple, new_lens))
new_tensor_tuple = tuple(x[:y]
for x, y in zip(tensor_tuple, new_lens))

for x, y in zip(new_tensor_tuple, tensor_tuple):
x[-1] = y[-1]
@@ -396,5 +423,4 @@ class UniTEForTranslationEvaluation(TorchModel):
concat_tensor = list(torch.cat(x, dim=0) for x in collected_tuples)
all_input_concat_padded = pad_sequence(
concat_tensor, batch_first=True, padding_value=pad_idx)

return all_input_concat_padded
Some files were not shown because too many files have changed in this diff.