Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-16 16:27:45 +01:00)

Commit: add 1.6
@@ -108,9 +108,9 @@ Audio:
 * [speech_charctc_kws_phone-xiaoyun](https://modelscope.cn/models/damo/speech_charctc_kws_phone-xiaoyun)
 * [u2pp_conformer-asr-cn-16k-online](https://modelscope.cn/models/wenet/u2pp_conformer-asr-cn-16k-online)
 * [speech_fsmn_vad_zh-cn-16k-common-pytorch](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
 * [punc_ct-transformer_zh-cn-common-vocab272727-pytorch](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)
 * [speech_frcrn_ans_cirm_16k](https://modelscope.cn/models/damo/speech_frcrn_ans_cirm_16k)
@@ -1,13 +1,12 @@
 import os
 from dataclasses import dataclass, field
 
+from modelscope import MsDataset, TrainingArgs
 from modelscope.metainfo import Trainers
-from modelscope.msdatasets.ms_dataset import MsDataset
 from modelscope.trainers.builder import build_trainer
-from modelscope.trainers.training_args import TrainingArgs
 
 
-@dataclass
+@dataclass(init=False)
 class ImageClassificationTrainingArgs(TrainingArgs):
     num_classes: int = field(
         default=None,
@@ -46,26 +45,35 @@ def create_dataset(name, split):
         dataset_name, namespace=namespace, subset_name='default', split=split)
 
 
-def train():
-    args = ImageClassificationTrainingArgs.from_cli(
-        model='damo/cv_vit-base_image-classification_ImageNet-labels',
-        max_epochs=1,
-        lr=1e-4,
-        optimizer='AdamW',
-        warmup_iters=1,
-        topk=(1, ))
-    if args.dataset_name is not None:
-        train_dataset = create_dataset(args.dataset_name, split='train')
-        val_dataset = create_dataset(args.dataset_name, split='validation')
-    else:
-        train_dataset = create_dataset(args.train_dataset_name, split='train')
-        val_dataset = create_dataset(args.val_dataset_name, split='validation')
+training_args = ImageClassificationTrainingArgs(
+    model='damo/cv_vit-base_image-classification_ImageNet-labels',
+    max_epochs=1,
+    lr=1e-4,
+    optimizer='AdamW',
+    warmup_iters=1,
+    topk=(1, )).parse_cli()
+config, args = training_args.to_config()
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    return cfg
+
+
+def train():
+    train_dataset = create_dataset(
+        training_args.train_dataset_name, split=training_args.train_split)
+    val_dataset = create_dataset(
+        training_args.val_dataset_name, split=training_args.val_split)
 
     kwargs = dict(
         model=args.model,  # model id
         train_dataset=train_dataset,  # training dataset
         eval_dataset=val_dataset,  # validation dataset
-        cfg_modify_fn=args  # callback to modify configuration
+        cfg_modify_fn=cfg_modify_fn  # callback to modify configuration
     )
 
     # in distributed training, specify pytorch launcher
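The same flow — instantiate a `TrainingArgs` subclass, call `parse_cli()`, split it into `config`/`args` with `to_config()`, and merge the CLI configuration back inside a `cfg_modify_fn` callback — recurs in every example touched by this commit. A minimal standalone sketch of that flow follows; the argument class, extra field and model id are illustrative placeholders, not values taken from this diff:

from dataclasses import dataclass, field

from modelscope import TrainingArgs
from modelscope.trainers import build_trainer


@dataclass(init=False)
class MyTrainingArgs(TrainingArgs):
    # hypothetical extra CLI field, for illustration only
    num_classes: int = field(
        default=None, metadata={'help': 'The number of classes'})


# parse command-line flags, then split into a Config plus plain args
training_args = MyTrainingArgs(model='namespace/some-model-id').parse_cli()
config, args = training_args.to_config()


def cfg_modify_fn(cfg):
    # --use_model_config decides whether CLI values are merged into the
    # model's own configuration or replace it outright
    if args.use_model_config:
        cfg.merge_from_dict(config)
    else:
        cfg = config
    return cfg


trainer = build_trainer(
    default_args=dict(model=args.model, cfg_modify_fn=cfg_modify_fn))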
@@ -2,4 +2,7 @@ PYTHONPATH=. python -m torch.distributed.launch --nproc_per_node=2 \
     examples/pytorch/image_classification/finetune_image_classification.py \
     --num_classes 2 \
     --train_dataset_name 'tany0699/cats_and_dogs' \
-    --val_dataset_name 'tany0699/cats_and_dogs'
+    --val_dataset_name 'tany0699/cats_and_dogs' \
+    --train_split train \
+    --val_split validation \
+    --use_model_config true \
@@ -1,15 +1,13 @@
 import os
 from dataclasses import dataclass, field
-from functools import partial
 
+from modelscope import MsDataset, TrainingArgs
 from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
-from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
-                                                set_flatten_value)
+from modelscope.trainers.training_args import set_flatten_value
 
 
-@dataclass
+@dataclass(init=False)
 class MultiModalEmbeddingArguments(TrainingArgs):
 
     trainer: str = field(
@@ -17,6 +15,12 @@ class MultiModalEmbeddingArguments(TrainingArgs):
             'help': 'The trainer used',
         })
 
+    work_dir: str = field(
+        default='./tmp',
+        metadata={
+            'help': 'The working path for saving checkpoint',
+        })
+
     use_fp16: bool = field(
         default=None,
         metadata={
@@ -35,7 +39,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'train.optimizer_hparams',
-            'cfg_getter': partial(get_flatten_value, exclusions=['lr']),
             'cfg_setter': set_flatten_value,
             'help': 'The optimizer init params except `lr`',
         })
@@ -51,7 +54,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'dataset.column_map',
-            'cfg_getter': get_flatten_value,
             'cfg_setter': set_flatten_value,
             'help': 'The column map for dataset',
         })
@@ -67,7 +69,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'train.lr_scheduler_hook',
-            'cfg_getter': get_flatten_value,
             'cfg_setter': set_flatten_value,
             'help': 'The parameters for lr scheduler hook',
         })
@@ -76,7 +77,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'train.optimizer_hook',
-            'cfg_getter': get_flatten_value,
            'cfg_setter': set_flatten_value,
             'help': 'The parameters for optimizer hook',
         })
@@ -92,23 +92,28 @@ class MultiModalEmbeddingArguments(TrainingArgs):
             'help': 'The data parallel world size',
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        config.merge_from_dict({'pretrained_model.model_name': self.model})
-        if self.clip_clamp:
-            config.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
-        if self.world_size > 1:
-            config.train.launcher = 'pytorch'
-        return config
+
+config, args = MultiModalEmbeddingArguments().parse_cli().to_config()
+print(config, args)
 
 
-args = MultiModalEmbeddingArguments.from_cli(task='multi-modal-embedding')
-print(args)
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.merge_from_dict({'pretrained_model.model_name': args.model})
+    if args.clip_clamp:
+        cfg.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
+    if args.world_size > 1:
+        cfg.train.launcher = 'pytorch'
+    return cfg
+
 
 train_dataset = MsDataset.load(
-    args.dataset_name, namespace='modelscope', split='train')
+    args.train_dataset_name, namespace='modelscope', split='train')
 eval_dataset = MsDataset.load(
-    args.dataset_name, namespace='modelscope', split='validation')
+    args.train_dataset_name, namespace='modelscope', split='validation')
 
 os.makedirs(args.work_dir, exist_ok=True)
 kwargs = dict(
@@ -116,6 +121,6 @@ kwargs = dict(
     train_dataset=train_dataset,
     eval_dataset=eval_dataset,
     work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 trainer = build_trainer(name=args.trainer, default_args=kwargs)
 trainer.train()
@@ -6,14 +6,16 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
     --trainer 'clip-multi-modal-embedding' \
     --work_dir './workspace/ckpts/clip' \
     --model 'damo/multi-modal_clip-vit-base-patch16_zh' \
-    --dataset_name 'muge' \
+    --train_dataset_name 'muge' \
     --dataset_column_map 'img=image,text=query' \
     --max_epochs 1 \
     --use_fp16 true \
     --per_device_train_batch_size 180 \
+    --train_data_worker 0 \
     --train_shuffle true \
     --train_drop_last true \
     --per_device_eval_batch_size 128 \
+    --eval_data_worker 0 \
     --eval_shuffle true \
     --eval_drop_last true \
     --save_ckpt_best true \
@@ -33,3 +35,4 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
     --optimizer_hook 'type=TorchAMPOptimizerHook,cumulative_iters=1,loss_keys=loss' \
     --clip_clamp true \
     --world_size $DATA_PARALLEL_SIZE \
+    --use_model_config true \
@@ -4,30 +4,32 @@ from modelscope.msdatasets import MsDataset
 from modelscope.trainers import EpochBasedTrainer, build_trainer
 from modelscope.trainers.training_args import TrainingArgs
 
-
-@dataclass
-class StableDiffusionArguments(TrainingArgs):
-
-    def __call__(self, config):
-        config = super().__call__(config)
-        config.train.lr_scheduler.T_max = self.max_epochs
-        config.model.inference = False
-        return config
-
-
-args = StableDiffusionArguments.from_cli(task='efficient-diffusion-tuning')
+training_args = TrainingArgs(task='efficient-diffusion-tuning').parse_cli()
+config, args = training_args.to_config()
+
 print(args)
 
-dataset = MsDataset.load(args.dataset_name, namespace=args.namespace)
+dataset = MsDataset.load(
+    args.train_dataset_name, namespace=args.train_dataset_namespace)
 train_dataset = dataset['train']
 validation_dataset = dataset['validation']
 
 
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.train.lr_scheduler.T_max = training_args.max_epochs
+    cfg.model.inference = False
+    return cfg
+
+
 kwargs = dict(
-    model=args.model,
-    work_dir=args.work_dir,
+    model=training_args.model,
+    work_dir=training_args.work_dir,
     train_dataset=train_dataset,
     eval_dataset=validation_dataset,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
 trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
 trainer.train()
@@ -1,11 +1,12 @@
 PYTHONPATH=. torchrun examples/pytorch/stable_diffusion/finetune_stable_diffusion.py \
     --model 'damo/multi-modal_efficient-diffusion-tuning-lora' \
     --work_dir './tmp/stable_diffusion_tuning' \
-    --namespace 'damo' \
-    --dataset_name 'buptwq/lora-stable-diffusion-finetune-dog' \
-    --max_epochs 150 \
+    --train_dataset_namespace 'damo' \
+    --train_dataset_name 'controlnet_dataset_condition_fill50k' \
+    --max_epochs 1 \
     --save_ckpt_strategy 'by_epoch' \
     --logging_interval 100 \
     --train.dataloader.workers_per_gpu 0 \
     --evaluation.dataloader.workers_per_gpu 0 \
-    --train.optimizer.lr 1e-4
+    --train.optimizer.lr 1e-5 \
+    --use_model_config true
@@ -1,26 +1,18 @@
 import os
 from dataclasses import dataclass, field
 
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.training_args import TrainingArgs
+from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
+                        build_dataset_from_file)
+from modelscope.trainers import build_trainer
 
 
-def get_labels(cfg, metadata):
-    label2id = cfg.safe_get(metadata['cfg_node'])
-    if label2id is not None:
-        return ','.join(label2id.keys())
-
-
-def set_labels(cfg, labels, metadata):
+def set_labels(labels):
     if isinstance(labels, str):
         labels = labels.split(',')
-    cfg.merge_from_dict(
-        {metadata['cfg_node']: {label: id
-                                for id, label in enumerate(labels)}})
+    return {label: id for id, label in enumerate(labels)}
 
 
-@dataclass
+@dataclass(init=False)
 class TextClassificationArguments(TrainingArgs):
 
     first_sequence: str = field(
@@ -49,7 +41,6 @@ class TextClassificationArguments(TrainingArgs):
         metadata={
             'help': 'The labels of the dataset',
             'cfg_node': 'preprocessor.label2id',
-            'cfg_getter': get_labels,
             'cfg_setter': set_labels,
         })
 
@@ -60,30 +51,39 @@ class TextClassificationArguments(TrainingArgs):
             'cfg_node': 'preprocessor.type'
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        config.model['num_labels'] = len(self.labels)
-        if config.train.lr_scheduler.type == 'LinearLR':
-            config.train.lr_scheduler['total_iters'] = \
-                int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
-        return config
-
-
-args = TextClassificationArguments.from_cli(
-    task='text-classification', eval_metrics='seq-cls-metric')
-
-print(args)
+
+config, args = TextClassificationArguments().parse_cli().to_config()
+
+print(config, args)
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.model['num_labels'] = len(cfg.preprocessor.label2id)
+    if cfg.train.lr_scheduler.type == 'LinearLR':
+        cfg.train.lr_scheduler['total_iters'] = \
+            int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
+    return cfg
 
-dataset = MsDataset.load(args.dataset_name, subset_name=args.subset_name)
-train_dataset = dataset['train']
-validation_dataset = dataset['validation']
+
+if args.dataset_json_file is None:
+    dataset = MsDataset.load(
+        args.train_dataset_name, subset_name=args.train_subset_name)
+    train_dataset = dataset['train']
+    validation_dataset = dataset['validation']
+else:
+    train_dataset, validation_dataset = build_dataset_from_file(
+        args.dataset_json_file)
 
 kwargs = dict(
     model=args.model,
     train_dataset=train_dataset,
     eval_dataset=validation_dataset,
     seed=args.seed,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
 os.environ['LOCAL_RANK'] = str(args.local_rank)
 trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
@@ -1,12 +1,16 @@
 PYTHONPATH=. python examples/pytorch/text_classification/finetune_text_classification.py \
+    --task 'text-classification' \
     --model 'damo/nlp_structbert_backbone_base_std' \
-    --dataset_name 'clue' \
-    --subset_name 'tnews' \
+    --train_dataset_name 'clue' \
+    --train_subset_name 'tnews' \
     --first_sequence 'sentence' \
     --preprocessor.label label \
     --model.num_labels 15 \
     --labels '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14' \
     --preprocessor 'sen-cls-tokenizer' \
+    --use_model_config True \
+    --max_epochs 1 \
     --train.dataloader.workers_per_gpu 0 \
     --evaluation.dataloader.workers_per_gpu 0 \
     --train.optimizer.lr 1e-5 \
+    --eval_metrics 'seq-cls-metric' \
@@ -1,12 +1,11 @@
 from dataclasses import dataclass, field
 
+from modelscope import EpochBasedTrainer, MsDataset, TrainingArgs
 from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.training_args import TrainingArgs
+from modelscope.trainers import build_trainer
 
 
-@dataclass
+@dataclass(init=False)
 class TextGenerationArguments(TrainingArgs):
 
     trainer: str = field(
@@ -67,30 +66,35 @@ class TextGenerationArguments(TrainingArgs):
             'help': 'Whether to use MegatronHook',
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        if config.train.lr_scheduler.type == 'noam':
-            config.train.lr_scheduler = {
-                'type': 'LambdaLR',
-                'lr_lambda': noam_lambda,
-                'options': {
-                    'by_epoch': False
-                }
-            }
-        if self.use_megatron:
-            config.train.hooks.append({'type': 'MegatronHook'})
-        return config
-
 
 def noam_lambda(current_step: int):
     current_step += 1
     return min(current_step**(-0.5), current_step * 100**(-1.5))
 
 
-args = TextGenerationArguments.from_cli(task='text-generation')
-print(args)
+config, args = TextGenerationArguments().parse_cli().to_config()
+print(config, args)
 
-dataset = MsDataset.load(args.dataset_name)
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    if cfg.train.lr_scheduler.type == 'noam':
+        cfg.train.lr_scheduler = {
+            'type': 'LambdaLR',
+            'lr_lambda': noam_lambda,
+            'options': {
+                'by_epoch': False
+            }
+        }
+    if args.use_megatron:
+        cfg.train.hooks.append({'type': 'MegatronHook'})
+    return cfg
+
+
+dataset = MsDataset.load(args.train_dataset_name)
 train_dataset = dataset['train']
 eval_dataset = dataset['validation' if 'validation' in dataset else 'test']
 
@@ -100,7 +104,7 @@ kwargs = dict(
     eval_dataset=eval_dataset,
     seed=args.seed,
     work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
 trainer: EpochBasedTrainer = build_trainer(
     name=args.trainer, default_args=kwargs)
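As defined above, `noam_lambda` rescales the learning rate by `min(step**-0.5, step * 100**-1.5)`; the two terms are equal at step 100, so the schedule grows linearly for the first 100 steps and then decays as the inverse square root. A quick check using nothing beyond the function shown above:

def noam_lambda(current_step: int):
    current_step += 1
    return min(current_step**(-0.5), current_step * 100**(-1.5))

# warmup region: the linear term is the smaller one
assert noam_lambda(49) == 50 * 100**(-1.5)
# decay region: after step 100 the inverse-square-root term takes over
assert noam_lambda(399) == 400**(-0.5)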
@@ -8,7 +8,7 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
     --trainer 'nlp-gpt3-trainer' \
     --work_dir './tmp' \
     --model 'damo/nlp_gpt3_text-generation_1.3B' \
-    --dataset_name 'chinese-poetry-collection' \
+    --train_dataset_name 'chinese-poetry-collection' \
     --preprocessor 'text-gen-jieba-tokenizer' \
     --src_txt 'text1' \
     --tgt_txt 'text2' \
@@ -20,4 +20,5 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
     --world_size $WORLD_SIZE \
     --tensor_model_parallel_size $TENSOR_MODEL_PARALLEL_SIZE \
     --use_megatron true \
-    # --dataset_name 'DuReader_robust-QG' \ # input&output
+    --use_model_config true \
+    # --train_dataset_name 'DuReader_robust-QG' \ # input&output
examples/pytorch/text_generation/run_train_mt5.sh (new file, 13 lines)
@@ -0,0 +1,13 @@
+PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.py \
+    --trainer 'text-generation-trainer' \
+    --work_dir './tmp' \
+    --task 'text2text-generation' \
+    --model 'damo/nlp_mt5_zero-shot-augment_chinese-base' \
+    --train_dataset_name 'DuReader_robust-QG' \
+    --src_txt 'text1' \
+    --tgt_txt 'text2' \
+    --max_epochs 1 \
+    --use_model_config True \
+    --per_device_train_batch_size 8 \
+    --lr 1e-3 \
+    --lr_scheduler 'noam' \
@@ -2,10 +2,11 @@ PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.
     --trainer 'text-generation-trainer' \
     --work_dir './tmp' \
     --model 'damo/nlp_palm2.0_pretrained_chinese-base' \
-    --dataset_name 'DuReader_robust-QG' \
+    --train_dataset_name 'DuReader_robust-QG' \
     --src_txt 'text1' \
     --tgt_txt 'text2' \
-    --max_epochs 15 \
+    --max_epochs 1 \
+    --use_model_config True \
     --per_device_train_batch_size 8 \
     --lr 1e-3 \
     --lr_scheduler 'noam' \
@@ -1,20 +1,22 @@
 from dataclasses import dataclass, field
 
-from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import build_trainer
-from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
-                                                set_flatten_value)
+from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
+                        build_dataset_from_file)
 
 
-@dataclass
+@dataclass(init=False)
 class TokenClassificationArguments(TrainingArgs):
 
     trainer: str = field(
-        default=Trainers.default, metadata={
+        default=None, metadata={
             'help': 'The trainer used',
         })
 
+    work_dir: str = field(
+        default='./tmp',
+        metadata={
+            'help': 'The working path for saving checkpoint',
+        })
+
     preprocessor: str = field(
         default=None,
         metadata={
@@ -29,60 +31,99 @@ class TokenClassificationArguments(TrainingArgs):
             'cfg_node': 'preprocessor.padding'
         })
 
-    train_dataset_params: str = field(
+    mode: str = field(
+        default='inference',
+        metadata={
+            'help': 'The preprocessor padding',
+            'cfg_node': 'preprocessor.mode'
+        })
+
+    first_sequence: str = field(
         default=None,
         metadata={
-            'cfg_node': 'dataset.train',
-            'cfg_getter': get_flatten_value,
-            'cfg_setter': set_flatten_value,
+            'cfg_node': 'preprocessor.first_sequence',
             'help': 'The parameters for train dataset',
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        if config.safe_get('dataset.train.label') == 'ner_tags':
-            ner_tags_labels = train_dataset['ner_tags'] + eval_dataset[
-                'ner_tags']
-            label_enumerate_values = self._get_label_list(ner_tags_labels)
-            config.merge_from_dict(
-                {'dataset.train.labels': label_enumerate_values})
-        if config.train.lr_scheduler.type == 'LinearLR':
-            config.train.lr_scheduler['total_iters'] = \
-                int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
-        return config
+    label: str = field(
+        default=None,
+        metadata={
+            'cfg_node': 'preprocessor.label',
+            'help': 'The parameters for train dataset',
+        })
 
-    # TODO: Future performance optimization in MsDataset
-    @staticmethod
-    def _get_label_list(labels):
-        unique_labels = set()
-        for label in labels:
-            unique_labels = unique_labels | set(label)
-        label_list = list(unique_labels)
-        label_list.sort()
-        return label_list
+    sequence_length: int = field(
+        default=128,
+        metadata={
+            'cfg_node': 'preprocessor.sequence_length',
+            'help': 'The parameters for train dataset',
+        })
 
 
-args = TokenClassificationArguments.from_cli(task='token-classification')
+training_args = TokenClassificationArguments().parse_cli()
+config, args = training_args.to_config()
 print(args)
 
-# load dataset
-train_dataset = MsDataset.load(
-    args.dataset_name,
-    subset_name=args.subset_name,
-    split='train',
-    namespace='damo')['train']
-eval_dataset = MsDataset.load(
-    args.dataset_name,
-    subset_name=args.subset_name,
-    split='validation',
-    namespace='damo')['validation']
+
+def get_label_list(labels):
+    unique_labels = set()
+    for label in labels:
+        unique_labels = unique_labels | set(label)
+    label_list = list(unique_labels)
+    label_list.sort()
+    return label_list
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    labels = train_dataset[training_args.label] + validation_dataset[
+        training_args.label]
+    label_enumerate_values = get_label_list(labels)
+    cfg.merge_from_dict({
+        'preprocessor.label2id':
+        {label: id
+         for id, label in enumerate(label_enumerate_values)}
+    })
+    cfg.merge_from_dict({'model.num_labels': len(label_enumerate_values)})
+    cfg.merge_from_dict({'preprocessor.use_fast': True})
+    cfg.merge_from_dict({
+        'evaluation.metrics': {
+            'type': 'token-cls-metric',
+            'label2id':
+            {label: id
+             for id, label in enumerate(label_enumerate_values)}
+        }
+    })
+    if cfg.train.lr_scheduler.type == 'LinearLR':
+        cfg.train.lr_scheduler['total_iters'] = \
+            int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
+    return cfg
+
+
+if args.dataset_json_file is None:
+    train_dataset = MsDataset.load(
+        args.train_dataset_name,
+        subset_name=args.train_subset_name,
+        split='train',
+        namespace=args.train_dataset_namespace)['train']
+    validation_dataset = MsDataset.load(
+        args.train_dataset_name,
+        subset_name=args.train_subset_name,
+        split='validation',
+        namespace=args.train_dataset_namespace)['validation']
+else:
+    train_dataset, validation_dataset = build_dataset_from_file(
+        args.dataset_json_file)
 
 kwargs = dict(
     model=args.model,
     train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
+    eval_dataset=validation_dataset,
     work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
-trainer = build_trainer(name=args.trainer, default_args=kwargs)
+trainer = EpochBasedTrainer(**kwargs)
 trainer.train()
@@ -1,15 +1,22 @@
-PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
+PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
+    --task 'token-classification' \
     --trainer 'nlp-base-trainer' \
     --work_dir './tmp' \
     --model 'damo/mgeo_backbone_chinese_base' \
-    --dataset_name 'GeoGLUE' \
-    --subset_name 'GeoETA' \
-    --train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
+    --train_dataset_name 'GeoGLUE' \
+    --train_subset_name 'GeoETA' \
+    --train_dataset_namespace 'damo' \
+    --first_sequence 'tokens' \
+    --eval_strategy by_step \
+    --eval_interval 10 \
+    --label 'ner_tags' \
+    --sequence_length 128 \
     --preprocessor 'token-cls-tokenizer' \
     --preprocessor_padding 'max_length' \
     --max_epochs 1 \
+    --mode 'inference' \
+    --use_model_config True \
     --per_device_train_batch_size 32 \
+    --train_data_worker 0 \
+    --eval_data_worker 0 \
     --lr 3e-5 \
-    --save_ckpt_strategy 'by_epoch' \
-    --logging_interval 100 \
-    --eval_strategy 'by_epoch' \
@@ -1,16 +1,22 @@
-PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
+PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
+    --task 'token-classification' \
     --trainer 'nlp-base-trainer' \
    --work_dir './tmp' \
     --model 'damo/nlp_structbert_backbone_base_std' \
-    --dataset_name 'GeoGLUE' \
-    --subset_name 'GeoETA' \
-    --train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
+    --train_dataset_name 'GeoGLUE' \
+    --train_subset_name 'GeoETA' \
+    --train_dataset_namespace 'damo' \
+    --first_sequence 'tokens' \
+    --eval_strategy by_step \
+    --eval_interval 20 \
+    --label 'ner_tags' \
+    --sequence_length 128 \
     --preprocessor 'token-cls-tokenizer' \
     --preprocessor_padding 'max_length' \
     --max_epochs 2 \
+    --mode 'inference' \
+    --use_model_config True \
     --per_device_train_batch_size 32 \
+    --train_data_worker 0 \
+    --eval_data_worker 0 \
     --lr 3e-5 \
-    --save_ckpt_strategy 'by_epoch' \
-    --logging_interval 1 \
-    --eval_strategy 'by_step' \
-    --eval_interval 20 \
@@ -1 +0,0 @@
-{"framework":"pytorch","train":{"work_dir":"/tmp","max_epochs":10,"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0},"optimizer":{"type":"SGD","lr":0.001},"lr_scheduler":{"type":"StepLR","step_size":2},"hooks":[{"type":"CheckpointHook","interval":1}]},"evaluation":{"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0,"shuffle":false}}}
@@ -5,11 +5,11 @@ from datasets import load_dataset
 from transformers import (BertForSequenceClassification, BertTokenizerFast,
                           default_data_collator)
 
+from modelscope import TrainingArgs
 from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.default_config import DEFAULT_CONFIG, TrainingArgs
 
 
-@dataclass
+@dataclass(init=False)
 class TransformersArguments(TrainingArgs):
 
     num_labels: int = field(
@@ -17,13 +17,27 @@ class TransformersArguments(TrainingArgs):
             'help': 'The number of labels',
         })
 
+    sentence: str = field(
+        default=None, metadata={
+            'help': 'The sentence key',
+        })
 
-args = TransformersArguments.from_cli(
-    task='text-classification', eval_metrics='seq-cls-metric')
+    label: str = field(
+        default=None, metadata={
+            'help': 'The label key',
+        })
 
-print(args)
 
-dataset = load_dataset(args.dataset_name, args.subset_name)
+training_args = TransformersArguments(
+    task='text-classification', eval_metrics='seq-cls-metric').parse_cli()
+config, args = training_args.to_config()
+
+print(config, args)
+
+train_dataset = load_dataset(
+    args.train_dataset_name, args.train_subset_name, split=args.train_split)
+val_dataset = load_dataset(
+    args.val_dataset_name, args.val_subset_name, split=args.val_split)
 
 model = BertForSequenceClassification.from_pretrained(
     args.model, num_labels=args.num_labels)
@@ -31,26 +45,30 @@ tokenizer = BertTokenizerFast.from_pretrained(args.model)
 
 
 def tokenize_sentence(row):
-    return tokenizer(row['sentence'], padding='max_length', max_length=128)
+    return tokenizer(
+        row[training_args.sentence], padding='max_length', max_length=128)
 
 
 # Extra columns, Rename columns
-dataset = dataset.map(tokenize_sentence).remove_columns(['sentence',
-                                                         'idx']).rename_column(
-                                                             'label', 'labels')
+train_dataset = train_dataset.map(tokenize_sentence)
+val_dataset = val_dataset.map(tokenize_sentence)
+if training_args.label != 'labels':
+    train_dataset = train_dataset.rename_columns(
+        {training_args.label: 'labels'})
+    val_dataset = val_dataset.rename_columns({training_args.label: 'labels'})
 
 cfg_file = os.path.join(args.work_dir or './', 'configuration.json')
-DEFAULT_CONFIG.dump(cfg_file)
+config.dump(cfg_file)
 
 kwargs = dict(
     model=model,
     cfg_file=cfg_file,
     # data_collator
     data_collator=default_data_collator,
-    train_dataset=dataset['train'],
-    eval_dataset=dataset['validation'],
-    seed=args.seed,
-    cfg_modify_fn=args)
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    remove_unused_data=True,
+    seed=args.seed)
 
 os.environ['LOCAL_RANK'] = str(args.local_rank)
 trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
@@ -1,5 +1,14 @@
 PYTHONPATH=. python examples/pytorch/transformers/finetune_transformers_model.py \
     --model bert-base-uncased \
     --num_labels 15 \
-    --dataset_name clue \
-    --subset_name tnews
+    --train_dataset_name clue \
+    --train_subset_name tnews \
+    --train_split train \
+    --val_dataset_name clue \
+    --val_subset_name tnews \
+    --train_split train \
+    --val_split validation \
+    --sentence sentence \
+    --label label \
+    --eval_strategy by_step \
+    --eval_interval 100
@@ -1,4 +1,79 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from .version import __release_datetime__, __version__
+from typing import TYPE_CHECKING
 
-__all__ = ['__version__', '__release_datetime__']
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .version import __release_datetime__, __version__
+    from .trainers import EpochBasedTrainer, TrainingArgs, build_dataset_from_file
+    from .trainers import Hook, Priority
+    from .exporters import Exporter
+    from .exporters import TfModelExporter
+    from .exporters import TorchModelExporter
+    from .hub.api import HubApi
+    from .hub.snapshot_download import snapshot_download
+    from .hub.push_to_hub import push_to_hub, push_to_hub_async
+    from .hub.check_model import check_model_is_id, check_local_model_is_latest
+    from .metrics import AudioNoiseMetric, Metric, task_default_metrics, ImageColorEnhanceMetric, ImageDenoiseMetric, \
+        ImageInstanceSegmentationCOCOMetric, ImagePortraitEnhancementMetric, SequenceClassificationMetric, \
+        TextGenerationMetric, TokenClassificationMetric, VideoSummarizationMetric, MovieSceneSegmentationMetric, \
+        AccuracyMetric, BleuMetric, ImageInpaintingMetric, ReferringVideoObjectSegmentationMetric, \
+        VideoFrameInterpolationMetric, VideoStabilizationMetric, VideoSuperResolutionMetric, PplMetric, \
+        ImageQualityAssessmentDegradationMetric, ImageQualityAssessmentMosMetric, TextRankingMetric, \
+        LossMetric, ImageColorizationMetric, OCRRecognitionMetric
+    from .models import Model, TorchModel
+    from .preprocessors import Preprocessor
+    from .pipelines import Pipeline, pipeline
+    from .utils.hub import read_config, create_model_if_not_exist
+    from .utils.logger import get_logger
+    from .msdatasets import MsDataset
+
+else:
+    _import_structure = {
+        'version': ['__release_datetime__', '__version__'],
+        'trainers': [
+            'EpochBasedTrainer', 'TrainingArgs', 'Hook', 'Priority',
+            'build_dataset_from_file'
+        ],
+        'exporters': [
+            'Exporter',
+            'TfModelExporter',
+            'TorchModelExporter',
+        ],
+        'hub.api': ['HubApi'],
+        'hub.snapshot_download': ['snapshot_download'],
+        'hub.push_to_hub': ['push_to_hub', 'push_to_hub_async'],
+        'hub.check_model':
+        ['check_model_is_id', 'check_local_model_is_latest'],
+        'metrics': [
+            'AudioNoiseMetric', 'Metric', 'task_default_metrics',
+            'ImageColorEnhanceMetric', 'ImageDenoiseMetric',
+            'ImageInstanceSegmentationCOCOMetric',
+            'ImagePortraitEnhancementMetric', 'SequenceClassificationMetric',
+            'TextGenerationMetric', 'TokenClassificationMetric',
+            'VideoSummarizationMetric', 'MovieSceneSegmentationMetric',
+            'AccuracyMetric', 'BleuMetric', 'ImageInpaintingMetric',
+            'ReferringVideoObjectSegmentationMetric',
+            'VideoFrameInterpolationMetric', 'VideoStabilizationMetric',
+            'VideoSuperResolutionMetric', 'PplMetric',
+            'ImageQualityAssessmentDegradationMetric',
+            'ImageQualityAssessmentMosMetric', 'TextRankingMetric',
+            'LossMetric', 'ImageColorizationMetric', 'OCRRecognitionMetric'
+        ],
+        'models': ['Model', 'TorchModel'],
+        'preprocessors': ['Preprocessor'],
+        'pipelines': ['Pipeline', 'pipeline'],
+        'utils.hub': ['read_config', 'create_model_if_not_exist'],
+        'utils.logger': ['get_logger'],
+        'msdatasets': ['MsDataset']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
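Under the lazy scheme above, the top-level names listed in `_import_structure` are resolved on first attribute access rather than eagerly at `import modelscope` time. A small illustration, assuming nothing beyond the mapping shown in the diff:

import modelscope

# Nothing heavy has been imported yet; attribute access triggers the real import.
print(modelscope.__version__)

# These names are also resolved lazily through LazyImportModule.
from modelscope import MsDataset, pipeline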
@@ -122,10 +122,11 @@ class ${pipeline_name}(Pipeline):
 # Tips: usr_config_path is the temporary save configuration location, after upload modelscope hub, it is the model_id
 usr_config_path = '${configuration_path}'
 config = Config({
-    'framework': 'pytorch',
-    'task': '${task_name}',
-    'model': {'type': 'my-custom-model'},
-    "pipeline": {"type": "my-custom-pipeline"}
+    "framework": 'pytorch',
+    "task": '${task_name}',
+    "model": {'type': 'my-custom-model'},
+    "pipeline": {"type": "my-custom-pipeline"},
+    "allow_remote": True
 })
 config.dump('${configuration_path}' + 'configuration.json')
 
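For reference, a minimal check that the configuration dumped by the template round-trips; the path is a placeholder standing in for `'${configuration_path}' + 'configuration.json'`:

from modelscope.utils.config import Config

cfg = Config.from_file('./configuration.json')  # path written by config.dump above
assert cfg.pipeline.type == 'my-custom-pipeline'
assert cfg.allow_remote is True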
@@ -1,14 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .human_wholebody_keypoint import HumanWholeBodyKeypoint
+    from .ans_dfsmn_exporter import ANSDFSMNExporter
 
 else:
     _import_structure = {
-        'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
+        'ans_dfsmn_exporter': ['ANSDFSMNExporter'],
     }
 
     import sys
modelscope/exporters/audio/ans_dfsmn_exporter.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+import torch
+
+from modelscope.exporters.builder import EXPORTERS
+from modelscope.exporters.torch_model_exporter import TorchModelExporter
+from modelscope.metainfo import Models
+from modelscope.utils.constant import ModelFile, Tasks
+
+INPUT_NAME = 'input'
+OUTPUT_NAME = 'output'
+
+
+@EXPORTERS.register_module(
+    Tasks.acoustic_noise_suppression, module_name=Models.speech_dfsmn_ans)
+class ANSDFSMNExporter(TorchModelExporter):
+
+    def export_onnx(self, output_dir: str, opset=9, **kwargs):
+        """Export the model as onnx format files.
+
+        Args:
+            output_dir: The output dir.
+            opset: The version of the ONNX operator set to use.
+            kwargs:
+                device: The device used to forward.
+        Returns:
+            A dict containing the model key - model file path pairs.
+        """
+        model = self.model if 'model' not in kwargs else kwargs.pop('model')
+        device_name = 'cpu' if 'device' not in kwargs else kwargs.pop('device')
+        model_bin_file = os.path.join(model.model_dir,
+                                      ModelFile.TORCH_MODEL_BIN_FILE)
+        if os.path.exists(model_bin_file):
+            checkpoint = torch.load(model_bin_file, map_location='cpu')
+            model.load_state_dict(checkpoint)
+        onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE)
+
+        with torch.no_grad():
+            model.eval()
+            device = torch.device(device_name)
+            model.to(device)
+            model_script = torch.jit.script(model)
+            fbank_input = torch.zeros((1, 3, 120), dtype=torch.float32)
+            torch.onnx.export(
+                model_script,
+                fbank_input,
+                onnx_file,
+                opset_version=opset,
+                input_names=[INPUT_NAME],
+                output_names=[OUTPUT_NAME],
+                dynamic_axes={
+                    INPUT_NAME: {
+                        0: 'batch_size',
+                        1: 'number_of_frame'
+                    },
+                    OUTPUT_NAME: {
+                        0: 'batch_size',
+                        1: 'number_of_frame'
+                    }
+                })
+        return {'model': onnx_file}
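A hedged usage sketch for the new exporter. The model id is illustrative, and the exporter is assumed to accept a loaded model at construction time the way the base `TorchModelExporter` does; only `export_onnx(output_dir, opset)` comes directly from the code above.

import os

from modelscope.models import Model
from modelscope.exporters.audio import ANSDFSMNExporter

# Illustrative DFSMN ANS checkpoint; any model directory containing a
# pytorch_model.bin should follow the same path.
model = Model.from_pretrained('damo/speech_dfsmn_ans_psm_48k_causal')

os.makedirs('./onnx_out', exist_ok=True)
exporter = ANSDFSMNExporter(model=model)  # construction signature assumed from the base exporter
output = exporter.export_onnx(output_dir='./onnx_out', opset=9)
print(output)  # a dict mapping 'model' to the written ONNX file path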
@@ -6,6 +6,7 @@ import functools
 import os
 import pickle
 import platform
+import re
 import shutil
 import tempfile
 import uuid
@@ -15,10 +16,10 @@ from http.cookiejar import CookieJar
 from os.path import expanduser
 from typing import Dict, List, Optional, Tuple, Union
 
+import requests
 from requests import Session
 from requests.adapters import HTTPAdapter, Retry
 
-from modelscope import __version__
 from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT,
                                       API_RESPONSE_FIELD_DATA,
                                       API_RESPONSE_FIELD_EMAIL,
@@ -45,7 +46,7 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        MASTER_MODEL_BRANCH, DatasetFormations,
                                        DatasetMetaFormats,
                                        DatasetVisibilityMap, DownloadChannel,
-                                       ModelFile)
+                                       ModelFile, VirgoDatasetConfig)
 from modelscope.utils.logger import get_logger
 from .utils.utils import (get_endpoint, get_release_datetime,
                           model_id_to_group_owner_name)
@@ -160,6 +161,7 @@ class HubApi:
             'Visibility': visibility,  # server check
             'License': license,
             'OriginalModelId': original_model_id,
+            'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', ''),
         }
         r = self.session.post(
             path, json=body, cookies=cookies, headers=self.headers)
@@ -236,8 +238,10 @@ class HubApi:
                    license: Optional[str] = Licenses.APACHE_V2,
                    chinese_name: Optional[str] = None,
                    commit_message: Optional[str] = 'upload model',
+                   tag: Optional[str] = None,
                    revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
-                   original_model_id: Optional[str] = None):
+                   original_model_id: Optional[str] = None,
+                   ignore_file_pattern: Optional[Union[List[str], str]] = None):
         """Upload model from a given directory to given repository. A valid model directory
         must contain a configuration.json file.
 
@@ -268,10 +272,13 @@ class HubApi:
                 chinese name of the new created model.
             commit_message(`str`, *optional*, defaults to `None`):
                 commit message of the push request.
+            tag(`str`, *optional*, defaults to `None`):
+                The tag on this commit
             revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION):
                 which branch to push. If the branch is not exists, It will create a new
                 branch and push to it.
             original_model_id (str, optional): The base model id which this model is trained from
+            ignore_file_pattern (`Union[List[str], str]`, optional): The file pattern to ignore uploading
 
         Raises:
             InvalidParameter: Parameter invalid.
@@ -292,6 +299,10 @@ class HubApi:
         if cookies is None:
             raise NotLoginException('Must login before upload!')
         files_to_save = os.listdir(model_dir)
+        if ignore_file_pattern is None:
+            ignore_file_pattern = []
+        if isinstance(ignore_file_pattern, str):
+            ignore_file_pattern = [ignore_file_pattern]
         try:
             self.get_model(model_id=model_id)
         except Exception:
@@ -325,6 +336,8 @@ class HubApi:
                 shutil.rmtree(src, ignore_errors=True)
             for f in files_to_save:
                 if f[0] != '.':
+                    if any([re.search(pattern, f) is not None for pattern in ignore_file_pattern]):
+                        continue
                     src = os.path.join(model_dir, f)
                     if os.path.isdir(src):
                         shutil.copytree(src, os.path.join(tmp_dir, f))
@@ -338,6 +351,8 @@ class HubApi:
                 commit_message=commit_message,
                 local_branch=revision,
                 remote_branch=revision)
+            if tag is not None:
+                repo.tag_and_push(tag, tag)
         except Exception:
             raise
         finally:
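A hedged sketch of how the two new `push_model` parameters documented above might be used together; the access token, model id and directory are placeholders:

from modelscope.hub.api import HubApi

api = HubApi()
api.login('<access-token>')  # placeholder token

api.push_model(
    model_id='my-namespace/my-finetuned-model',  # placeholder id
    model_dir='./work_dir/output',               # must contain configuration.json
    commit_message='upload model',
    tag='v1.6.0',                                # pushed via repo.tag_and_push after the commit
    ignore_file_pattern=[r'.*\.log$', r'checkpoints/'])  # regex patterns skipped during upload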
@@ -581,6 +596,17 @@ class HubApi:
             file_list = file_list['Files']
         return file_list

+    @staticmethod
+    def dump_datatype_file(dataset_type: int, meta_cache_dir: str):
+        """
+        Dump the data_type as a local file, in order to get the dataset formation without calling the datahub.
+        More details, please refer to the class `modelscope.utils.constant.DatasetFormations`.
+        """
+        dataset_type_file_path = os.path.join(meta_cache_dir,
+            f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
+        with open(dataset_type_file_path, 'w') as fp:
+            fp.write('*** Automatically-generated file, do not modify ***')
+
     def get_dataset_meta_files_local_paths(self, dataset_name: str,
                                            namespace: str,
                                            revision: str,
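A small sketch of calling the new static helper on its own, assuming a writable cache directory; the concrete marker file name depends on DatasetFormations.formation_mark_ext.value, and the dataset_type value below is only illustrative.

    import os
    from modelscope.hub.api import HubApi

    meta_cache_dir = '/tmp/ms_meta_cache'   # placeholder cache path
    os.makedirs(meta_cache_dir, exist_ok=True)
    # writes a marker file named '<dataset_type><formation_mark_ext>' into the cache dir
    HubApi.dump_datatype_file(dataset_type=1, meta_cache_dir=meta_cache_dir)
    print(os.listdir(meta_cache_dir))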
@@ -591,10 +617,7 @@ class HubApi:
         cookies = ModelScopeConfig.get_cookies()

         # Dump the data_type as a local file
-        dataset_type_file_path = os.path.join(meta_cache_dir,
-            f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
-        with open(dataset_type_file_path, 'w') as fp:
-            fp.write('*** Automatically-generated file, do not modify ***')
+        HubApi.dump_datatype_file(dataset_type=dataset_type, meta_cache_dir=meta_cache_dir)

         for file_info in file_list:
             file_path = file_info['Path']
@@ -661,7 +684,6 @@ class HubApi:
             cookies = self._check_cookie(use_cookies=True)
         else:
             cookies = ModelScopeConfig.get_cookies()
-        r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers)

         r = self.session.get(
             url=datahub_url, cookies=cookies, headers=self.headers)
@@ -669,6 +691,31 @@ class HubApi:
         raise_on_error(resp)
         return resp['Data']

+    def get_virgo_meta(self, dataset_id: str, version: int = 1) -> dict:
+        """
+        Get virgo dataset meta info.
+        """
+        virgo_endpoint = os.environ.get(VirgoDatasetConfig.env_virgo_endpoint, '')
+        if not virgo_endpoint:
+            raise RuntimeError(f'Virgo endpoint is not set in env: {VirgoDatasetConfig.env_virgo_endpoint}')
+
+        virgo_dataset_url = f'{virgo_endpoint}/data/set/download'
+        cookies = requests.utils.dict_from_cookiejar(ModelScopeConfig.get_cookies())
+
+        dataset_info = dict(
+            dataSetId=dataset_id,
+            dataSetVersion=version
+        )
+        data = dict(
+            data=dataset_info,
+        )
+        r = self.session.post(url=virgo_dataset_url, json=data, cookies=cookies, headers=self.headers, timeout=900)
+        resp = r.json()
+        if resp['code'] != 0:
+            raise RuntimeError(f'Failed to get virgo dataset: {resp}')
+
+        return resp['data']
+
     def get_dataset_access_config_for_unzipped(self,
                                                dataset_name: str,
                                                namespace: str,
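A hedged usage sketch for the new Virgo accessor; the endpoint environment variable is whatever VirgoDatasetConfig.env_virgo_endpoint names (the literal string below is a placeholder), and the dataset id and endpoint URL are illustrative only.

    import os
    from modelscope.hub.api import HubApi

    # placeholder variable name: look up VirgoDatasetConfig.env_virgo_endpoint for the real one
    os.environ['VIRGO_DATASET_ENDPOINT'] = 'https://virgo.example.com'
    api = HubApi()
    meta = api.get_virgo_meta(dataset_id='12345', version=1)  # placeholder dataset id
    print(meta)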
@@ -895,6 +942,7 @@ class ModelScopeConfig:
         if MODELSCOPE_CLOUD_USERNAME in os.environ:
             user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]

+        from modelscope import __version__
         ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
             __version__,
             platform.python_version(),
@@ -2,6 +2,7 @@

 from http import HTTPStatus

+import requests
 from requests.exceptions import HTTPError

 from modelscope.utils.logger import get_logger
@@ -57,13 +58,22 @@ def is_ok(rsp):
     return rsp['Code'] == HTTPStatus.OK and rsp['Success']


+def _decode_response_error(response: requests.Response):
+    if 'application/json' in response.headers.get('content-type', ''):
+        message = response.json()
+    else:
+        message = response.content.decode('utf-8')
+    return message
+
+
 def handle_http_post_error(response, url, request_body):
     try:
         response.raise_for_status()
     except HTTPError as error:
         logger.error('Request %s with body: %s exception' %
                      (url, request_body))
-        logger.error('Response details: %s' % response.content)
+        message = _decode_response_error(response)
+        logger.error('Response details: %s' % message)
         raise error


@@ -75,7 +85,8 @@ def handle_http_response(response, logger, cookies, model_id):
             logger.error(
                 f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
                 private. Please login first.')
-            logger.error('Response details: %s' % response.content)
+            message = _decode_response_error(response)
+            logger.error('Response details: %s' % message)
             raise error

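A minimal sketch of what the new error-decoding helper does with the two content types, assuming it lives in modelscope.hub.errors as the hunk above suggests; the response object is hand-built purely for illustration.

    import json
    import requests
    from modelscope.hub.errors import _decode_response_error

    resp = requests.Response()
    resp.headers['content-type'] = 'application/json'
    resp._content = json.dumps({'Code': 10010101, 'Message': 'token invalid'}).encode('utf-8')

    # returns the parsed dict for JSON bodies, and a plain utf-8 string otherwise
    print(_decode_response_error(resp))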
@@ -12,7 +12,6 @@ import requests
 from requests.adapters import Retry
 from tqdm import tqdm

-from modelscope import __version__
 from modelscope.hub.api import HubApi, ModelScopeConfig
 from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE,
                                       API_FILE_DOWNLOAD_RETRY_TIMES,
@@ -55,16 +55,10 @@ class GitCommandWrapper(metaclass=Singleton):
             response.check_returncode()
             return response
         except subprocess.CalledProcessError as error:
-            if response.returncode == 1:
-                logger.info('Nothing to commit.')
-                return response
-            else:
-                logger.error(
-                    'There are error run git command, you may need to login first.'
-                )
-                raise GitError('stdout: %s, stderr: %s' %
-                               (response.stdout.decode('utf8'),
-                                error.stderr.decode('utf8')))
+            logger.error('There are error run git command.')
+            raise GitError(
+                'stdout: %s, stderr: %s' %
+                (response.stdout.decode('utf8'), error.stderr.decode('utf8')))

     def config_auth_token(self, repo_dir, auth_token):
         url = self.get_repo_remote_url(repo_dir)
@@ -199,8 +193,11 @@ class GitCommandWrapper(metaclass=Singleton):
         else:
             return ['/'.join(line.split('/')[1:]) for line in info[1:]]

-    def pull(self, repo_dir: str):
-        cmds = ['-C', repo_dir, 'pull']
+    def pull(self,
+             repo_dir: str,
+             remote: str = 'origin',
+             branch: str = 'master'):
+        cmds = ['-C', repo_dir, 'pull', remote, branch]
         return self._run_git_command(*cmds)

     def push(self,
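A small sketch of the updated pull call, assuming a local clone already exists at the given path; remote and branch now default to 'origin' and 'master' but can be overridden, and the path and branch below are placeholders.

    from modelscope.hub.git import GitCommandWrapper

    git = GitCommandWrapper()
    # equivalent to: git -C /path/to/local_repo pull origin v1.6
    git.pull('/path/to/local_repo', remote='origin', branch='v1.6')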
@@ -4,8 +4,8 @@ import concurrent.futures
 import os

 from modelscope.hub.api import HubApi
-from modelscope.hub.constants import Licenses, ModelVisibility
-from modelscope.hub.errors import NotExistError
+from modelscope.hub.constants import ModelVisibility
+from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -18,7 +18,10 @@ def _api_push_to_hub(repo_name,
                      token,
                      private=True,
                      commit_message='',
-                     source_repo=''):
+                     tag=None,
+                     source_repo='',
+                     ignore_file_pattern=None,
+                     revision=DEFAULT_REPOSITORY_REVISION):
     try:
         api = HubApi()
         api.login(token)
@@ -29,7 +32,10 @@ def _api_push_to_hub(repo_name,
             if not private else ModelVisibility.PRIVATE,
             chinese_name=repo_name,
             commit_message=commit_message,
-            original_model_id=source_repo)
+            tag=tag,
+            original_model_id=source_repo,
+            ignore_file_pattern=ignore_file_pattern,
+            revision=revision)
         commit_message = commit_message or 'No commit message'
         logger.info(
             f'Successfully upload the model to {repo_name} with message: {commit_message}'
@@ -48,7 +54,10 @@ def push_to_hub(repo_name,
                 private=True,
                 retry=3,
                 commit_message='',
-                source_repo=''):
+                tag=None,
+                source_repo='',
+                ignore_file_pattern=None,
+                revision=DEFAULT_REPOSITORY_REVISION):
     """
     Args:
         repo_name: The repo name for the modelhub repo
@@ -57,13 +66,18 @@ def push_to_hub(repo_name,
         private: If is a private repo, default True
         retry: Retry times if something error in uploading, default 3
         commit_message: The commit message
+        tag: The tag of this commit
         source_repo: The source repo (model id) which this model comes from
+        ignore_file_pattern: The file pattern to be ignored in uploading.
+        revision: The branch to commit to
     Returns:
         The boolean value to represent whether the model is uploaded.
     """
     if token is None:
         token = os.environ.get('MODELSCOPE_API_TOKEN')
+    if ignore_file_pattern is None:
+        ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
+    assert repo_name is not None
     assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
     assert os.path.isdir(output_dir)
     assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
@@ -73,7 +87,8 @@ def push_to_hub(repo_name,
         f'Uploading {output_dir} to {repo_name} with message {commit_message}')
     for i in range(retry):
         if _api_push_to_hub(repo_name, output_dir, token, private,
-                            commit_message, source_repo):
+                            commit_message, tag, source_repo,
+                            ignore_file_pattern, revision):
             return True
     return False
@@ -83,7 +98,10 @@ def push_to_hub_async(repo_name,
                       token=None,
                       private=True,
                       commit_message='',
-                      source_repo=''):
+                      tag=None,
+                      source_repo='',
+                      ignore_file_pattern=None,
+                      revision=DEFAULT_REPOSITORY_REVISION):
     """
     Args:
         repo_name: The repo name for the modelhub repo
@@ -91,13 +109,18 @@ def push_to_hub_async(repo_name,
         token: The user api token, function will check the `MODELSCOPE_API_TOKEN` variable if this argument is None
         private: If is a private repo, default True
         commit_message: The commit message
+        tag: The tag of this commit
         source_repo: The source repo (model id) which this model comes from
+        ignore_file_pattern: The file pattern to be ignored in uploading
+        revision: The branch to commit to
     Returns:
         A handler to check the result and the status
     """
     if token is None:
         token = os.environ.get('MODELSCOPE_API_TOKEN')
+    if ignore_file_pattern is None:
+        ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
+    assert repo_name is not None
     assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
     assert os.path.isdir(output_dir)
     assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
@@ -106,4 +129,5 @@ def push_to_hub_async(repo_name,
     logger.info(
         f'Uploading {output_dir} to {repo_name} with message {commit_message}')
     return _executor.submit(_api_push_to_hub, repo_name, output_dir, token,
-                            private, commit_message, source_repo)
+                            private, commit_message, tag, source_repo,
+                            ignore_file_pattern, revision)
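A hedged end-to-end sketch of the extended helper above; the repo name, token and output directory are placeholders, and the UPLOAD_IGNORE_FILE_PATTERN environment variable is only consulted when ignore_file_pattern is not passed explicitly.

    from modelscope.hub.push_to_hub import push_to_hub

    ok = push_to_hub(
        repo_name='my-namespace/my-model',    # placeholder repo
        output_dir='./work_dir/output',       # must contain configuration.json or configuration.yaml
        token='YOUR_SDK_TOKEN',               # or export MODELSCOPE_API_TOKEN
        commit_message='release 1.6 weights',
        tag='v1.6.0',
        ignore_file_pattern=r'.*\.log',
        revision='master')
    print('uploaded:', ok)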
@@ -88,6 +88,26 @@ class Repository:
             remote = None
         return remote

+    def pull(self, remote: str = 'origin', branch: str = 'master'):
+        """Pull remote branch
+
+        Args:
+            remote (str, optional): The remote name. Defaults to 'origin'.
+            branch (str, optional): The remote branch. Defaults to 'master'.
+        """
+        self.git_wrapper.pull(self.model_dir, remote=remote, branch=branch)
+
+    def add_lfs_type(self, file_name_suffix: str):
+        """Add file suffix to lfs list.
+
+        Args:
+            file_name_suffix (str): The file name suffix.
+                examples '*.safetensors'
+        """
+        os.system(
+            "printf '%s filter=lfs diff=lfs merge=lfs -text\n'>>%s" %
+            (file_name_suffix, os.path.join(self.model_dir, '.gitattributes')))
+
     def push(self,
              commit_message: str,
              local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION,
@@ -120,7 +140,6 @@ class Repository:
                              self.model_repo_name)

         url = self.git_wrapper.get_repo_remote_url(self.model_dir)
-        self.git_wrapper.pull(self.model_dir)

         self.git_wrapper.add(self.model_dir, all_files=True)
         self.git_wrapper.commit(self.model_dir, commit_message)
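A usage sketch for the two new Repository helpers, assuming the repository is cloned into the given local directory; note that push() no longer pulls implicitly, so a pull has to be issued explicitly when needed. Paths and repo ids are placeholders.

    from modelscope.hub.repository import Repository

    repo = Repository(model_dir='./my_model_local', clone_from='my-namespace/my-model')
    repo.pull(remote='origin', branch='master')  # explicit pull; push() no longer does this for you
    repo.add_lfs_type('*.safetensors')           # appends an LFS rule to .gitattributes
    repo.push('add safetensors weights')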
@@ -116,15 +116,9 @@ class Models(object):
     bad_image_detecting = 'bad-image-detecting'
     controllable_image_generation = 'controllable-image-generation'
     longshortnet = 'longshortnet'
+    fastinst = 'fastinst'
     pedestrian_attribute_recognition = 'pedestrian-attribute-recognition'

-    # EasyCV models
-    yolox = 'YOLOX'
-    segformer = 'Segformer'
-    hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
-    image_object_detection_auto = 'image-object-detection-auto'
-    dino = 'DINO'
-
     # nlp models
     bert = 'bert'
     palm = 'palm-v2'
@@ -177,6 +171,7 @@ class Models(object):
     speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
     speech_dfsmn_ans = 'speech_dfsmn_ans'
     speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
+    speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
     speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield'
     speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k'
     kws_kwsbp = 'kws-kwsbp'
@@ -187,6 +182,9 @@ class Models(object):
     generic_sv = 'generic-sv'
     ecapa_tdnn_sv = 'ecapa-tdnn-sv'
     campplus_sv = 'cam++-sv'
+    eres2net_sv = 'eres2net-sv'
+    scl_sd = 'scl-sd'
+    rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv'
     generic_lm = 'generic-lm'

     # multi-modal models
@@ -205,6 +203,8 @@ class Models(object):
     hitea = 'hitea'
     soonet = 'soonet'
     efficient_diffusion_tuning = 'efficient-diffusion-tuning'
+    mplug_owl = 'mplug-owl'
+    clip_interrogator = 'clip-interrogator'

     # science models
     unifold = 'unifold'
@@ -255,6 +255,7 @@ class Pipelines(object):
         should use task name for this pipeline.
         For pipeline which suuport only one model, we should use ${Model}-${Task} as its name.
     """
+    pipeline_template = 'pipeline-template'
     # vision tasks
     portrait_matting = 'unet-image-matting'
     universal_matting = 'unet-universal-matting'
@@ -277,8 +278,6 @@ class Pipelines(object):
     tbs_detection = 'tbs-detection'
     object_detection = 'vit-object-detection'
     abnormal_object_detection = 'abnormal-object-detection'
-    easycv_detection = 'easycv-detection'
-    easycv_segmentation = 'easycv-segmentation'
     face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
     salient_detection = 'u2net-salient-detection'
     salient_boudary_detection = 'res2net-salient-detection'
@@ -347,7 +346,6 @@ class Pipelines(object):
     video_single_object_tracking_procontext = 'procontext-vitb-video-single-object-tracking'
     video_multi_object_tracking = 'video-multi-object-tracking'
     image_panoptic_segmentation = 'image-panoptic-segmentation'
-    image_panoptic_segmentation_easycv = 'image-panoptic-segmentation-easycv'
     video_summarization = 'googlenet_pgl_video_summarization'
     language_guided_video_summarization = 'clip-it-video-summarization'
     image_semantic_segmentation = 'image-semantic-segmentation'
@@ -402,7 +400,7 @@ class Pipelines(object):
     nerf_recon_acc = 'nerf-recon-acc'
     bad_image_detecting = 'bad-image-detecting'
     controllable_image_generation = 'controllable-image-generation'
+    fast_instance_segmentation = 'fast-instance-segmentation'
     image_quality_assessment_mos = 'image-quality-assessment-mos'
     image_quality_assessment_man = 'image-quality-assessment-man'
     image_quality_assessment_degradation = 'image-quality-assessment-degradation'
@@ -485,6 +483,9 @@ class Pipelines(object):
     speaker_diarization_inference = 'speaker-diarization-inference'
     vad_inference = 'vad-inference'
     speaker_verification = 'speaker-verification'
+    speaker_verification_rdino = 'speaker-verification-rdino'
+    speaker_verification_eres2net = 'speaker-verification-eres2net'
+    speaker_change_locating = 'speaker-change-locating'
     lm_inference = 'language-score-prediction'
     speech_timestamp_inference = 'speech-timestamp-inference'

@@ -514,6 +515,7 @@ class Pipelines(object):
     gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
     soonet_video_temporal_grounding = 'soonet-video-temporal-grounding'
     efficient_diffusion_tuning = 'efficient-diffusion-tuning'
+    multimodal_dialogue = 'multimodal-dialogue'

     # science tasks
     protein_structure = 'unifold-protein-structure'
@@ -881,6 +883,7 @@ class NLPTrainers(object):
     document_grounded_dialog_rerank_trainer = 'document-grounded-dialog-rerank-trainer'
     document_grounded_dialog_retrieval_trainer = 'document-grounded-dialog-retrieval-trainer'
     siamese_uie_trainer = 'siamese-uie-trainer'
+    translation_evaluation_trainer = 'translation-evaluation-trainer'


 class MultiModalTrainers(object):
@@ -911,7 +914,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
     """

     default = 'trainer'
-    easycv = 'easycv'
     tinynas_damoyolo = 'tinynas-damoyolo'

     @staticmethod
@@ -933,8 +935,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
             return Fields.multi_modal
         elif attribute_or_value == Trainers.default:
             return Trainers.default
-        elif attribute_or_value == Trainers.easycv:
-            return Trainers.easycv
         else:
             return 'unknown'

@@ -1034,6 +1034,8 @@ class Preprocessors(object):
     vldoc_preprocessor = 'vldoc-preprocessor'
     hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
     diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor'
+    mplug_owl_preprocessor = 'mplug-owl-preprocessor'
+    image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor'

     # science preprocessor
     unifold_preprocessor = 'unifold-preprocessor'
@@ -1098,6 +1100,8 @@ class Metrics(object):
     # metric for image-colorization task
     image_colorization_metric = 'image-colorization-metric'
     ocr_recognition_metric = 'ocr-recognition-metric'
+    # metric for translation evaluation
+    translation_evaluation_metric = 'translation-evaluation-metric'


 class Optimizers(object):
@@ -1165,14 +1169,6 @@ class LR_Schedulers(object):
 class CustomDatasets(object):
     """ Names for different datasets.
     """
-    ClsDataset = 'ClsDataset'
-    Face2dKeypointsDataset = 'FaceKeypointDataset'
-    HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
-    HumanWholeBodyKeypointDataset = 'WholeBodyCocoTopDownDataset'
-    SegDataset = 'SegDataset'
-    DetDataset = 'DetDataset'
-    DetImagesMixDataset = 'DetImagesMixDataset'
-    PanopticDataset = 'PanopticDataset'
     PairedDataset = 'PairedDataset'
     SiddDataset = 'SiddDataset'
     GoproDataset = 'GoproDataset'
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from .loss_metric import LossMetric
     from .image_colorization_metric import ImageColorizationMetric
     from .ocr_recognition_metric import OCRRecognitionMetric
+    from .translation_evaluation_metric import TranslationEvaluationMetric
 else:
     _import_structure = {
         'audio_noise_metric': ['AudioNoiseMetric'],
@@ -62,7 +63,8 @@ else:
         'text_ranking_metric': ['TextRankingMetric'],
         'loss_metric': ['LossMetric'],
         'image_colorization_metric': ['ImageColorizationMetric'],
-        'ocr_recognition_metric': ['OCRRecognitionMetric']
+        'ocr_recognition_metric': ['OCRRecognitionMetric'],
+        'translation_evaluation_metric': ['TranslationEvaluationMetric']
     }

     import sys
@@ -42,6 +42,7 @@ class MetricKeys(object):
     NDCG = 'ndcg'
     AR = 'AR'
     Colorfulness = 'colorfulness'
+    Kendall_Tau_Correlation = 'kendall_tau_correlation'


 task_default_metrics = {
@@ -76,6 +77,7 @@ task_default_metrics = {
     Tasks.bad_image_detecting: [Metrics.accuracy],
     Tasks.ocr_recognition: [Metrics.ocr_recognition_metric],
     Tasks.efficient_diffusion_tuning: [Metrics.loss_metric],
+    Tasks.translation_evaluation: [Metrics.translation_evaluation_metric]
 }
modelscope/metrics/translation_evaluation_metric.py (new file, 174 lines)
@@ -0,0 +1,174 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import importlib
from typing import Dict, List, Union

from pandas import DataFrame

from modelscope.metainfo import Metrics
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.models.nlp.unite.configuration import InputFormat
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import default_group

logger = get_logger()


@METRICS.register_module(
    group_key=default_group, module_name=Metrics.translation_evaluation_metric)
class TranslationEvaluationMetric(Metric):
    r"""The metric class for translation evaluation.

    """

    def __init__(self, gap_threshold: float = 25.0):
        r"""Build a translation evaluation metric, following the designed
        Kendall's tau correlation from WMT Metrics Shared Task competitions.

        Args:
            gap_threshold: The score gap denoting the available hypothesis pair.

        Returns:
            A metric for translation evaluation.
        """
        self.gap_threshold = gap_threshold

        self.lp = list()
        self.segment_id = list()
        self.raw_score = list()
        self.score = list()
        self.input_format = list()

    def clear(self) -> None:
        r"""Clear all the stored variables.
        """
        self.lp.clear()
        self.segment_id.clear()
        self.raw_score.clear()
        self.input_format.clear()

        self.score.clear()

        return

    def add(self, outputs: Dict[str, List[float]],
            inputs: Dict[str, List[Union[float, int]]]) -> None:
        r"""Collect the related results for processing.

        Args:
            outputs: Dict containing 'scores'
            inputs: Dict containing 'labels' and 'segment_ids'

        """

        self.lp += inputs['lp']
        self.segment_id += inputs['segment_id']
        self.raw_score += inputs['raw_score']
        self.input_format += inputs['input_format']

        self.score += outputs['score']

        return

    def evaluate(self) -> Dict[str, Dict[str, float]]:
        r"""Compute the Kendall's tau correlation.

        Returns:
            A dict denoting Kendall's tau correlation.

        """

        data = {
            'lp': self.lp,
            'segment_id': self.segment_id,
            'raw_score': self.raw_score,
            'input_format': self.input_format,
            'score': self.score
        }
        data = DataFrame(data=data)
        correlation = dict()

        for input_format in data.input_format.unique():
            logger.info('Evaluation results for %s input format'
                        % input_format.value)
            input_format_data = data[data.input_format == input_format]

            temp_correlation = dict()

            for lp in sorted(input_format_data.lp.unique()):
                sub_data = input_format_data[input_format_data.lp == lp]
                temp_correlation[input_format.value + '_'
                                 + lp] = self.compute_kendall_tau(sub_data)
                logger.info(
                    '\t%s: %f' %
                    (lp,
                     temp_correlation[input_format.value + '_' + lp] * 100))

            avg_correlation = sum(
                temp_correlation.values()) / len(temp_correlation)
            correlation[input_format.value + '_avg'] = avg_correlation
            logger.info('Average evaluation result for %s input format: %f' %
                        (input_format.value, avg_correlation))
            logger.info('')
            correlation.update(temp_correlation)

        return correlation

    def merge(self, other: 'TranslationEvaluationMetric') -> None:
        r"""Merge the predictions from other TranslationEvaluationMetric objects.

        Args:
            other: Another TranslationEvaluationMetric object.

        """

        self.lp += other.lp
        self.segment_id += other.segment_ids
        self.raw_score += other.raw_score
        self.input_format += other.input_format

        self.score += other.score

        return

    def compute_kendall_tau(self, csv_data: DataFrame) -> float:
        r"""Compute kendall's tau correlation.

        Args:
            csv_data: The pandas dataframe.

        Returns:
            float: THe kendall's Tau correlation.

        """
        concor = discor = 0

        for segment_id in sorted(csv_data.segment_id.unique()):
            group_csv_data = csv_data[csv_data.segment_id == segment_id]

            examples = group_csv_data.to_dict('records')

            for i in range(0, len(examples)):
                for j in range(i + 1, len(examples)):
                    if self.raw_score[i] - self.raw_score[
                            j] >= self.gap_threshold:
                        if self.score[i] > self.score[j]:
                            concor += 1
                        elif self.score[i] < self.score[j]:
                            discor += 1
                    elif self.raw_score[i] - self.raw_score[
                            j] <= -self.gap_threshold:
                        if self.score[i] < self.score[j]:
                            concor += 1
                        elif self.score[i] > self.score[j]:
                            discor += 1

        if concor + discor == 0:
            logger.warning(
                'We don\'t have available pairs when evaluation. '
                'Marking the kendall tau correlation as the lowest value (-1.0).'
            )
            return -1.0
        else:
            return (concor - discor) / (concor + discor)
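A tiny worked example of the pairwise rule the new metric implements: within one segment, a hypothesis pair only counts when the human (raw) scores differ by at least gap_threshold, and the pair is concordant when the model score agrees with that ordering. The numbers below are a sketch, not taken from the file; they give one concordant and one discordant pair, so tau = (1 - 1) / (1 + 1) = 0.0.

    # raw (human) scores and model scores for three hypotheses of one segment
    raw = [80.0, 50.0, 78.0]
    score = [0.9, 0.2, 0.1]
    gap = 25.0

    concor = discor = 0
    for i in range(len(raw)):
        for j in range(i + 1, len(raw)):
            if raw[i] - raw[j] >= gap:        # i is clearly better than j
                concor += score[i] > score[j]
                discor += score[i] < score[j]
            elif raw[i] - raw[j] <= -gap:     # j is clearly better than i
                concor += score[i] < score[j]
                discor += score[i] > score[j]

    print((concor - discor) / (concor + discor))  # 0.0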
@@ -39,7 +39,7 @@ class ConvSTFT(nn.Module):
         super(ConvSTFT, self).__init__()

         if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
         else:
             self.fft_len = fft_len

@@ -78,7 +78,7 @@ class ConviSTFT(nn.Module):
                  fix=True):
         super(ConviSTFT, self).__init__()
         if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
         else:
             self.fft_len = fft_len
         kernel, window = init_kernels(
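The replacement drops the deprecated np.int alias (removed in NumPy 1.24) while keeping the same rounding rule: when fft_len is not given, it becomes the next power of two at or above win_len. A quick check of that expression:

    import numpy as np

    for win_len in (320, 400, 512):
        fft_len = int(2**np.ceil(np.log2(win_len)))
        print(win_len, '->', fft_len)   # 320 -> 512, 400 -> 512, 512 -> 512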
@@ -45,27 +45,5 @@ class GenericAutomaticSpeechRecognition(Model):
     def forward(self) -> Dict[str, Any]:
         """preload model and return the info of the model
         """
-        if self.model_cfg['model_config']['type'] == Frameworks.tf:
-            from easyasr import asr_inference_paraformer_tf
-            if hasattr(asr_inference_paraformer_tf, 'preload'):
-                model_workspace = self.model_cfg['model_workspace']
-                model_path = os.path.join(model_workspace,
-                                          self.model_cfg['am_model'])
-                vocab_path = os.path.join(
-                    model_workspace,
-                    self.model_cfg['model_config']['vocab_file'])
-                sampled_ids = 'seq2seq/sampled_ids'
-                sampled_lengths = 'seq2seq/sampled_lengths'
-                if 'sampled_ids' in self.model_cfg['model_config']:
-                    sampled_ids = self.model_cfg['model_config']['sampled_ids']
-                if 'sampled_lengths' in self.model_cfg['model_config']:
-                    sampled_lengths = self.model_cfg['model_config'][
-                        'sampled_lengths']
-                asr_inference_paraformer_tf.preload(
-                    ngpu=1,
-                    asr_model_file=model_path,
-                    vocab_file=vocab_path,
-                    sampled_ids=sampled_ids,
-                    sampled_lengths=sampled_lengths)
-
         return self.model_cfg
modelscope/models/audio/kws/farfield/fsmn_sele_v3.py (new file, 233 lines)
@@ -0,0 +1,233 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch
import torch.nn as nn
import torch.nn.functional as F

from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear
from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32


class DFSMNUnit(nn.Module):
    """ one multi-channel deep fsmn unit
    Args:
        dimin: input dimension
        dimexpand: feature expansion dimension
        dimout: output dimension
        lorder: left ofder
        rorder: right order
    """

    def __init__(self,
                 dimin=64,
                 dimexpand=128,
                 dimout=64,
                 lorder=10,
                 rorder=1):
        super(DFSMNUnit, self).__init__()

        self.expand = AffineTransform(dimin, dimexpand)
        self.shrink = LinearTransform(dimexpand, dimout)
        self.fsmn = Fsmn(dimout, dimout, lorder, rorder, 1, 1)

        self.debug = False
        self.dataout = None

    def forward(self, input):
        """
        Args:
            input: [batch, time, feature]
        """
        out1 = F.relu(self.expand(input))
        out2 = self.shrink(out1)
        out3 = self.fsmn(out2)

        # add skip connection for matched data
        if input.shape[-1] == out3.shape[-1]:
            out3 = input + out3
        if self.debug:
            self.dataout = out3
        return out3

    def print_model(self):
        self.expand.printModel()
        self.shrink.printModel()
        self.fsmn.printModel()

    def to_kaldi_nnet(self):
        re_str = self.expand.toKaldiNNet()
        relu = RectifiedLinear(self.expand.linear.out_features,
                               self.expand.linear.out_features)
        re_str += relu.toKaldiNNet()
        re_str = self.shrink.toKaldiNNet()
        re_str += self.fsmn.toKaldiNNet()
        return re_str


class FSMNSeleNetV3(nn.Module):
    """ Deep FSMN model with channel selection performs multi-channel kws.
    Zhang, Shiliang, et al. "Deep-FSMN for large vocabulary continuous speech
    recognition." 2018 IEEE International Conference on Acoustics, Speech and
    Signal Processing (ICASSP). IEEE, 2018.

    Args:
        input_dim: input dimension
        linear_dim: fsmn input dimension
        proj_dim: fsmn projection dimension
        lorder: fsmn left order
        rorder: fsmn right order
        num_syn: output dimension
        fsmn_layers: no. of fsmn units
    """

    def __init__(self,
                 input_dim=120,
                 linear_dim=128,
                 proj_dim=64,
                 lorder=10,
                 rorder=1,
                 num_syn=5,
                 fsmn_layers=5):
        super(FSMNSeleNetV3, self).__init__()

        self.mem = []
        # the first unit, mapping input dim to proj dim
        unit = DFSMNUnit(input_dim, linear_dim, proj_dim, lorder, rorder)
        self.mem.append(unit)
        self.add_module('mem_{:d}'.format(0), unit)

        # deep fsmn layers with skip connection
        for i in range(1, fsmn_layers):
            unit = DFSMNUnit(proj_dim, linear_dim, proj_dim, lorder, rorder)
            self.mem.append(unit)
            self.add_module('mem_{:d}'.format(i), unit)

        self.expand2 = AffineTransform(proj_dim, linear_dim)
        self.decision = AffineTransform(linear_dim, num_syn)

    def forward(self, input):
        # multi-channel temp space, [batch, time, channel, feature]
        if torch.cuda.is_available():
            x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
                            self.expand2.linear.out_features).cuda()
        else:
            x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
                            self.expand2.linear.out_features)

        for n in range(input.shape[2]):
            chin = input[:, :, n, :]

            for unit in self.mem:
                chout = unit(chin)
                chin = chout

            x[:, :, n, :] = F.relu(self.expand2(chout))

        # perform max pooling
        pool = nn.MaxPool2d((x.shape[2], 1), stride=(x.shape[2], 1))
        y = pool(x)

        # remove channel dimension
        y = torch.squeeze(y, -2)
        z = self.decision(y)

        return z

    def print_model(self):
        for unit in self.mem:
            unit.print_model()

        self.expand2.printModel()
        self.decision.printModel()

    def print_header(self):
        """ get DFSMN params
        """
        input_dim = self.mem[0].expand.linear.in_features
        linear_dim = self.mem[0].expand.linear.out_features
        proj_dim = self.mem[0].shrink.linear.out_features
        lorder = self.mem[0].fsmn.conv_left.kernel_size[0]
        rorder = 0
        if self.mem[0].fsmn.conv_right is not None:
            rorder = self.mem[0].fsmn.conv_right.kernel_size[0]

        num_syn = self.decision.linear.out_features
        fsmn_layers = len(self.mem)

        # no. of output channels, 0.0 means the same as numins
        numouts = 1.0

        #
        # write total header
        #
        header = [0.0] * HEADER_BLOCK_SIZE * 5
        # numins
        header[0] = 0.0
        # numouts
        header[1] = numouts
        # dimins
        header[2] = input_dim
        # dimouts
        header[3] = num_syn
        # numlayers
        header[4] = 4

        #
        # write each layer's header
        #
        hidx = 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_DFSMN.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
        header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim
        header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
        header[HEADER_BLOCK_SIZE * hidx + 4] = proj_dim
        header[HEADER_BLOCK_SIZE * hidx + 5] = lorder
        header[HEADER_BLOCK_SIZE * hidx + 6] = rorder
        header[HEADER_BLOCK_SIZE * hidx + 7] = fsmn_layers
        hidx += 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_DENSE.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
        header[HEADER_BLOCK_SIZE * hidx + 2] = proj_dim
        header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
        header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
        header[HEADER_BLOCK_SIZE * hidx + 5] = float(
            ActivationType.ACTIVATION_RELU.value)
        hidx += 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_MAX_POOLING.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
        header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
        hidx += 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_DENSE.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = numouts
        header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
        header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn
        header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
        header[HEADER_BLOCK_SIZE * hidx + 5] = float(
            ActivationType.ACTIVATION_SOFTMAX.value)

        for h in header:
            print(f32ToI32(h))

    def to_kaldi_nnet(self):
        re_str = '<Nnet>\n'
        for unit in self.mem:
            re_str += unit.to_kaldi_nnet()
        re_str = self.expand2.toKaldiNNet()
        relu = RectifiedLinear(self.expand2.linear.out_features,
                               self.expand2.linear.out_features)
        re_str += relu.toKaldiNNet()
        re_str += self.decision.toKaldiNNet()
        re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features,
                                         self.decision.linear.out_features)
        re_str += '<!EndOfComponent>\n'
        re_str += '</Nnet>\n'

        return re_str
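A shape-level sketch of the new multi-channel network, assuming the sibling fsmn.py and model_def.py modules in the same package provide the imported layers; input is [batch, time, channel, feature] and the model emits one score vector per frame.

    import torch
    from modelscope.models.audio.kws.farfield.fsmn_sele_v3 import FSMNSeleNetV3

    model = FSMNSeleNetV3(input_dim=120, num_syn=5, fsmn_layers=5)
    feats = torch.randn(2, 100, 3, 120)   # batch=2, 100 frames, 3 channels, 120-dim features
    if torch.cuda.is_available():          # forward allocates its buffer on CUDA when available
        model, feats = model.cuda(), feats.cuda()
    with torch.no_grad():
        logits = model(feats)
    print(logits.shape)                    # expected: torch.Size([2, 100, 5])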
@@ -11,6 +11,7 @@ from modelscope.models.builder import MODELS
 from modelscope.utils.audio.audio_utils import update_conf
 from modelscope.utils.constant import Tasks
 from .fsmn_sele_v2 import FSMNSeleNetV2
+from .fsmn_sele_v3 import FSMNSeleNetV3


 @MODELS.register_module(
@@ -18,6 +19,7 @@ from .fsmn_sele_v2 import FSMNSeleNetV2
 class FSMNSeleNetV2Decorator(TorchModel):
     r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """

+    MODEL_CLASS = FSMNSeleNetV2
     MODEL_TXT = 'model.txt'
     SC_CONFIG = 'sound_connect.conf'

@@ -33,7 +35,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
         """
         super().__init__(model_dir, *args, **kwargs)
         if training:
-            self.model = FSMNSeleNetV2(*args, **kwargs)
+            self.model = self.MODEL_CLASS(*args, **kwargs)
         else:
             sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
             model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
@@ -42,7 +44,7 @@ class FSMNSeleNetV2Decorator(TorchModel):

             self._sc = None
             if os.path.exists(model_txt_file):
-                conf_dict = dict(mode=56542, kws_model=model_txt_file)
+                conf_dict = dict(kws_model=model_txt_file)
                 update_conf(sc_config_file, new_config_file, conf_dict)
                 import py_sound_connect
                 self._sc = py_sound_connect.SoundConnect(new_config_file)
@@ -50,8 +52,8 @@ class FSMNSeleNetV2Decorator(TorchModel):
                 self.size_out = self._sc.bytesPerBlockOut()
             else:
                 raise Exception(
-                    f'Invalid model directory! Failed to load model file: {model_txt_file}.'
-                )
+                    f'Invalid model directory! Failed to load model file:'
+                    f' {model_txt_file}.')

     def __del__(self):
         if hasattr(self, 'tmp_dir'):
@@ -73,3 +75,24 @@ class FSMNSeleNetV2Decorator(TorchModel):
             'confidence': self._sc.kwsConfidence()
         }
         return result
+
+
+@MODELS.register_module(
+    Tasks.keyword_spotting,
+    module_name=Models.speech_dfsmn_kws_char_farfield_iot)
+class FSMNSeleNetV3Decorator(FSMNSeleNetV2Decorator):
+    r""" A decorator of FSMNSeleNetV3 for integrating into modelscope framework """
+
+    MODEL_CLASS = FSMNSeleNetV3
+
+    def __init__(self,
+                 model_dir: str,
+                 training: Optional[bool] = False,
+                 *args,
+                 **kwargs):
+        """initialize the dfsmn model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, training, *args, **kwargs)
@@ -76,11 +76,13 @@ class CAMPPlus(nn.Module):
                  bn_size=4,
                  init_channels=128,
                  config_str='batchnorm-relu',
-                 memory_efficient=True):
+                 memory_efficient=True,
+                 output_level='segment'):
         super(CAMPPlus, self).__init__()

         self.head = FCM(feat_dim=feat_dim)
         channels = self.head.out_channels
+        self.output_level = output_level

         self.xvector = nn.Sequential(
             OrderedDict([
@@ -118,10 +120,14 @@ class CAMPPlus(nn.Module):
         self.xvector.add_module('out_nonlinear',
                                 get_nonlinear(config_str, channels))

-        self.xvector.add_module('stats', StatsPool())
-        self.xvector.add_module(
-            'dense',
-            DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
+        if self.output_level == 'segment':
+            self.xvector.add_module('stats', StatsPool())
+            self.xvector.add_module(
+                'dense',
+                DenseLayer(
+                    channels * 2, embedding_size, config_str='batchnorm_'))
+        else:
+            assert self.output_level == 'frame', '`output_level` should be set to \'segment\' or \'frame\'. '

         for m in self.modules():
             if isinstance(m, (nn.Conv1d, nn.Linear)):
@@ -133,6 +139,8 @@ class CAMPPlus(nn.Module):
         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
         x = self.head(x)
         x = self.xvector(x)
+        if self.output_level == 'frame':
+            x = x.transpose(1, 2)
         return x

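A brief sketch of the new output_level switch, assuming CAMPPlus is importable from its speaker-verification module (the path below is an assumption) and that the remaining constructor arguments keep their defaults: 'segment' keeps the pooled utterance embedding, while 'frame' returns per-frame features.

    import torch
    from modelscope.models.audio.sv.DTDNN import CAMPPlus   # assumed module path

    feats = torch.randn(2, 200, 80)   # batch=2, 200 frames, 80-dim fbank
    segment_model = CAMPPlus(feat_dim=80, embedding_size=192, output_level='segment')
    frame_model = CAMPPlus(feat_dim=80, embedding_size=192, output_level='frame')
    print(segment_model(feats).shape)  # one embedding per utterance, e.g. [2, 192]
    print(frame_model(feats).shape)    # per-frame output, e.g. [2, T', channels]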
344  modelscope/models/audio/sv/ERes2Net.py  Normal file
@@ -0,0 +1,344 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve performance. The local feature
fusion (LFF) fuses the features within one single residual block to extract the local signal.
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate the global signal.
"""
import math
import os
from typing import Any, Dict, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi

import modelscope.models.audio.sv.pooling_layers as pooling_layers
from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.models.audio.sv.fusion import AFF
from modelscope.utils.constant import Tasks


class ReLU(nn.Hardtanh):

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)

    def __repr__(self):
        inplace_str = 'inplace' if self.inplace else ''
        return self.__class__.__name__ + ' (' \
            + inplace_str + ')'


def conv1x1(in_planes, out_planes, stride=1):
    '1x1 convolution without padding'
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        padding=0,
        bias=False)


def conv3x3(in_planes, out_planes, stride=1):
    '3x3 convolution with padding'
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False)


class BasicBlockRes2Net(nn.Module):
    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockRes2Net, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        bns = []
        for i in range(self.nums):
            convs.append(conv3x3(width, width))
            bns.append(nn.BatchNorm2d(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    self.expansion * planes,
                    kernel_size=1,
                    stride=stride,
                    bias=False), nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class BasicBlockRes2Net_diff_AFF(nn.Module):
    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockRes2Net_diff_AFF, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        fuse_models = []
        bns = []
        for i in range(self.nums):
            convs.append(conv3x3(width, width))
            bns.append(nn.BatchNorm2d(width))
        for j in range(self.nums - 1):
            fuse_models.append(AFF(channels=width))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.fuse_models = nn.ModuleList(fuse_models)
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    self.expansion * planes,
                    kernel_size=1,
                    stride=stride,
                    bias=False), nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = self.fuse_models[i - 1](sp, spx[i])

            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class ERes2Net(nn.Module):

    def __init__(self,
                 block=BasicBlockRes2Net,
                 block_fuse=BasicBlockRes2Net_diff_AFF,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=32,
                 feat_dim=80,
                 embed_dim=192,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(ERes2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embed_dim = embed_dim
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer

        self.conv1 = nn.Conv2d(
            1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
        self.layer1 = self._make_layer(
            block, m_channels, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(
            block, m_channels * 2, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(
            block_fuse, m_channels * 4, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(
            block_fuse, m_channels * 8, num_blocks[3], stride=2)

        # downsampling
        self.layer1_downsample = nn.Conv2d(
            m_channels * 2,
            m_channels * 4,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False)
        self.layer2_downsample = nn.Conv2d(
            m_channels * 4,
            m_channels * 8,
            kernel_size=3,
            padding=1,
            stride=2,
            bias=False)
        self.layer3_downsample = nn.Conv2d(
            m_channels * 8,
            m_channels * 16,
            kernel_size=3,
            padding=1,
            stride=2,
            bias=False)

        # bottom-up fusion
        self.fuse_mode12 = AFF(channels=m_channels * 4)
        self.fuse_mode123 = AFF(channels=m_channels * 8)
        self.fuse_mode1234 = AFF(channels=m_channels * 16)

        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
                               embed_dim)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False)
            self.seg_2 = nn.Linear(embed_dim, embed_dim)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.permute(0, 2, 1)

        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)

        # bottom-up fusion
        out2 = self.layer2(out1)
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)

        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)

        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
        stats = self.pool(fuse_out1234)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a


@MODELS.register_module(
    Tasks.speaker_verification, module_name=Models.eres2net_sv)
class SpeakerVerificationERes2Net(TorchModel):
    r"""Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
    of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthens the local information
    interaction. GFF fuses multi-scale feature maps in a bottom-up pathway to obtain global information.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    """

    def __init__(self, model_dir, model_config: Dict[str, Any], *args,
                 **kwargs):
        super().__init__(model_dir, model_config, *args, **kwargs)
        self.model_config = model_config
        self.other_config = kwargs
        self.feature_dim = 80

        self.embedding_model = ERes2Net()

        pretrained_model_name = kwargs['pretrained_model']
        self.__load_check_point(pretrained_model_name)

        self.embedding_model.eval()

    def forward(self, audio):
        assert len(audio.shape) == 2 and audio.shape[
            0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
        # audio shape: [1, T]
        feature = self.__extract_feature(audio)
        embedding = self.embedding_model(feature)

        return embedding

    def __extract_feature(self, audio):
        feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
        feature = feature - feature.mean(dim=0, keepdim=True)
        feature = feature.unsqueeze(0)
        return feature

    def __load_check_point(self, pretrained_model_name, device=None):
        if not device:
            device = torch.device('cpu')
        self.embedding_model.load_state_dict(
            torch.load(
                os.path.join(self.model_dir, pretrained_model_name),
                map_location=device),
            strict=True)
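For orientation, the ERes2Net backbone above fuses each stage's output with a downsampled copy of the previous fused map (the AFF modules) before statistics pooling. A rough extraction sketch built only from the classes in this file, with a placeholder waveform; feat_dim=80, embed_dim=192 and TSTP pooling are the constructor defaults shown above:

# sketch only: raw ERes2Net forward pass on fbank features
import torch
import torchaudio.compliance.kaldi as Kaldi

from modelscope.models.audio.sv.ERes2Net import ERes2Net

wav = torch.randn(1, 16000)                      # placeholder 1-second mono waveform, shape [1, T]
feat = Kaldi.fbank(wav, num_mel_bins=80)         # (num_frames, 80), as in __extract_feature above
feat = (feat - feat.mean(dim=0, keepdim=True)).unsqueeze(0)
model = ERes2Net(feat_dim=80, embed_dim=192).eval()
with torch.no_grad():
    emb = model(feat)                            # speaker embedding
print(emb.shape)                                 # (1, 192)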
32  modelscope/models/audio/sv/fusion.py  Normal file
@@ -0,0 +1,32 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn


class AFF(nn.Module):

    def __init__(self, channels=64, r=4):
        super(AFF, self).__init__()
        inter_channels = int(channels // r)

        self.local_att = nn.Sequential(
            nn.Conv2d(
                channels * 2,
                inter_channels,
                kernel_size=1,
                stride=1,
                padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.SiLU(inplace=True),
            nn.Conv2d(
                inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

    def forward(self, x, ds_y):
        xa = torch.cat((x, ds_y), dim=1)
        x_att = self.local_att(xa)
        x_att = 1.0 + torch.tanh(x_att)
        xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)

        return xo
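AFF concatenates the two feature maps, squeezes them through a channels // r bottleneck, and turns the result into a gate x_att = 1 + tanh(...) lying in (0, 2); the fusion x * x_att + ds_y * (2 - x_att) is therefore a complementary soft weighting whose two coefficients always sum to 2. A small shape check using the module above:

# sketch only: attentional feature fusion of two same-shaped maps
import torch
from modelscope.models.audio.sv.fusion import AFF

aff = AFF(channels=64, r=4).eval()
x = torch.randn(2, 64, 20, 50)     # (B, C, F, T) map from the current stage
y = torch.randn(2, 64, 20, 50)     # downsampled map from the earlier stage
with torch.no_grad():
    out = aff(x, y)
print(out.shape)                   # same shape as the inputs: (2, 64, 20, 50)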
107  modelscope/models/audio/sv/pooling_layers.py  Normal file
@@ -0,0 +1,107 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker.
"""
import torch
import torch.nn as nn


class TAP(nn.Module):
    """
    Temporal average pooling, only first-order mean is considered
    """

    def __init__(self, **kwargs):
        super(TAP, self).__init__()

    def forward(self, x):
        pooling_mean = x.mean(dim=-1)
        # To be compatible with 2D input
        pooling_mean = pooling_mean.flatten(start_dim=1)
        return pooling_mean


class TSDP(nn.Module):
    """
    Temporal standard deviation pooling, only second-order std is considered
    """

    def __init__(self, **kwargs):
        super(TSDP, self).__init__()

    def forward(self, x):
        # The last dimension is the temporal axis
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
        pooling_std = pooling_std.flatten(start_dim=1)
        return pooling_std


class TSTP(nn.Module):
    """
    Temporal statistics pooling, concatenate mean and std, which is used in
    x-vector
    Comment: simple concatenation can not make full use of both statistics
    """

    def __init__(self, **kwargs):
        super(TSTP, self).__init__()

    def forward(self, x):
        # The last dimension is the temporal axis
        pooling_mean = x.mean(dim=-1)
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
        pooling_mean = pooling_mean.flatten(start_dim=1)
        pooling_std = pooling_std.flatten(start_dim=1)

        stats = torch.cat((pooling_mean, pooling_std), 1)
        return stats


class ASTP(nn.Module):
    """ Attentive statistics pooling: Channel- and context-dependent
        statistics pooling, first used in ECAPA_TDNN.
    """

    def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
        super(ASTP, self).__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear, then we don't
        # need to transpose inputs.
        if global_context_att:
            self.linear1 = nn.Conv1d(
                in_dim * 3, bottleneck_dim,
                kernel_size=1)  # equals W and b in the paper
        else:
            self.linear1 = nn.Conv1d(
                in_dim, bottleneck_dim,
                kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(
            bottleneck_dim, in_dim,
            kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        """
        x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
            or a 4-dimensional tensor in resnet architecture (B,C,F,T)
            0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
        """
        if len(x.shape) == 4:
            x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
        assert len(x.shape) == 3

        if self.global_context_att:
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_std = torch.sqrt(
                torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            x_in = torch.cat((x, context_mean, context_std), dim=1)
        else:
            x_in = x

        # DON'T use ReLU here! ReLU may be hard to converge.
        alpha = torch.tanh(
            self.linear1(x_in))  # alpha = F.relu(self.linear1(x_in))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        var = torch.sum(alpha * (x**2), dim=2) - mean**2
        std = torch.sqrt(var.clamp(min=1e-10))
        return torch.cat([mean, std], dim=1)
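TSTP, the pooling ERes2Net selects by default, collapses the time axis into a concatenated mean and standard deviation, so a (B, C, F, T) map becomes a (B, 2*C*F) statistics vector; TAP keeps only the mean. A quick shape check against the classes above:

# sketch only: temporal statistics pooling doubles the flattened channel dimension
import torch
from modelscope.models.audio.sv.pooling_layers import TAP, TSTP

x = torch.randn(4, 128, 10, 25)    # (B, C, F, T)
print(TAP()(x).shape)              # (4, 1280)  -> mean only
print(TSTP()(x).shape)             # (4, 2560)  -> mean and std concatenated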
573  modelscope/models/audio/sv/rdino.py  Normal file
@@ -0,0 +1,573 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
RDINOHead implementation is adapted from the DINO framework.
"""
import math
import os
from typing import Any, Dict, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi

from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.utils.constant import Tasks


def length_to_mask(length, max_len=None, dtype=None, device=None):
    assert len(length.shape) == 1

    if max_len is None:
        max_len = length.max().long().item()
    mask = torch.arange(
        max_len, device=length.device, dtype=length.dtype).expand(
            len(length), max_len) < length.unsqueeze(1)

    if dtype is None:
        dtype = length.dtype

    if device is None:
        device = length.device

    mask = torch.as_tensor(mask, dtype=dtype, device=device)
    return mask


def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
    if stride > 1:
        n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
        L_out = stride * (n_steps - 1) + kernel_size * dilation
        padding = [kernel_size // 2, kernel_size // 2]

    else:
        L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1

        padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
    return padding


class Conv1d(nn.Module):

    def __init__(
        self,
        out_channels,
        kernel_size,
        in_channels,
        stride=1,
        dilation=1,
        padding='same',
        groups=1,
        bias=True,
        padding_mode='reflect',
    ):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.padding_mode = padding_mode

        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            self.kernel_size,
            stride=self.stride,
            dilation=self.dilation,
            padding=0,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        if self.padding == 'same':
            x = self._manage_padding(x, self.kernel_size, self.dilation,
                                     self.stride)

        elif self.padding == 'causal':
            num_pad = (self.kernel_size - 1) * self.dilation
            x = F.pad(x, (num_pad, 0))

        elif self.padding == 'valid':
            pass

        else:
            raise ValueError(
                "Padding must be 'same', 'valid' or 'causal'. Got "
                + self.padding)

        wx = self.conv(x)

        return wx

    def _manage_padding(
        self,
        x,
        kernel_size: int,
        dilation: int,
        stride: int,
    ):
        L_in = x.shape[-1]
        padding = get_padding_elem(L_in, stride, kernel_size, dilation)
        x = F.pad(x, padding, mode=self.padding_mode)

        return x


class BatchNorm1d(nn.Module):

    def __init__(
        self,
        input_size,
        eps=1e-05,
        momentum=0.1,
    ):
        super().__init__()
        self.norm = nn.BatchNorm1d(
            input_size,
            eps=eps,
            momentum=momentum,
        )

    def forward(self, x):
        return self.norm(x)


class TDNNBlock(nn.Module):

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        dilation,
        activation=nn.ReLU,
        groups=1,
    ):
        super(TDNNBlock, self).__init__()
        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation,
            groups=groups,
        )
        self.activation = activation()
        self.norm = BatchNorm1d(input_size=out_channels)

    def forward(self, x):
        return self.norm(self.activation(self.conv(x)))


class Res2NetBlock(torch.nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 scale=8,
                 kernel_size=3,
                 dilation=1):
        super(Res2NetBlock, self).__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0

        in_channel = in_channels // scale
        hidden_channel = out_channels // scale

        self.blocks = nn.ModuleList([
            TDNNBlock(
                in_channel,
                hidden_channel,
                kernel_size=kernel_size,
                dilation=dilation,
            ) for i in range(scale - 1)
        ])
        self.scale = scale

    def forward(self, x):
        y = []
        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
            if i == 0:
                y_i = x_i
            elif i == 1:
                y_i = self.blocks[i - 1](x_i)
            else:
                y_i = self.blocks[i - 1](x_i + y_i)
            y.append(y_i)
        y = torch.cat(y, dim=1)
        return y


class SEBlock(nn.Module):

    def __init__(self, in_channels, se_channels, out_channels):
        super(SEBlock, self).__init__()

        self.conv1 = Conv1d(
            in_channels=in_channels, out_channels=se_channels, kernel_size=1)
        self.relu = torch.nn.ReLU(inplace=True)
        self.conv2 = Conv1d(
            in_channels=se_channels, out_channels=out_channels, kernel_size=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, lengths=None):
        L = x.shape[-1]
        if lengths is not None:
            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
            mask = mask.unsqueeze(1)
            total = mask.sum(dim=2, keepdim=True)
            s = (x * mask).sum(dim=2, keepdim=True) / total
        else:
            s = x.mean(dim=2, keepdim=True)

        s = self.relu(self.conv1(s))
        s = self.sigmoid(self.conv2(s))

        return s * x


class AttentiveStatisticsPooling(nn.Module):

    def __init__(self, channels, attention_channels=128, global_context=True):
        super().__init__()

        self.eps = 1e-12
        self.global_context = global_context
        if global_context:
            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
        else:
            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
        self.tanh = nn.Tanh()
        self.conv = Conv1d(
            in_channels=attention_channels,
            out_channels=channels,
            kernel_size=1)

    def forward(self, x, lengths=None):
        L = x.shape[-1]

        def _compute_statistics(x, m, dim=2, eps=self.eps):
            mean = (m * x).sum(dim)
            std = torch.sqrt(
                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
            return mean, std

        if lengths is None:
            lengths = torch.ones(x.shape[0], device=x.device)

        # Make binary mask of shape [N, 1, L]
        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
        mask = mask.unsqueeze(1)

        # Expand the temporal context of the pooling layer by allowing the
        # self-attention to look at global properties of the utterance.
        if self.global_context:
            # torch.std is unstable for backward computation
            # https://github.com/pytorch/pytorch/issues/4320
            total = mask.sum(dim=2, keepdim=True).float()
            mean, std = _compute_statistics(x, mask / total)
            mean = mean.unsqueeze(2).repeat(1, 1, L)
            std = std.unsqueeze(2).repeat(1, 1, L)
            attn = torch.cat([x, mean, std], dim=1)
        else:
            attn = x

        # Apply layers
        attn = self.conv(self.tanh(self.tdnn(attn)))

        # Filter out zero-paddings
        attn = attn.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(attn, dim=2)
        mean, std = _compute_statistics(x, attn)
        # Append mean and std of the batch
        pooled_stats = torch.cat((mean, std), dim=1)
        pooled_stats = pooled_stats.unsqueeze(2)

        return pooled_stats


class SERes2NetBlock(nn.Module):

    def __init__(
        self,
        in_channels,
        out_channels,
        res2net_scale=8,
        se_channels=128,
        kernel_size=1,
        dilation=1,
        activation=torch.nn.ReLU,
        groups=1,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.tdnn1 = TDNNBlock(
            in_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            groups=groups,
        )
        self.res2net_block = Res2NetBlock(out_channels, out_channels,
                                          res2net_scale, kernel_size, dilation)
        self.tdnn2 = TDNNBlock(
            out_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            groups=groups,
        )
        self.se_block = SEBlock(out_channels, se_channels, out_channels)

        self.shortcut = None
        if in_channels != out_channels:
            self.shortcut = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x, lengths=None):
        residual = x
        if self.shortcut:
            residual = self.shortcut(x)

        x = self.tdnn1(x)
        x = self.res2net_block(x)
        x = self.tdnn2(x)
        x = self.se_block(x, lengths)

        return x + residual


class ECAPA_TDNN(nn.Module):
    """An implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    """

    def __init__(
        self,
        input_size,
        device='cpu',
        lin_neurons=512,
        activation=torch.nn.ReLU,
        channels=[512, 512, 512, 512, 1536],
        kernel_sizes=[5, 3, 3, 3, 1],
        dilations=[1, 2, 3, 4, 1],
        attention_channels=128,
        res2net_scale=8,
        se_channels=128,
        global_context=True,
        groups=[1, 1, 1, 1, 1],
    ):

        super().__init__()
        assert len(channels) == len(kernel_sizes)
        assert len(channels) == len(dilations)
        self.channels = channels
        self.blocks = nn.ModuleList()

        # The initial TDNN layer
        self.blocks.append(
            TDNNBlock(
                input_size,
                channels[0],
                kernel_sizes[0],
                dilations[0],
                activation,
                groups[0],
            ))

        # SE-Res2Net layers
        for i in range(1, len(channels) - 1):
            self.blocks.append(
                SERes2NetBlock(
                    channels[i - 1],
                    channels[i],
                    res2net_scale=res2net_scale,
                    se_channels=se_channels,
                    kernel_size=kernel_sizes[i],
                    dilation=dilations[i],
                    activation=activation,
                    groups=groups[i],
                ))

        # Multi-layer feature aggregation
        self.mfa = TDNNBlock(
            channels[-1],
            channels[-1],
            kernel_sizes[-1],
            dilations[-1],
            activation,
            groups=groups[-1],
        )

        # Attentive Statistical Pooling
        self.asp = AttentiveStatisticsPooling(
            channels[-1],
            attention_channels=attention_channels,
            global_context=global_context,
        )
        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)

        # Final linear transformation
        self.fc = Conv1d(
            in_channels=channels[-1] * 2,
            out_channels=lin_neurons,
            kernel_size=1,
        )

    def forward(self, x, lengths=None):
        """Returns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        """
        x = x.transpose(1, 2)

        xl = []
        for layer in self.blocks:
            try:
                x = layer(x, lengths=lengths)
            except TypeError:
                x = layer(x)
            xl.append(x)

        # Multi-layer feature aggregation
        x = torch.cat(xl[1:], dim=1)
        x = self.mfa(x)

        # Attentive Statistical Pooling
        x = self.asp(x, lengths=lengths)
        x = self.asp_bn(x)

        # Final linear transformation
        x = self.fc(x)

        x = x.transpose(1, 2).squeeze(1)
        return x


class RDINOHead(nn.Module):

    def __init__(self,
                 in_dim,
                 out_dim,
                 use_bn=False,
                 norm_last_layer=True,
                 nlayers=3,
                 hidden_dim=2048,
                 bottleneck_dim=256,
                 add_dim=8192):
        super().__init__()
        nlayers = max(nlayers, 1)
        if nlayers == 1:
            self.mlp = nn.Linear(in_dim, bottleneck_dim)
        else:
            layers = [nn.Linear(in_dim, hidden_dim)]
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.GELU())
            for _ in range(nlayers - 2):
                layers.append(nn.Linear(hidden_dim, hidden_dim))
                if use_bn:
                    layers.append(nn.BatchNorm1d(hidden_dim))
                layers.append(nn.GELU())

            layers.append(nn.Linear(hidden_dim, add_dim))
            self.mlp = nn.Sequential(*layers)
        self.add_layer = nn.Linear(add_dim, bottleneck_dim)
        self.apply(self._init_weights)
        self.last_layer = nn.utils.weight_norm(
            nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)
        if norm_last_layer:
            self.last_layer.weight_g.requires_grad = False

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.trunc_normal_(m.weight, std=.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        vicr_out = self.mlp(x)
        x = self.add_layer(vicr_out)
        x = nn.functional.normalize(x, dim=-1, p=2)
        x = self.last_layer(x)
        return vicr_out, x


class Combine(nn.Module):

    def __init__(self, backbone, head):
        super(Combine, self).__init__()
        self.backbone = backbone
        self.head = head

    def forward(self, x):
        x = self.backbone(x)
        output = self.head(x)
        return output


@MODELS.register_module(
    Tasks.speaker_verification, module_name=Models.rdino_tdnn_sv)
class SpeakerVerification_RDINO(TorchModel):

    def __init__(self, model_dir, model_config: Dict[str, Any], *args,
                 **kwargs):
        super().__init__(model_dir, model_config, *args, **kwargs)
        self.model_config = model_config
        self.other_config = kwargs
        if self.model_config['channel'] != 1024:
            raise ValueError(
                'modelscope error: Currently only 1024-channel ecapa tdnn is supported.'
            )

        self.feature_dim = 80
        channels_config = [1024, 1024, 1024, 1024, 3072]

        self.embedding_model = ECAPA_TDNN(
            self.feature_dim, channels=channels_config)
        self.embedding_model = Combine(self.embedding_model,
                                       RDINOHead(512, 65536, True))

        pretrained_model_name = kwargs['pretrained_model']
        self.__load_check_point(pretrained_model_name)

        self.embedding_model.eval()

    def forward(self, audio):
        assert len(audio.shape) == 2 and audio.shape[
            0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
        # audio shape: [1, T]
        feature = self.__extract_feature(audio)
        embedding = self.embedding_model.backbone(feature)

        return embedding

    def __extract_feature(self, audio):
        feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
        feature = feature - feature.mean(dim=0, keepdim=True)
        feature = feature.unsqueeze(0)
        return feature

    def __load_check_point(self, pretrained_model_name, device=None):
        if not device:
            device = torch.device('cpu')
        state_dict = torch.load(
            os.path.join(self.model_dir, pretrained_model_name),
            map_location=device)
        state_dict_tea = {
            k.replace('module.', ''): v
            for k, v in state_dict['teacher'].items()
        }
        self.embedding_model.load_state_dict(state_dict_tea, strict=True)
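Note that SpeakerVerification_RDINO only runs the ECAPA-TDNN backbone at inference time (self.embedding_model.backbone); the RDINOHead is kept so that the self-supervised teacher checkpoint loads with strict=True. A backbone-only sketch using the classes above, mirroring the 1024-channel channels_config; the input tensor is a placeholder:

# sketch only: ECAPA-TDNN backbone as used by the RDINO speaker-verification model
import torch
from modelscope.models.audio.sv.rdino import ECAPA_TDNN

backbone = ECAPA_TDNN(80, channels=[1024, 1024, 1024, 1024, 3072]).eval()
feat = torch.randn(1, 300, 80)     # (batch, time, mel) fbank features
with torch.no_grad():
    emb = backbone(feat)           # lin_neurons defaults to 512 above
print(emb.shape)                   # (1, 512)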
319  modelscope/models/audio/sv/speaker_change_locator.py  Normal file
@@ -0,0 +1,319 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from collections import OrderedDict
from typing import Any, Dict, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi

from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.models.audio.sv.DTDNN import CAMPPlus
from modelscope.utils.constant import Tasks


class MultiHeadSelfAttention(nn.Module):

    def __init__(self, n_units, h=8, dropout=0.1):
        super(MultiHeadSelfAttention, self).__init__()
        self.linearQ = nn.Linear(n_units, n_units)
        self.linearK = nn.Linear(n_units, n_units)
        self.linearV = nn.Linear(n_units, n_units)
        self.linearO = nn.Linear(n_units, n_units)
        self.d_k = n_units // h
        self.h = h
        self.dropout = nn.Dropout(p=dropout)
        self.att = None

    def forward(self, x, batch_size):
        # x: (BT, F)
        q = self.linearQ(x).reshape(batch_size, -1, self.h, self.d_k)
        k = self.linearK(x).reshape(batch_size, -1, self.h, self.d_k)
        v = self.linearV(x).reshape(batch_size, -1, self.h, self.d_k)
        scores = torch.matmul(q.transpose(1, 2), k.permute(
            0, 2, 3, 1)) / np.sqrt(self.d_k)
        # scores: (B, h, T, T)
        self.att = F.softmax(scores, dim=3)
        p_att = self.dropout(self.att)
        # v : (B, T, h, d_k)
        # p_att : (B, h, T, T)
        x = torch.matmul(p_att, v.transpose(1, 2))
        # x : (B, h, T, d_k)
        x = x.transpose(1, 2).reshape(-1, self.h * self.d_k)
        return self.linearO(x)


class PositionwiseFeedForward(nn.Module):

    def __init__(self, n_units, d_units, dropout):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(n_units, d_units)
        self.linear2 = nn.Linear(d_units, n_units)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))


class PosEncoding(nn.Module):

    def __init__(self, max_seq_len, d_word_vec):
        super(PosEncoding, self).__init__()
        pos_enc = np.array([[
            pos / np.power(10000, 2.0 * (j // 2) / d_word_vec)
            for j in range(d_word_vec)
        ] for pos in range(max_seq_len)])
        pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2])
        pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2])
        pad_row = np.zeros([1, d_word_vec])
        pos_enc = np.concatenate([pad_row, pos_enc]).astype(np.float32)

        self.pos_enc = torch.nn.Embedding(max_seq_len + 1, d_word_vec)
        self.pos_enc.weight = torch.nn.Parameter(
            torch.from_numpy(pos_enc), requires_grad=False)

    def forward(self, input_len):
        max_len = torch.max(input_len)
        input_pos = torch.LongTensor([
            list(range(1, len + 1)) + [0] * (max_len - len)
            for len in input_len
        ])

        return self.pos_enc(input_pos)


class TransformerEncoder(nn.Module):

    def __init__(self,
                 idim,
                 n_units=256,
                 n_layers=2,
                 e_units=512,
                 h=4,
                 dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.linear_in = nn.Linear(idim, n_units)
        self.lnorm_in = nn.LayerNorm(n_units)

        self.n_layers = n_layers
        self.dropout = nn.Dropout(p=dropout)
        for i in range(n_layers):
            setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('self_att_', i),
                    MultiHeadSelfAttention(n_units, h))
            setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('ff_', i),
                    PositionwiseFeedForward(n_units, e_units, dropout))
        self.lnorm_out = nn.LayerNorm(n_units)

    def forward(self, x):
        # x: [B, num_anchors, T, n_in]
        bs, num, tframe, dim = x.size()
        x = x.reshape(bs * num, tframe, -1)  # [B*num_anchors, T, dim]
        # x: (B, T, F) ... batch, time, (mel)freq
        B_size, T_size, _ = x.shape
        # e: (BT, F)
        e = self.linear_in(x.reshape(B_size * T_size, -1))
        # Encoder stack
        for i in range(self.n_layers):
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
            # self-attention
            s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
            # residual
            e = e + self.dropout(s)
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
            # positionwise feed-forward
            s = getattr(self, '{}{:d}'.format('ff_', i))(e)
            # residual
            e = e + self.dropout(s)
        # final layer normalization
        # output: (BT, F)
        # output: (B, F, T)
        output = self.lnorm_out(e).reshape(B_size, T_size, -1)
        output = output.reshape(bs, num, tframe,
                                -1)  # [B, num_anchors, T, dim]
        return output


class TransformerEncoder_out(nn.Module):

    def __init__(self,
                 idim,
                 n_units=256,
                 n_layers=2,
                 e_units=512,
                 h=4,
                 dropout=0.1):
        super(TransformerEncoder_out, self).__init__()
        self.linear_in = nn.Linear(idim, n_units)
        self.lnorm_in = nn.LayerNorm(n_units)

        self.n_layers = n_layers
        self.dropout = nn.Dropout(p=dropout)
        for i in range(n_layers):
            setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('self_att_', i),
                    MultiHeadSelfAttention(n_units, h))
            setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('ff_', i),
                    PositionwiseFeedForward(n_units, e_units, dropout))
        self.lnorm_out = nn.LayerNorm(n_units)

    def forward(self, x):
        # x: (B, T, F)
        B_size, T_size, _ = x.shape
        # e: (BT, F)
        e = self.linear_in(x.reshape(B_size * T_size, -1))
        # Encoder stack
        for i in range(self.n_layers):
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
            # self-attention
            s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
            # residual
            e = e + self.dropout(s)
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
            # positionwise feed-forward
            s = getattr(self, '{}{:d}'.format('ff_', i))(e)
            # residual
            e = e + self.dropout(s)
        # final layer normalization
        # output: (BT, F)
        # output: (B, T, F)
        output = self.lnorm_out(e).reshape(B_size, T_size, -1)
        return output


class OutLayer(nn.Module):

    def __init__(self, n_units=256, num_anchors=2):
        super(OutLayer, self).__init__()
        self.combine = TransformerEncoder_out(num_anchors * n_units, n_units)
        self.out_linear = nn.Linear(n_units // num_anchors, 1)

    def forward(self, input):
        # input: [B, num_anchors, T, dim]
        bs, num, tframe, dim = input.size()
        output = input.permute(0, 2, 1,
                               3).reshape(bs, tframe,
                                          -1)  # [Bs, t, num_anchors*dim]
        output = self.combine(output)  # [Bs, t, n_units]
        output = output.reshape(
            bs, tframe, num, -1)  # [Bs, t, num_anchors, n_units//num_anchors]
        output = self.out_linear(output).squeeze(-1)  # [Bs, t, num_anchors]

        return output


class TransformerDetector(nn.Module):

    def __init__(self,
                 frame_dim=512,
                 anchor_dim=192,
                 hidden_dim=256,
                 max_seq_len=1000):
        super(TransformerDetector, self).__init__()
        self.detection = TransformerEncoder(
            idim=frame_dim + anchor_dim, n_units=hidden_dim)
        self.output = OutLayer(n_units=hidden_dim)
        self.pos_enc = PosEncoding(max_seq_len, hidden_dim)

    def forward(self, feats, anchors):
        # feats: [1, t, fdim]
        num_frames = feats.shape[1]
        num_anchors = anchors.shape[1]
        bs = feats.shape[0]
        feats = feats.unsqueeze(1).repeat(
            1, num_anchors, 1, 1)  # shape: [Bs, num_anchors, t, fdim]
        anchors = anchors.unsqueeze(2).repeat(
            1, 1, num_frames, 1)  # shape: [Bs, num_anchors, t, xdim]
        sd_in = torch.cat((feats, anchors),
                          dim=-1)  # shape: [Bs, num_anchors, t, fdim+xdim]
        sd_out = self.detection(sd_in)  # shape: [Bs, num_anchors, t, sd_dim]

        # pos
        pos_emb = self.pos_enc(torch.tensor([num_frames] * (bs * num_anchors)))
        pos_emb = pos_emb.reshape(bs, num_anchors, num_frames, -1)
        sd_out += pos_emb

        # output
        output = self.output(sd_out)  # shape: [Bs, t, num_anchors]

        return output


@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.scl_sd)
class SpeakerChangeLocatorTransformer(TorchModel):
    r"""A speaker change locator using the transformer architecture as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    """

    def __init__(self, model_dir, model_config: Dict[str, Any], *args,
                 **kwargs):
        super().__init__(model_dir, model_config, *args, **kwargs)
        self.model_config = model_config

        self.feature_dim = self.model_config['fbank_dim']
        frame_size = self.model_config['frame_size']
        anchor_size = self.model_config['anchor_size']

        self.encoder = CAMPPlus(self.feature_dim, output_level='frame')
        self.backend = TransformerDetector(
            frame_dim=frame_size, anchor_dim=anchor_size)

        pretrained_encoder = kwargs['pretrained_encoder']
        pretrained_backend = kwargs['pretrained_backend']

        self.__load_check_point(pretrained_encoder, pretrained_backend)

        self.encoder.eval()
        self.backend.eval()

    def forward(self, audio, anchors):
        assert len(audio.shape) == 2 and audio.shape[
            0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
        assert len(
            anchors.shape
        ) == 3 and anchors.shape[0] == 1 and anchors.shape[
            1] == 2, 'modelscope error: the shape of input anchors to model needs to be [1, 2, D]'
        # audio shape: [1, T]
        feature = self.__extract_feature(audio)
        frame_state = self.encoder(feature)
        output = self.backend(frame_state, anchors)
        output = output.squeeze(0).detach().cpu().sigmoid()

        time_scale_factor = int(np.ceil(feature.shape[1] / output.shape[0]))
        output = output.unsqueeze(1).expand(-1, time_scale_factor,
                                            -1).reshape(-1, output.shape[-1])
        return output

    def __extract_feature(self, audio):
        feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
        feature = feature - feature.mean(dim=0, keepdim=True)
        feature = feature.unsqueeze(0)
        return feature

    def __load_check_point(self,
                           pretrained_encoder,
                           pretrained_backend,
                           device=None):
        if not device:
            device = torch.device('cpu')
        self.encoder.load_state_dict(
            torch.load(
                os.path.join(self.model_dir, pretrained_encoder),
                map_location=device))

        self.backend.load_state_dict(
            torch.load(
                os.path.join(self.model_dir, pretrained_backend),
                map_location=device))
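The locator pairs a frame-level CAMPPlus encoder (output_level='frame', as added earlier in this change) with a transformer backend that scores, for every frame, each of the two anchor speaker embeddings; forward then applies a sigmoid and upsamples the scores back to the feature frame rate. A shape-level sketch of the backend alone, since the full model expects pretrained checkpoints in model_dir; all tensors are placeholders:

# sketch only: transformer change-detection backend on dummy frame states and anchors
import torch
from modelscope.models.audio.sv.speaker_change_locator import TransformerDetector

backend = TransformerDetector(frame_dim=512, anchor_dim=192).eval()
frame_state = torch.randn(1, 120, 512)   # [1, t, frame_dim] frame-level encoder output
anchors = torch.randn(1, 2, 192)         # [1, 2, anchor_dim] embeddings of the two candidate speakers
with torch.no_grad():
    scores = backend(frame_state, anchors)
print(scores.shape)                      # (1, 120, 2): one score per frame and anchor speaker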
@@ -17,11 +17,9 @@ from kantts.train.trainer import GAN_Trainer, Sambert_Trainer, distributed_init
from kantts.utils.ling_unit.ling_unit import KanTtsLinguisticUnit
from torch.utils.data import DataLoader

from modelscope import __version__
from modelscope.utils.audio.audio_utils import TtsCustomParams
from modelscope.utils.audio.tts_exceptions import (
    TtsModelConfigurationException, TtsModelNotExistsException)
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()
@@ -394,6 +392,7 @@ class Voice:
        logger.info(f'TRAINING steps: {train_max_steps}')
        config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
        from modelscope import __version__
        config['modelscope_version'] = __version__

        with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
@@ -558,6 +557,7 @@ class Voice:
        logger.info(f'resume from: {resume_from}')
        config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
        from modelscope import __version__
        config['modelscope_version'] = __version__

        with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
@@ -4,9 +4,8 @@
from . import (action_recognition, animal_recognition, bad_image_detecting,
               body_2d_keypoints, body_3d_keypoints, cartoon,
               cmdssl_video_embedding, controllable_image_generation,
               crowd_counting, face_2d_keypoints, face_detection,
               crowd_counting, face_detection, face_generation,
               face_generation, face_reconstruction, human_reconstruction,
               face_reconstruction, human_reconstruction, image_classification,
               human_wholebody_keypoint, image_classification,
               image_color_enhance, image_colorization, image_defrcn_fewshot,
               image_denoise, image_inpainting, image_instance_segmentation,
               image_matching, image_mvs_depth_estimation,
@@ -72,7 +72,7 @@ class PoseHighResolutionNetV2(TorchModel):
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True)
        """final four layers"""
        last_inp_channels = np.int(np.sum(pre_stage_channels))
        last_inp_channels = int(np.sum(pre_stage_channels))
        self.final_layer = nn.Sequential(
            nn.Conv2d(
                in_channels=last_inp_channels,
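This hunk and the similar CV hunks below are mechanical fixes for NumPy deprecations: the scalar aliases np.int, np.float, np.bool and np.str were removed in NumPy 1.24, so they are replaced with the builtin types (or np.str_ where a NumPy string dtype is still wanted). Illustrative only:

# sketch only: deprecated NumPy scalar aliases and their replacements
import numpy as np

channels = int(np.sum([32, 64, 128]))          # was: np.int(np.sum(...))
mask = np.ones((4, 4), dtype=bool)             # was: dtype=np.bool
ids = np.array(['a01', 'a02'], dtype=np.str_)  # was: dtype=np.str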
@@ -81,7 +81,7 @@ class FaceLandmark:
        bbox[2] = center[0] + one_edge // 2
        bbox[3] = center[1] + one_edge // 2

        bbox = bbox.astype(np.int)
        bbox = bbox.astype(int)
        crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
        h, w, _ = crop_image.shape
        crop_image = cv2.resize(
@@ -356,7 +356,7 @@ class HighResolutionNet(nn.Module):
                                           num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)
        last_inp_channels = np.int(np.sum(pre_stage_channels)) + 256
        last_inp_channels = int(np.sum(pre_stage_channels)) + 256
        self.redc_layer = nn.Sequential(
            nn.Conv2d(
                in_channels=last_inp_channels,
@@ -1,25 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.base import BaseModel
from easycv.utils.ms_utils import EasyCVMeta

from modelscope.models.base import TorchModel


class EasyCVBaseModel(BaseModel, TorchModel):
    """Base model for EasyCV."""

    def __init__(self, model_dir=None, args=(), kwargs={}):
        kwargs.pop(EasyCVMeta.ARCH, None)  # pop useless keys
        BaseModel.__init__(self)
        TorchModel.__init__(self, model_dir=model_dir)

    def forward(self, img, mode='train', **kwargs):
        if self.training:
            losses = self.forward_train(img, **kwargs)
            loss, log_vars = self._parse_losses(losses)
            return dict(loss=loss, log_vars=log_vars)
        else:
            return self.forward_test(img, **kwargs)

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)
@@ -1,20 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .face_2d_keypoints_align import Face2DKeypoints

else:
    _import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.face.face_keypoint import FaceKeypoint

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
    group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):

    def __init__(self, model_dir=None, *args, **kwargs):
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        FaceKeypoint.__init__(self, *args, **kwargs)
@@ -82,7 +82,7 @@ class FaceLandmark:
        bbox[2] = center[0] + one_edge // 2
        bbox[3] = center[1] + one_edge // 2

        bbox = bbox.astype(np.int)
        bbox = bbox.astype(int)
        crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
        h, w, _ = crop_image.shape
        crop_image = cv2.resize(crop_image,
@@ -1,20 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .hand_2d_keypoints import Hand2dKeyPoints

else:
    _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose import TopDown

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
    group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):

    def __init__(self, model_dir=None, *args, **kwargs):
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        TopDown.__init__(self, *args, **kwargs)
@@ -31,7 +31,7 @@ class human_segmenter(object):
             img = np.dstack((img, img, img))
         elif img.shape[2] == 4:
             img = img[:, :, :3]
-        img = img.astype(np.float)
+        img = img.astype(float)
         return img
 
     def run(self, img):
@@ -69,8 +69,8 @@ def eval_grid(coords,
               num_samples=512 * 512 * 512):
    resolution = coords.shape[1:4]
    sdf = np.zeros(resolution)
-   dirty = np.ones(resolution, dtype=np.bool)
-   grid_mask = np.zeros(resolution, dtype=np.bool)
+   dirty = np.ones(resolution, dtype=bool)
+   grid_mask = np.zeros(resolution, dtype=bool)
    reso = resolution[0] // init_resolution
 
    while reso > 0:
@@ -1,17 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.pose.top_down import TopDown
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.human_wholebody_keypoint,
-    module_name=Models.human_wholebody_keypoint)
-class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        TopDown.__init__(self, *args, **kwargs)
@@ -163,7 +163,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
                     os.path.join(split_dir,
                                  'box_{}shot_{}_train.txt'.format(shot,
                                                                   cls))) as f:
-                fileids_ = np.loadtxt(f, dtype=np.str).tolist()
+                fileids_ = np.loadtxt(f, dtype=np.str_).tolist()
                 if isinstance(fileids_, str):
                     fileids_ = [fileids_]
                 fileids_ = [
@@ -219,7 +219,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
         with PathManager.open(
                 os.path.join(root, dirname, 'ImageSets', 'Main',
                              split + '.txt')) as f:
-            fileids = np.loadtxt(f, dtype=np.str)
+            fileids = np.loadtxt(f, dtype=np.str_)
 
         for fileid in fileids:
             anno_file = os.path.join(root, dirname, 'Annotations',
@@ -8,10 +8,12 @@ if TYPE_CHECKING:
     from .maskdino_swin import MaskDINOSwin
     from .model import CascadeMaskRCNNSwinModel
     from .maskdino_model import MaskDINOSwinModel
+    from .fastinst_model import FastInst
     from .postprocess_utils import get_img_ins_seg_result, get_maskdino_ins_seg_result
 else:
     _import_structure = {
         'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
+        'fastinst_model': ['FastInst'],
         'maskdino_swin': ['MaskDINOSwin'],
         'model': ['CascadeMaskRCNNSwinModel'],
         'maskdino_model': ['MaskDINOSwinModel'],
@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .swin_transformer import SwinTransformer
     from .swin_transformer import D2SwinTransformer
+    from .resnet import build_resnet_backbone
 
 else:
     _import_structure = {
         'swin_transformer': ['SwinTransformer', 'D2SwinTransformer'],
+        'resnet': ['build_resnet_backbone']
     }
 
     import sys
@@ -0,0 +1,114 @@
+# Part of the implementation is borrowed and modified from Detectron2, publicly available at
+# https://github.com/facebookresearch/detectron2/blob/main/projects/DeepLab/deeplab/resnet.py
+
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.models.cv.image_human_parsing.backbone.deeplab_resnet import (
+    BottleneckBlock, DeeplabResNet, get_norm)
+from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
+    Conv2d
+
+
+class BasicStem(nn.Module):
+    """
+    The standard ResNet stem (layers before the first residual block),
+    with a conv, relu and max_pool.
+    """
+
+    def __init__(self, in_channels=3, out_channels=64, norm='BN'):
+        """
+        Args:
+            norm (str or callable): norm after the first conv layer.
+                See :func:`layers.get_norm` for supported format.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = 4
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu_(x)
+        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+        return x
+
+
+def build_resnet_backbone(out_features, depth, num_groups, width_per_group,
+                          norm, stem_out_channels, res2_out_channels,
+                          stride_in_1x1, res4_dilation, res5_dilation,
+                          res5_multi_grid, input_shape):
+    stem = BasicStem(
+        in_channels=input_shape['channels'],
+        out_channels=stem_out_channels,
+        norm=norm)
+    bottleneck_channels = num_groups * width_per_group
+    in_channels = stem_out_channels
+    out_channels = res2_out_channels
+
+    assert res4_dilation in {
+        1, 2
+    }, 'res4_dilation cannot be {}.'.format(res4_dilation)
+    assert res5_dilation in {
+        1, 2, 4
+    }, 'res5_dilation cannot be {}.'.format(res5_dilation)
+    if res4_dilation == 2:
+        # Always dilate res5 if res4 is dilated.
+        assert res5_dilation == 4
+
+    num_blocks_per_stage = {
+        50: [3, 4, 6, 3],
+        101: [3, 4, 23, 3],
+        152: [3, 8, 36, 3]
+    }[depth]
+
+    stages = []
+    out_stage_idx = [{
+        'res2': 2,
+        'res3': 3,
+        'res4': 4,
+        'res5': 5
+    }[f] for f in out_features]
+    max_stage_idx = max(out_stage_idx)
+    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
+        if stage_idx == 4:
+            dilation = res4_dilation
+        elif stage_idx == 5:
+            dilation = res5_dilation
+        else:
+            dilation = 1
+        first_stride = 1 if idx == 0 or dilation > 1 else 2
+        stride_per_block = [first_stride]
+        stride_per_block += [1] * (num_blocks_per_stage[idx] - 1)
+        stage_kargs = {
+            'num_blocks': num_blocks_per_stage[idx],
+            'stride_per_block': stride_per_block,
+            'in_channels': in_channels,
+            'out_channels': out_channels,
+            'norm': norm,
+            'bottleneck_channels': bottleneck_channels,
+            'stride_in_1x1': stride_in_1x1,
+            'dilation': dilation,
+            'num_groups': num_groups,
+            'block_class': BottleneckBlock
+        }
+        if stage_idx == 5:
+            stage_kargs.pop('dilation')
+            stage_kargs['dilation_per_block'] = [
+                dilation * mg for mg in res5_multi_grid
+            ]
+        blocks = DeeplabResNet.make_stage(**stage_kargs)
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+        stages.append(blocks)
+    return DeeplabResNet(stem, stages, out_features=out_features)
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
@@ -0,0 +1,351 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from modelscope.models.cv.image_colorization.ddcolor.utils.transformer_utils import (
+    MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer)
+
+
+class QueryProposal(nn.Module):
+
+    def __init__(self, num_features, num_queries, num_classes):
+        super().__init__()
+        self.topk = num_queries
+        self.num_classes = num_classes
+
+        self.conv_proposal_cls_logits = nn.Sequential(
+            nn.Conv2d(
+                num_features, num_features, kernel_size=3, stride=1,
+                padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                num_features,
+                num_classes + 1,
+                kernel_size=1,
+                stride=1,
+                padding=0),
+        )
+
+    @torch.no_grad()
+    def compute_coordinates(self, x):
+        h, w = x.size(2), x.size(3)
+        y_loc = torch.linspace(0, 1, h, device=x.device)
+        x_loc = torch.linspace(0, 1, w, device=x.device)
+        y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
+        locations = torch.stack([x_loc, y_loc], 0).unsqueeze(0)
+        return locations
+
+    def seek_local_maximum(self, x, epsilon=1e-6):
+        """
+        inputs:
+            x: torch.tensor, shape [b, c, h, w]
+        return:
+            torch.tensor, shape [b, c, h, w]
+        """
+        x_pad = F.pad(x, (1, 1, 1, 1), 'constant', 0)
+        # top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
+        maximum = (x >= x_pad[:, :, :-2, 1:-1]) & \
+                  (x >= x_pad[:, :, 2:, 1:-1]) & \
+                  (x >= x_pad[:, :, 1:-1, :-2]) & \
+                  (x >= x_pad[:, :, 1:-1, 2:]) & \
+                  (x >= x_pad[:, :, :-2, :-2]) & \
+                  (x >= x_pad[:, :, :-2, 2:]) & \
+                  (x >= x_pad[:, :, 2:, :-2]) & \
+                  (x >= x_pad[:, :, 2:, 2:]) & \
+                  (x >= epsilon)
+        return maximum.to(x)
+
+    def forward(self, x, pos_embeddings):
+
+        proposal_cls_logits = self.conv_proposal_cls_logits(x)  # b, c, h, w
+        proposal_cls_probs = proposal_cls_logits.softmax(dim=1)  # b, c, h, w
+        proposal_cls_one_hot = F.one_hot(
+            proposal_cls_probs[:, :-1, :, :].max(1)[1],
+            num_classes=self.num_classes + 1).permute(0, 3, 1, 2)  # b, c, h, w
+        proposal_cls_probs = proposal_cls_probs.mul(proposal_cls_one_hot)
+        proposal_local_maximum_map = self.seek_local_maximum(
+            proposal_cls_probs)  # b, c, h, w
+        proposal_cls_probs = proposal_cls_probs + proposal_local_maximum_map  # b, c, h, w
+
+        # top-k indices
+        topk_indices = torch.topk(
+            proposal_cls_probs[:, :-1, :, :].flatten(2).max(1)[0],
+            self.topk,
+            dim=1)[1]  # b, q
+        topk_indices = topk_indices.unsqueeze(1)  # b, 1, q
+
+        # topk queries
+        topk_proposals = torch.gather(
+            x.flatten(2), dim=2, index=topk_indices.repeat(1, x.shape[1],
+                                                           1))  # b, c, q
+        pos_embeddings = pos_embeddings.repeat(x.shape[0], 1, 1, 1).flatten(2)
+        topk_pos_embeddings = torch.gather(
+            pos_embeddings,
+            dim=2,
+            index=topk_indices.repeat(1, pos_embeddings.shape[1],
+                                      1))  # b, c, q
+        if self.training:
+            locations = self.compute_coordinates(x).repeat(x.shape[0], 1, 1, 1)
+            topk_locations = torch.gather(
+                locations.flatten(2),
+                dim=2,
+                index=topk_indices.repeat(1, locations.shape[1], 1))
+            topk_locations = topk_locations.transpose(-1, -2)  # b, q, 2
+        else:
+            topk_locations = None
+        return topk_proposals, topk_pos_embeddings, topk_locations, proposal_cls_logits
+
+
+class FastInstDecoder(nn.Module):
+
+    def __init__(self, in_channels, *, num_classes: int, hidden_dim: int,
+                 num_queries: int, num_aux_queries: int, nheads: int,
+                 dim_feedforward: int, dec_layers: int, pre_norm: bool,
+                 mask_dim: int):
+        """
+        Args:
+            in_channels: channels of the input features
+            num_classes: number of classes
+            hidden_dim: Transformer feature dimension
+            num_queries: number of queries
+            num_aux_queries: number of auxiliary queries
+            nheads: number of heads
+            dim_feedforward: feature dimension in feedforward network
+            dec_layers: number of Transformer decoder layers
+            pre_norm: whether to use pre-LayerNorm or not
+            mask_dim: mask feature dimension
+        """
+        super().__init__()
+        self.num_heads = nheads
+        self.num_layers = dec_layers
+        self.num_queries = num_queries
+        self.num_aux_queries = num_aux_queries
+        self.num_classes = num_classes
+
+        meta_pos_size = int(round(math.sqrt(self.num_queries)))
+        self.meta_pos_embed = nn.Parameter(
+            torch.empty(1, hidden_dim, meta_pos_size, meta_pos_size))
+        if num_aux_queries > 0:
+            self.empty_query_features = nn.Embedding(num_aux_queries,
+                                                     hidden_dim)
+            self.empty_query_pos_embed = nn.Embedding(num_aux_queries,
+                                                      hidden_dim)
+
+        self.query_proposal = QueryProposal(hidden_dim, num_queries,
+                                            num_classes)
+
+        self.transformer_query_cross_attention_layers = nn.ModuleList()
+        self.transformer_query_self_attention_layers = nn.ModuleList()
+        self.transformer_query_ffn_layers = nn.ModuleList()
+        self.transformer_mask_cross_attention_layers = nn.ModuleList()
+        self.transformer_mask_ffn_layers = nn.ModuleList()
+        for idx in range(self.num_layers):
+            self.transformer_query_cross_attention_layers.append(
+                CrossAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_query_self_attention_layers.append(
+                SelfAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_query_ffn_layers.append(
+                FFNLayer(
+                    d_model=hidden_dim,
+                    dim_feedforward=dim_feedforward,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_mask_cross_attention_layers.append(
+                CrossAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_mask_ffn_layers.append(
+                FFNLayer(
+                    d_model=hidden_dim,
+                    dim_feedforward=dim_feedforward,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+
+        self.decoder_query_norm_layers = nn.ModuleList()
+        self.class_embed_layers = nn.ModuleList()
+        self.mask_embed_layers = nn.ModuleList()
+        self.mask_features_layers = nn.ModuleList()
+        for idx in range(self.num_layers + 1):
+            self.decoder_query_norm_layers.append(nn.LayerNorm(hidden_dim))
+            self.class_embed_layers.append(
+                MLP(hidden_dim, hidden_dim, num_classes + 1, 3))
+            self.mask_embed_layers.append(
+                MLP(hidden_dim, hidden_dim, mask_dim, 3))
+            self.mask_features_layers.append(nn.Linear(hidden_dim, mask_dim))
+
+    def forward(self, x, mask_features, targets=None):
+        bs = x[0].shape[0]
+        proposal_size = x[1].shape[-2:]
+        pixel_feature_size = x[2].shape[-2:]
+
+        pixel_pos_embeds = F.interpolate(
+            self.meta_pos_embed,
+            size=pixel_feature_size,
+            mode='bilinear',
+            align_corners=False)
+        proposal_pos_embeds = F.interpolate(
+            self.meta_pos_embed,
+            size=proposal_size,
+            mode='bilinear',
+            align_corners=False)
+
+        pixel_features = x[2].flatten(2).permute(2, 0, 1)
+        pixel_pos_embeds = pixel_pos_embeds.flatten(2).permute(2, 0, 1)
+
+        query_features, query_pos_embeds, query_locations, proposal_cls_logits = self.query_proposal(
+            x[1], proposal_pos_embeds)
+        query_features = query_features.permute(2, 0, 1)
+        query_pos_embeds = query_pos_embeds.permute(2, 0, 1)
+        if self.num_aux_queries > 0:
+            aux_query_features = self.empty_query_features.weight.unsqueeze(
+                1).repeat(1, bs, 1)
+            aux_query_pos_embed = self.empty_query_pos_embed.weight.unsqueeze(
+                1).repeat(1, bs, 1)
+            query_features = torch.cat([query_features, aux_query_features],
+                                       dim=0)
+            query_pos_embeds = torch.cat(
+                [query_pos_embeds, aux_query_pos_embed], dim=0)
+
+        outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
+            query_features,
+            pixel_features,
+            pixel_feature_size,
+            -1,
+            return_attn_mask=True)
+        predictions_class = [outputs_class]
+        predictions_mask = [outputs_mask]
+        predictions_matching_index = [None]
+        query_feature_memory = [query_features]
+        pixel_feature_memory = [pixel_features]
+
+        for i in range(self.num_layers):
+            query_features, pixel_features = self.forward_one_layer(
+                query_features, pixel_features, query_pos_embeds,
+                pixel_pos_embeds, attn_mask, i)
+            if i < self.num_layers - 1:
+                outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
+                    query_features,
+                    pixel_features,
+                    pixel_feature_size,
+                    i,
+                    return_attn_mask=True,
+                )
+            else:
+                outputs_class, outputs_mask, _, matching_indices, gt_attn_mask = self.forward_prediction_heads(
+                    query_features,
+                    pixel_features,
+                    pixel_feature_size,
+                    i,
+                )
+            predictions_class.append(outputs_class)
+            predictions_mask.append(outputs_mask)
+            predictions_matching_index.append(None)
+            query_feature_memory.append(query_features)
+            pixel_feature_memory.append(pixel_features)
+
+        out = {
+            'proposal_cls_logits':
+            proposal_cls_logits,
+            'query_locations':
+            query_locations,
+            'pred_logits':
+            predictions_class[-1],
+            'pred_masks':
+            predictions_mask[-1],
+            'pred_indices':
+            predictions_matching_index[-1],
+            'aux_outputs':
+            self._set_aux_loss(predictions_class, predictions_mask,
+                               predictions_matching_index, query_locations)
+        }
+        return out
+
+    def forward_one_layer(self, query_features, pixel_features,
+                          query_pos_embeds, pixel_pos_embeds, attn_mask, i):
+        pixel_features = self.transformer_mask_cross_attention_layers[i](
+            pixel_features,
+            query_features,
+            query_pos=pixel_pos_embeds,
+            pos=query_pos_embeds)
+        pixel_features = self.transformer_mask_ffn_layers[i](pixel_features)
+
+        query_features = self.transformer_query_cross_attention_layers[i](
+            query_features,
+            pixel_features,
+            memory_mask=attn_mask,
+            query_pos=query_pos_embeds,
+            pos=pixel_pos_embeds)
+        query_features = self.transformer_query_self_attention_layers[i](
+            query_features, query_pos=query_pos_embeds)
+        query_features = self.transformer_query_ffn_layers[i](query_features)
+        return query_features, pixel_features
+
+    def forward_prediction_heads(self,
+                                 query_features,
+                                 pixel_features,
+                                 pixel_feature_size,
+                                 idx_layer,
+                                 return_attn_mask=False,
+                                 return_gt_attn_mask=False,
+                                 targets=None,
+                                 query_locations=None):
+        decoder_query_features = self.decoder_query_norm_layers[idx_layer + 1](
+            query_features[:self.num_queries])
+        decoder_query_features = decoder_query_features.transpose(0, 1)
+        if idx_layer + 1 == self.num_layers:
+            outputs_class = self.class_embed_layers[idx_layer + 1](
+                decoder_query_features)
+        else:
+            outputs_class = None
+        outputs_mask_embed = self.mask_embed_layers[idx_layer + 1](
+            decoder_query_features)
+        outputs_mask_features = self.mask_features_layers[idx_layer + 1](
+            pixel_features.transpose(0, 1))
+
+        outputs_mask = torch.einsum('bqc,blc->bql', outputs_mask_embed,
+                                    outputs_mask_features)
+        outputs_mask = outputs_mask.reshape(-1, self.num_queries,
+                                            *pixel_feature_size)
+
+        if return_attn_mask:
+            # outputs_mask.shape: b, q, h, w
+            attn_mask = F.pad(outputs_mask,
+                              (0, 0, 0, 0, 0, self.num_aux_queries),
+                              'constant', 1)
+            attn_mask = (attn_mask < 0.).flatten(2)  # b, q, hw
+            invalid_query = attn_mask.all(-1, keepdim=True)  # b, q, 1
+            attn_mask = (~invalid_query) & attn_mask  # b, q, hw
+            attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1,
+                                                      1).flatten(0, 1)
+            attn_mask = attn_mask.detach()
+        else:
+            attn_mask = None
+
+        matching_indices = None
+        gt_attn_mask = None
+
+        return outputs_class, outputs_mask, attn_mask, matching_indices, gt_attn_mask
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_seg_masks, output_indices,
+                      output_query_locations):
+        return [{
+            'query_locations': output_query_locations,
+            'pred_logits': a,
+            'pred_masks': b,
+            'pred_matching_indices': c
+        } for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1],
+                             output_indices[:-1])]
@@ -0,0 +1,180 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
+    Conv2d
+
+
+# This is a modified FPN decoder.
+class BaseFPN(nn.Module):
+
+    def __init__(
+        self,
+        input_shape,
+        *,
+        convs_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            convs_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__()
+
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride'])
+        self.in_features = [k for k, v in input_shape
+                            ]  # starting from "res3" to "res5"
+        feature_channels = [v['channels'] for k, v in input_shape]
+
+        lateral_convs = []
+        output_convs = []
+
+        use_bias = norm == ''
+        for idx, in_channels in enumerate(feature_channels):
+            lateral_norm = nn.GroupNorm(32, convs_dim)
+            output_norm = nn.GroupNorm(32, convs_dim)
+
+            lateral_conv = Conv2d(
+                in_channels,
+                convs_dim,
+                kernel_size=1,
+                bias=use_bias,
+                norm=lateral_norm)
+            output_conv = Conv2d(
+                convs_dim,
+                convs_dim,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=use_bias,
+                norm=output_norm,
+                activation=F.relu,
+            )
+            self.add_module('adapter_{}'.format(idx + 1), lateral_conv)
+            self.add_module('layer_{}'.format(idx + 1), output_conv)
+
+            lateral_convs.append(lateral_conv)
+            output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+
+        self.convs_dim = convs_dim
+        self.num_feature_levels = 3  # always use 3 scales
+
+    def forward_features(self, features):
+        multi_scale_features = []
+        num_cur_levels = 0
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if idx == 0:
+                y = lateral_conv(x)
+            else:
+                cur_fpn = lateral_conv(x)
+                y = cur_fpn + F.interpolate(
+                    y,
+                    size=cur_fpn.shape[-2:],
+                    mode='bilinear',
+                    align_corners=False)
+                y = output_conv(y)
+
+            if num_cur_levels < self.num_feature_levels:
+                multi_scale_features.append(y)
+                num_cur_levels += 1
+        return None, multi_scale_features
+
+    def forward(self, features, targets=None):
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            'Calling forward() may cause unpredicted behavior of PixelDecoder module.'
+        )
+        return self.forward_features(features)
+
+
+class PyramidPoolingModule(nn.Module):
+
+    def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)):
+        super().__init__()
+        self.stages = []
+        self.stages = nn.ModuleList(
+            [self._make_stage(in_channels, channels, size) for size in sizes])
+        self.bottleneck = Conv2d(in_channels + len(sizes) * channels,
+                                 in_channels, 1)
+
+    def _make_stage(self, features, out_features, size):
+        prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
+        conv = Conv2d(features, out_features, 1)
+        return nn.Sequential(prior, conv)
+
+    def forward(self, feats):
+        h, w = feats.size(2), feats.size(3)
+        priors = [
+            F.interpolate(
+                input=F.relu_(stage(feats)),
+                size=(h, w),
+                mode='bilinear',
+                align_corners=False) for stage in self.stages
+        ] + [feats]
+        out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
+        return out
+
+
+class PyramidPoolingModuleFPN(BaseFPN):
+
+    def __init__(
+        self,
+        input_shape,
+        *,
+        convs_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            convs_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__(
+            input_shape, convs_dim=convs_dim, mask_dim=mask_dim, norm=norm)
+        self.ppm = PyramidPoolingModule(convs_dim, convs_dim // 4)
+
+    def forward_features(self, features):
+        multi_scale_features = []
+        num_cur_levels = 0
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if idx == 0:
+                y = self.ppm(lateral_conv(x))
+            else:
+                cur_fpn = lateral_conv(x)
+                y = cur_fpn + F.interpolate(
+                    y,
+                    size=cur_fpn.shape[-2:],
+                    mode='bilinear',
+                    align_corners=False)
+                y = output_conv(y)
+
+            if num_cur_levels < self.num_feature_levels:
+                multi_scale_features.append(y)
+                num_cur_levels += 1
+
+        return None, multi_scale_features
@@ -0,0 +1,221 @@
+# Part of implementation is borrowed and modified from Mask2Former, publicly available at
+# https://github.com/facebookresearch/Mask2Former.
+import os
+from typing import Any, Dict, List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_instance_segmentation.maskdino_swin import \
+    ImageList
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .backbones import build_resnet_backbone
+from .fastinst.fastinst_decoder import FastInstDecoder
+from .fastinst.fastinst_encoder import PyramidPoolingModuleFPN
+
+logger = get_logger()
+
+
+@MODELS.register_module(Tasks.image_segmentation, module_name=Models.fastinst)
+class FastInst(TorchModel):
+
+    def __init__(self,
+                 model_dir,
+                 backbone=None,
+                 encoder=None,
+                 decoder=None,
+                 pretrained=None,
+                 classes=None,
+                 **kwargs):
+        """
+        Deep Learning Technique for Human Parsing: A Survey and Outlook. See https://arxiv.org/abs/2301.00394
+        Args:
+            backbone (dict): backbone config.
+            encoder (dict): encoder config.
+            decoder (dict): decoder config.
+            pretrained (bool): whether to use pretrained model
+            classes (list): class names
+        """
+        super(FastInst, self).__init__(model_dir, **kwargs)
+
+        self.backbone = build_resnet_backbone(
+            **backbone, input_shape={'channels': 3})
+        in_features = encoder.pop('in_features')
+        input_shape = {
+            k: v
+            for k, v in self.backbone.output_shape().items()
+            if k in in_features
+        }
+        encoder = PyramidPoolingModuleFPN(input_shape=input_shape, **encoder)
+        decoder = FastInstDecoder(in_channels=encoder.convs_dim, **decoder)
+        self.sem_seg_head = FastInstHead(
+            pixel_decoder=encoder, transformer_predictor=decoder)
+
+        self.num_classes = decoder.num_classes
+        self.num_queries = decoder.num_queries
+        self.size_divisibility = 32
+        self.register_buffer(
+            'pixel_mean',
+            torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False)
+        self.register_buffer(
+            'pixel_std',
+            torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False)
+        self.classes = classes
+        self.test_topk_per_image = 100
+
+        if pretrained:
+            model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+            logger.info(f'loading model from {model_path}')
+            weight = torch.load(model_path, map_location='cpu')['model']
+            tgt_weight = self.state_dict()
+            for name in list(weight.keys()):
+                if name in tgt_weight:
+                    load_size = weight[name].size()
+                    tgt_size = tgt_weight[name].size()
+                    mis_match = False
+                    if len(load_size) != len(tgt_size):
+                        mis_match = True
+                    else:
+                        for n1, n2 in zip(load_size, tgt_size):
+                            if n1 != n2:
+                                mis_match = True
+                                break
+                    if mis_match:
+                        logger.info(
+                            f'size mismatch for {name} '
+                            f'({load_size} -> {tgt_size}), skip loading.')
+                        del weight[name]
+                else:
+                    logger.info(
+                        f'{name} doesn\'t exist in current model, skip loading.'
+                    )
+
+            self.load_state_dict(weight, strict=False)
+            logger.info('load model done')
+
+    def forward(self, batched_inputs: List[dict]) -> Dict[str, Any]:
+        images = [x['image'].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.size_divisibility)
+
+        features = self.backbone(images.tensor)
+        outputs = self.sem_seg_head(features)
+
+        return dict(
+            outputs=outputs, batched_inputs=batched_inputs, images=images)
+
+    def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        outputs = input['outputs']
+        batched_inputs = input['batched_inputs']
+        images = input['images']
+        if self.training:
+            raise NotImplementedError
+        else:
+            mask_cls_results = outputs['pred_logits']  # (B, Q, C+1)
+            mask_pred_results = outputs['pred_masks']  # (B, Q, H, W)
+            # upsample masks
+            mask_pred_results = F.interpolate(
+                mask_pred_results,
+                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
+                mode='bilinear',
+                align_corners=False,
+            )
+
+            del outputs
+
+            processed_results = []
+            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
+                    mask_cls_results, mask_pred_results, batched_inputs,
+                    images.image_sizes):
+                height = input_per_image.get('height', image_size[0])
+                width = input_per_image.get('width', image_size[1])
+                processed_results.append({})  # for each image
+
+                mask_pred_result = self.sem_seg_postprocess(
+                    mask_pred_result, image_size, height, width)
+                mask_cls_result = mask_cls_result.to(mask_pred_result)
+
+                instance_r = self.instance_inference(mask_cls_result,
+                                                     mask_pred_result)
+                processed_results[-1]['instances'] = instance_r
+
+            return dict(eval_result=processed_results)
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def sem_seg_postprocess(self, result, img_size, output_height,
+                            output_width):
+        result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
+        result = F.interpolate(
+            result,
+            size=(output_height, output_width),
+            mode='bilinear',
+            align_corners=False)[0]
+        return result
+
+    def instance_inference(self, mask_cls, mask_pred):
+        # mask_pred is already processed to have the same shape as original input
+        image_size = mask_pred.shape[-2:]
+
+        # [Q, K]
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        labels = torch.arange(
+            self.num_classes,
+            device=self.device).unsqueeze(0).repeat(self.num_queries,
+                                                    1).flatten(0, 1)
+        scores_per_image, topk_indices = scores.flatten(0, 1).topk(
+            self.test_topk_per_image, sorted=False)
+        labels_per_image = labels[topk_indices]
+
+        topk_indices = topk_indices // self.num_classes
+        mask_pred = mask_pred[topk_indices]
+
+        result = {'image_size': image_size}
+        # mask (before sigmoid)
+        mask_pred_sigmoid = mask_pred.sigmoid()
+        result['pred_masks'] = (mask_pred_sigmoid > 0.5).float()
+
+        # calculate average mask prob
+        mask_scores_per_image = (mask_pred_sigmoid.flatten(1)
+                                 * result['pred_masks'].flatten(1)).sum(1) / (
+                                     result['pred_masks'].flatten(1).sum(1)
+                                     + 1e-6)
+        result['scores'] = scores_per_image * mask_scores_per_image
+        result['pred_classes'] = labels_per_image
+        return result
+
+
+class FastInstHead(nn.Module):
+
+    def __init__(
+            self,
+            *,
+            pixel_decoder: nn.Module,
+            # extra parameters
+            transformer_predictor: nn.Module):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            pixel_decoder: the pixel decoder module
+            transformer_predictor: the transformer decoder that makes prediction
+        """
+        super().__init__()
+        self.pixel_decoder = pixel_decoder
+        self.predictor = transformer_predictor
+
+    def forward(self, features, targets=None):
+        return self.layers(features, targets)
+
+    def layers(self, features, targets=None):
+        mask_features, multi_scale_features = self.pixel_decoder.forward_features(
+            features)
+        predictions = self.predictor(multi_scale_features, mask_features,
+                                     targets)
+        return predictions
@@ -108,16 +108,16 @@ def get_img_ins_seg_result(img_seg_result=None,
     for seg_result in img_seg_result:
 
         box = [
-            np.int(seg_result[0]),
-            np.int(seg_result[1]),
-            np.int(seg_result[2]),
-            np.int(seg_result[3])
+            int(seg_result[0]),
+            int(seg_result[1]),
+            int(seg_result[2]),
+            int(seg_result[3])
         ]
-        score = np.float(seg_result[4])
+        score = float(seg_result[4])
         category = seg_result[5]
 
         mask = np.array(seg_result[6], order='F', dtype='uint8')
-        mask = mask.astype(np.float)
+        mask = mask.astype(float)
 
         results_dict[OutputKeys.BOXES].append(box)
         results_dict[OutputKeys.MASKS].append(mask)
@@ -382,7 +382,7 @@ def processing_single_scene(args):
                     points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1],
                     points3d[p3d_id].xyz[2], 1
                 ])
-                zs.append(np.asscalar(transformed[2]))
+                zs.append(transformed[2].item())
             zs_sorted = sorted(zs)
             # relaxed depth range
             max_ratio = 0.1
@@ -40,7 +40,7 @@ def read_mask(filename):
 
 # save a binary mask
 def save_mask(filename, mask):
-    assert mask.dtype == np.bool
+    assert mask.dtype == bool
     mask = mask.astype(np.uint8) * 255
     Image.fromarray(mask).save(filename)
 
@@ -5,7 +5,6 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .panseg_model import SwinLPanopticSegmentation
-    from .r50_panseg_model import R50PanopticSegmentation
 
 else:
     _import_structure = {
@@ -1,18 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from easycv.models.segmentation import Mask2Former
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_segmentation,
-    module_name=Models.r50_panoptic_segmentation)
-class R50PanopticSegmentation(EasyCVBaseModel, Mask2Former):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        Mask2Former.__init__(self, *args, **kwargs)
@@ -1,16 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.segmentation import EncoderDecoder
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_segmentation, module_name=Models.segformer)
-class Segformer(EasyCVBaseModel, EncoderDecoder):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        EncoderDecoder.__init__(self, *args, **kwargs)
@@ -60,7 +60,7 @@ class SemanticSegmentation(TorchModel):
         ids = ids[legal_indices]
 
         segms = (semantic_result[None] == ids[:, None, None])
-        masks = [it.astype(np.int) for it in segms]
+        masks = [it.astype(int) for it in segms]
         labels_txt = np.array(self.CLASSES)[ids].tolist()
 
         results = {
@@ -458,7 +458,7 @@ class HrnetBackBone(nn.Module):
         self.stage4, pre_stage_channels = self._make_stage(
             self.stage4_cfg, num_channels, multi_scale_output=True)
 
-        self.backbone_last_inp_channels = np.int(np.sum(pre_stage_channels))
+        self.backbone_last_inp_channels = int(np.sum(pre_stage_channels))
 
     def _make_transition_layer(self, num_channels_pre_layer,
                                num_channels_cur_layer):
@@ -259,7 +259,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
         num_channels = [64, last_inp_channels]
         self.stage_super, super_stage_channels = self._make_stage(
             self.super_dict, num_channels)
-        last_inp_channels = np.int(np.sum(super_stage_channels))
+        last_inp_channels = int(np.sum(super_stage_channels))
 
         if self.is_contain_aspp:
             aspp_param = kwargs['aspp']
@@ -372,7 +372,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
         num_channels = [64, ocr_mid_channels]
         self.stage_super, super_stage_channels = self._make_stage(
             self.super_dict, num_channels)
-        last_inp_channels = np.int(np.sum(super_stage_channels))
+        last_inp_channels = int(np.sum(super_stage_channels))
 
         self.cls_head = nn.Sequential(
             nn.Conv2d(
@@ -13,7 +13,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torchvision.transforms as TF
 from PIL import Image
-from shotdetect_scenedetect_lgss import shot_detect
+from shotdetect_scenedetect_lgss import shot_detector
+from tqdm import tqdm
 
 from modelscope.metainfo import Models
 from modelscope.models.base.base_torch_model import TorchModel
@@ -60,6 +61,9 @@ class MovieSceneSegmentationModel(TorchModel):
         self.head_sbd = nn.Linear(hdim, 2)
         load_param_with_prefix('head_sbd', self.head_sbd, params)
 
+        self.shot_detector = shot_detector()
+        self.shot_detector.init(**self.cfg.preprocessor.shot_detect)
+
         self.test_transform = TF.Compose([
             TF.Resize(size=256, interpolation=Image.BICUBIC),
             TF.CenterCrop(224),
@@ -98,29 +102,45 @@ class MovieSceneSegmentationModel(TorchModel):
     def inference(self, batch):
         logger.info('Begin scene detect ......')
         bs = self.cfg.pipeline.batch_size_per_gpu
-        sids = batch['sid']
-        inputs = batch['shot_feat']
+        device = self.crn.attention_mask.device
 
-        shot_num = len(sids)
+        shot_timecode_lst = batch['shot_timecode_lst']
+        shot_idx_lst = batch['shot_idx_lst']
+
+        shot_num = len(shot_timecode_lst)
         cnt = math.ceil(shot_num / bs)
 
-        infer_sid, infer_pred = [], []
+        infer_pred = []
         infer_result = {}
-        for i in range(cnt):
+        self.shot_detector.start()
+
+        for i in tqdm(range(cnt)):
             start = i * bs
             end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
-            input_ = inputs[start:end]
-            sid_ = sids[start:end]
-            input_ = torch.stack(input_)
+            batch_shot_idx_lst = shot_idx_lst[start:end]
+            shot_start_idx = batch_shot_idx_lst[0][0]
+            shot_end_idx = batch_shot_idx_lst[-1][-1]
+            batch_timecode_lst = {
+                i: shot_timecode_lst[i]
+                for i in range(shot_start_idx, shot_end_idx + 1)
+            }
+            batch_shot_keyf_lst = self.shot_detector.get_frame_img(
+                batch_timecode_lst, shot_start_idx, shot_num)
+            inputs = self.get_batch_input(batch_shot_keyf_lst, shot_start_idx,
+                                          batch_shot_idx_lst)
+
+            input_ = torch.stack(inputs).to(device)
             outputs = self.shared_step(input_)  # shape [b,2]
             prob = F.softmax(outputs, dim=1)
-            infer_sid.extend(sid_.cpu().detach().numpy())
             infer_pred.extend(prob[:, 1].cpu().detach().numpy())
-        infer_result.update({'pred': np.stack(infer_pred)})
-        infer_result.update({'sid': infer_sid})
 
-        assert len(infer_result['sid']) == len(sids)
-        assert len(infer_result['pred']) == len(inputs)
+        infer_result.update({'pred': np.stack(infer_pred)})
+        infer_result.update({'sid': np.arange(shot_num)})
+
+        assert len(infer_result['pred']) == shot_num
+        self.shot_detector.release()
         return infer_result
 
     def shared_step(self, inputs):
@@ -162,38 +182,48 @@ class MovieSceneSegmentationModel(TorchModel):
         logger.info('Generate scene .......')
 
         pred_dict = inputs['feat']
+        shot2keyf = inputs['shot2keyf']
         thres = self.cfg.pipeline.save_threshold
 
         anno_dict = get_pred_boundary(pred_dict, thres)
         scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
-            self.shot2keyf, anno_dict)
+            shot2keyf, anno_dict)
         if self.cfg.pipeline.save_split_scene:
             re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
             print(f'Split scene video saved to {re_dir}')
         return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
 
-    def preprocess(self, inputs):
-        logger.info('Begin shot detect......')
-        shot_keyf_lst, anno, shot2keyf = shot_detect(
-            inputs, **self.cfg.preprocessor.shot_detect)
-        logger.info('Shot detect done!')
-
-        single_shot_feat, sid = [], []
+    def get_batch_input(self, shot_keyf_lst, shot_start_idx, shot_idx_lst):
+        single_shot_feat = []
         for idx, one_shot in enumerate(shot_keyf_lst):
             one_shot = [
                 self.test_transform(one_frame) for one_frame in one_shot
             ]
             one_shot = torch.stack(one_shot, dim=0)
             single_shot_feat.append(one_shot)
-            sid.append(idx)
         single_shot_feat = torch.stack(single_shot_feat, dim=0)
 
         shot_feat = []
+        for idx, shot_idx in enumerate(shot_idx_lst):
+            shot_idx_ = shot_idx - shot_start_idx
+            _one_shot = single_shot_feat[shot_idx_]
+            shot_feat.append(_one_shot)
+
+        return shot_feat
+
+    def preprocess(self, inputs):
+        logger.info('Begin shot detect......')
+        shot_timecode_lst, anno, shot2keyf = self.shot_detector.shot_detect(
+            inputs, **self.cfg.preprocessor.shot_detect)
+        logger.info('Shot detect done!')
+
+        shot_idx_lst = []
         for idx, one_shot in enumerate(anno):
             shot_idx = int(one_shot['shot_id']) + np.arange(
                 -self.neighbor_size, self.neighbor_size + 1)
-            shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])
-            _one_shot = single_shot_feat[shot_idx]
-            shot_feat.append(_one_shot)
-        self.shot2keyf = shot2keyf
-        self.anno = anno
-        return shot_feat, sid
+            shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'] - 1)
+            shot_idx_lst.append(shot_idx)
+
+        return shot2keyf, anno, shot_timecode_lst, shot_idx_lst
@@ -10,11 +10,12 @@ from tqdm import tqdm
 
 
 def get_pred_boundary(pred_dict, threshold=0.5):
-    pred = pred_dict['pred']
+    pred = pred_dict['pred'].cpu().numpy()
+    sid = pred_dict['sid'].cpu().numpy().astype(np.int32)
     tmp = (pred > threshold).astype(np.int32)
     anno_dict = {}
     for idx in range(len(tmp)):
-        anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])})
+        anno_dict.update({str(sid[idx]).zfill(4): int(tmp[idx])})
     return anno_dict
 
 
@@ -31,7 +31,7 @@ class ObjectSegmenter(object):
         elif img.shape[2] == 4:
             img = img[:, :, :3]
         img = img[:, :, ::-1]
-        img = img.astype(np.float)
+        img = img.astype(float)
         return img
 
     def run_mask(self, img):
@@ -1,16 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.detection.detectors import Detection as _Detection
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_object_detection, module_name=Models.dino)
-class DINO(EasyCVBaseModel, _Detection):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        _Detection.__init__(self, *args, **kwargs)
@@ -1,21 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.detection.detectors import YOLOX as _YOLOX
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_object_detection, module_name=Models.yolox)
-@MODELS.register_module(
-    group_key=Tasks.image_object_detection,
-    module_name=Models.image_object_detection_auto)
-@MODELS.register_module(
-    group_key=Tasks.domain_specific_object_detection, module_name=Models.yolox)
-class YOLOX(EasyCVBaseModel, _YOLOX):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        _YOLOX.__init__(self, *args, **kwargs)
@@ -30,7 +30,7 @@ def depth2color(depth):
     if gray == 1:
         return tuple(colors[-1].tolist())
     num_rank = len(colors) - 1
-    rank = np.floor(gray * num_rank).astype(np.int)
+    rank = np.floor(gray * num_rank).astype(int)
    diff = (gray - rank / num_rank) * num_rank
    tmp = colors[rank + 1] - colors[rank]
    return tuple((colors[rank] + tmp * diff).tolist())
@@ -136,7 +136,7 @@ def plot_result(res_path,
     l2g = get_lidar2global(infos)
     corners_lidar = corners_global @ np.linalg.inv(l2g).T
     corners_lidar = corners_lidar[:, :3]
-    pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=np.bool)
+    pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=bool)
     scores = [
         pred_res[rid]['detection_score'] for rid in range(len(pred_res))
     ]
@@ -151,7 +151,7 @@ def plot_result(res_path,
             origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
         corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
                                        axis=0)
-        gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=np.bool)
+        gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=bool)
         pred_flag = np.concatenate(
             [pred_flag, np.logical_not(gt_flag)], axis=0)
         scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
@@ -169,7 +169,7 @@ def plot_result(res_path,
             check_point_in_img(corners_img, img.shape[0], img.shape[1]))
         valid = valid.reshape(
             -1, 8)  # valid means: d>0 and visible in current view
-        corners_img = corners_img.reshape(-1, 8, 2).astype(np.int)
+        corners_img = corners_img.reshape(-1, 8, 2).astype(int)
         for aid in range(valid.shape[0]):
             if scores[aid] < vis_thred and pred_flag[aid]:
                 continue
@@ -90,8 +90,15 @@ class OCRRecognition(TorchModel):
                 f'recognizer should be either ConvNextViT, CRNN, but got {cfgs.model.recognizer}'
             )
         if model_path != '':
-            self.recognizer.load_state_dict(
-                torch.load(model_path, map_location='cpu'))
+            params_pretrained = torch.load(model_path, map_location='cpu')
+            model_dict = self.recognizer.state_dict()
+            # remove prefix for finetuned models
+            check_point = {
+                k.replace('recognizer.', ''): v
+                for k, v in params_pretrained.items()
+            }
+            model_dict.update(check_point)
+            self.recognizer.load_state_dict(model_dict)

         dict_path = os.path.join(model_dir, ModelFile.VOCAB_FILE)
         self.labelMapping = dict()
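The OCR hunk above now routes the checkpoint through the model's own state dict so that a leftover 'recognizer.' key prefix from fine-tuned checkpoints can be stripped before loading. A standalone sketch of the same idea (the function name and arguments here are illustrative, not part of the patch):

import torch


def load_stripped(module: torch.nn.Module, ckpt_path: str, prefix: str = 'recognizer.'):
    # Load the raw checkpoint on CPU, drop the prefix from every key, then
    # merge into the module's state dict so unmatched keys keep their values.
    params = torch.load(ckpt_path, map_location='cpu')
    state = module.state_dict()
    state.update({k.replace(prefix, ''): v for k, v in params.items()})
    module.load_state_dict(state)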
@@ -176,8 +176,7 @@ class OpenVocabularyDetectionViLD(Model):
         # Filter out invalid rois (nmsed rois)
         valid_indices = np.where(
             np.logical_and(
-                np.isin(
-                    np.arange(len(roi_scores), dtype=np.int), nmsed_indices),
+                np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
                 np.logical_and(
                     np.logical_not(np.all(roi_boxes == 0., axis=-1)),
                     np.logical_and(roi_scores >= min_rpn_score_thresh,
@@ -72,7 +72,7 @@ class Cube2Equirec(nn.Module):
             self.equ_h, 0), 3 * self.equ_w // 8, 1)

         # Prepare ceil mask
-        mask = np.zeros((self.equ_h, self.equ_w // 4), np.bool)
+        mask = np.zeros((self.equ_h, self.equ_w // 4), bool)
         idx = np.linspace(-np.pi, np.pi, self.equ_w // 4) / 4
         idx = self.equ_h // 2 - np.round(
             np.arctan(np.cos(idx)) * self.equ_h / np.pi).astype(int)
@@ -29,7 +29,7 @@ def load_depth(file):
     elif file.endswith('png'):
         depth_png = np.array(load_image(file), dtype=int)
         assert (np.max(depth_png) > 255), 'Wrong .png depth file'
-        return depth_png.astype(np.float) / 256.
+        return depth_png.astype(float) / 256.
     else:
         raise NotImplementedError('Depth extension not supported.')

@@ -85,7 +85,7 @@ def do_scene_detect(F01_tensor, F10_tensor, img0_tensor, img1_tensor):
     img_diff = ori_img.float() - ref_img.float()
     img_diff = torch.abs(img_diff)

-    kernel = np.ones([8, 8], np.float) / 64
+    kernel = np.ones([8, 8], float) / 64
     kernel = torch.FloatTensor(kernel).to(device).unsqueeze(0).unsqueeze(0)
     diff = F.conv2d(img_diff, kernel, padding=4)

@@ -27,7 +27,7 @@ def linear_assignment(cost_matrix, thresh):


 def ious(atlbrs, btlbrs):
-    ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float)
+    ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=float)
     if ious.size == 0:
         return ious

@@ -60,13 +60,13 @@ def embedding_distance(tracks, detections, metric='cosine'):
|
|||||||
cost_matrix: np.ndarray
|
cost_matrix: np.ndarray
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float)
|
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
|
||||||
if cost_matrix.size == 0:
|
if cost_matrix.size == 0:
|
||||||
return cost_matrix
|
return cost_matrix
|
||||||
det_features = np.asarray([track.curr_feat for track in detections],
|
det_features = np.asarray([track.curr_feat for track in detections],
|
||||||
dtype=np.float)
|
dtype=float)
|
||||||
track_features = np.asarray([track.smooth_feat for track in tracks],
|
track_features = np.asarray([track.smooth_feat for track in tracks],
|
||||||
dtype=np.float)
|
dtype=float)
|
||||||
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
|
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
|
||||||
return cost_matrix
|
return cost_matrix
|
||||||
|
|
||||||
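For reference, the embedding_distance change above only swaps the dtype; the cost matrix itself is still a cosine-distance table built with SciPy. A self-contained sketch with dummy feature vectors (shapes are illustrative):

import numpy as np
from scipy.spatial.distance import cdist

track_features = np.random.rand(5, 128)   # smoothed embeddings of 5 tracks
det_features = np.random.rand(3, 128)     # embeddings of 3 new detections

# Cosine distance lies in [0, 2]; clip tiny negatives from rounding error.
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, 'cosine'))
print(cost_matrix.shape)  # (5, 3)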
@@ -28,7 +28,7 @@ class STrack(BaseTrack):
     def __init__(self, tlwh, score, temp_feat, buffer_size=30):

         # wait activate
-        self._tlwh = np.asarray(tlwh, dtype=np.float)
+        self._tlwh = np.asarray(tlwh, dtype=float)
         self.kalman_filter = None
         self.mean, self.covariance = None, None
         self.is_activated = False
@@ -20,6 +20,8 @@ if TYPE_CHECKING:
     from .vldoc import VLDocForDocVLEmbedding
     from .video_synthesis import TextToVideoSynthesis
     from .efficient_diffusion_tuning import EfficientStableDiffusion
+    from .mplug_owl import MplugOwlForConditionalGeneration
+    from .clip_interrogator import CLIP_Interrogator

 else:
     _import_structure = {
@@ -37,7 +39,9 @@ else:
         ['MultiStageDiffusionForTextToImageSynthesis'],
         'vldoc': ['VLDocForDocVLEmbedding'],
         'video_synthesis': ['TextToVideoSynthesis'],
-        'efficient_diffusion_tuning': ['EfficientStableDiffusion']
+        'efficient_diffusion_tuning': ['EfficientStableDiffusion'],
+        'mplug_owl': ['MplugOwlForConditionalGeneration'],
+        'clip_interrogator': ['CLIP_Interrogator'],
     }

     import sys
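Both hunks above register the new mplug_owl and clip_interrogator modules in the TYPE_CHECKING branch and in _import_structure; an entry has to appear in both places for the lazy-import machinery to resolve it. A condensed sketch of the pattern, assuming the LazyImportModule wiring already used by the existing __init__ files:

from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .clip_interrogator import CLIP_Interrogator
    from .mplug_owl import MplugOwlForConditionalGeneration
else:
    _import_structure = {
        'clip_interrogator': ['CLIP_Interrogator'],
        'mplug_owl': ['MplugOwlForConditionalGeneration'],
    }

    import sys

    # Replace this package module with a lazy proxy that imports submodules
    # only when their attributes are first accessed.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )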
@@ -0,0 +1 @@
+from .model import CLIP_Interrogator
modelscope/models/multi_modal/clip_interrogator/model.py (new file, 599 lines)
@@ -0,0 +1,599 @@
|
|||||||
|
# This implementation is adopted from CLIP-Interrogator, made pubicly available under the MIT License at
|
||||||
|
# https://github.com/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator/clip_interrogator.py
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import open_clip
|
||||||
|
import requests
|
||||||
|
import torch
|
||||||
|
import torchvision.transforms as transforms
|
||||||
|
from PIL import Image
|
||||||
|
from safetensors.numpy import load_file, save_file
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import (AutoModelForCausalLM, AutoProcessor,
|
||||||
|
Blip2ForConditionalGeneration,
|
||||||
|
BlipForConditionalGeneration)
|
||||||
|
|
||||||
|
from modelscope.metainfo import Models
|
||||||
|
from modelscope.models.base import TorchModel
|
||||||
|
from modelscope.models.builder import MODELS
|
||||||
|
from modelscope.outputs import OutputKeys
|
||||||
|
from modelscope.preprocessors import LoadImage
|
||||||
|
from modelscope.utils.constant import ModelFile, Tasks
|
||||||
|
from modelscope.utils.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
__all__ = ['CLIP_Interrogator']
|
||||||
|
|
||||||
|
CAPTION_MODELS = {
|
||||||
|
'blip-base': 'blip-image-captioning-base',
|
||||||
|
'blip-large': 'blip-image-captioning-large',
|
||||||
|
'blip2-2.7b': 'blip2-opt-2.7b',
|
||||||
|
'blip2-flan-t5-xl': 'blip2-flan-t5-xl',
|
||||||
|
'git-large-coco': 'git-large-coco',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Config:
|
||||||
|
# models can optionally be passed in directly
|
||||||
|
caption_model = None
|
||||||
|
caption_processor = None
|
||||||
|
clip_model = None
|
||||||
|
clip_preprocess = None
|
||||||
|
|
||||||
|
# blip settings
|
||||||
|
caption_max_length: int = 32
|
||||||
|
caption_model_name: Optional[
|
||||||
|
str] = 'blip-large' # use a key from CAPTION_MODELS or None
|
||||||
|
caption_offload: bool = False
|
||||||
|
|
||||||
|
# clip settings
|
||||||
|
clip_model_name: str = 'ViT-L-14/openai'
|
||||||
|
clip_model_path: Optional[str] = None
|
||||||
|
clip_offload: bool = False
|
||||||
|
|
||||||
|
# interrogator settings
|
||||||
|
cache_path: str = 'cache' # path to store cached text embeddings
|
||||||
|
download_cache: bool = False # when true, cached embeds are downloaded from huggingface
|
||||||
|
chunk_size: int = 2048 # batch size for CLIP, use smaller for lower VRAM
|
||||||
|
data_path: str = os.path.join(os.path.dirname(__file__), 'data')
|
||||||
|
device: str = ('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
|
flavor_intermediate_count: int = 2048
|
||||||
|
quiet: bool = False # when quiet progress bars are not shown
|
||||||
|
|
||||||
|
def apply_low_vram_defaults(self):
|
||||||
|
self.caption_model_name = 'blip-base'
|
||||||
|
self.caption_offload = True
|
||||||
|
self.clip_offload = True
|
||||||
|
self.chunk_size = 1024
|
||||||
|
self.flavor_intermediate_count = 1024
|
||||||
|
|
||||||
|
|
||||||
|
# CLIP-Interrogator utilize CLIP and BLIP to generate rich caption for images.
|
||||||
|
# CLIP is a zero-shot image classifier which can be used to generate image and text embeddings.
|
||||||
|
# BLIP is a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks.
|
||||||
|
# BLIP effectively utilizes the noisy web data by bootstrapping the captions, where
|
||||||
|
# a captioner generates synthetic captions and a filter removes the noisy ones.
|
||||||
|
# Please infer to the paper CLIP: Learning Transferable Visual Models From Natural Language Supervision
|
||||||
|
# https://arxiv.org/abs/2103.00020
|
||||||
|
# BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
|
||||||
|
# https://arxiv.org/abs/2201.12086
|
||||||
|
|
||||||
|
|
||||||
|
class Interrogator():
|
||||||
|
|
||||||
|
def __init__(self, config: Config):
|
||||||
|
self.config = config
|
||||||
|
self.device = config.device
|
||||||
|
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||||
|
self.caption_offloaded = True
|
||||||
|
self.clip_offloaded = True
|
||||||
|
self.load_caption_model()
|
||||||
|
self.load_clip_model()
|
||||||
|
|
||||||
|
def load_caption_model(self):
|
||||||
|
if self.config.caption_model is None and self.config.caption_model_name:
|
||||||
|
if not self.config.quiet:
|
||||||
|
print(
|
||||||
|
f'Loading caption model {self.config.caption_model_name}...'
|
||||||
|
)
|
||||||
|
|
||||||
|
model_path = CAPTION_MODELS[self.config.caption_model_name]
|
||||||
|
if self.config.caption_model_name.startswith('git-'):
|
||||||
|
caption_model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path),
|
||||||
|
torch_dtype=torch.float32)
|
||||||
|
elif self.config.caption_model_name.startswith('blip2-'):
|
||||||
|
caption_model = Blip2ForConditionalGeneration.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path),
|
||||||
|
torch_dtype=self.dtype)
|
||||||
|
else:
|
||||||
|
caption_model = BlipForConditionalGeneration.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path),
|
||||||
|
torch_dtype=self.dtype)
|
||||||
|
self.caption_processor = AutoProcessor.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path))
|
||||||
|
|
||||||
|
caption_model.eval()
|
||||||
|
if not self.config.caption_offload:
|
||||||
|
caption_model = caption_model.to(self.config.device)
|
||||||
|
self.caption_model = caption_model
|
||||||
|
else:
|
||||||
|
self.caption_model = self.config.caption_model
|
||||||
|
self.caption_processor = self.config.caption_processor
|
||||||
|
|
||||||
|
def load_clip_model(self):
|
||||||
|
start_time = time.time()
|
||||||
|
config = self.config
|
||||||
|
|
||||||
|
clip_model_name, clip_model_pretrained_name = config.clip_model_name.split(
|
||||||
|
'/', 2)
|
||||||
|
|
||||||
|
if config.clip_model is None:
|
||||||
|
if not config.quiet:
|
||||||
|
print(f'Loading CLIP model {config.clip_model_name}...')
|
||||||
|
|
||||||
|
self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
|
||||||
|
clip_model_name,
|
||||||
|
pretrained=clip_model_pretrained_name,
|
||||||
|
precision='fp16' if config.device == 'cuda' else 'fp32',
|
||||||
|
device=config.device,
|
||||||
|
jit=False,
|
||||||
|
cache_dir=config.clip_model_path)
|
||||||
|
self.clip_model.eval()
|
||||||
|
else:
|
||||||
|
self.clip_model = config.clip_model
|
||||||
|
self.clip_preprocess = config.clip_preprocess
|
||||||
|
self.tokenize = open_clip.get_tokenizer(clip_model_name)
|
||||||
|
|
||||||
|
sites = [
|
||||||
|
'Artstation', 'behance', 'cg society', 'cgsociety', 'deviantart',
|
||||||
|
'dribbble', 'flickr', 'instagram', 'pexels', 'pinterest',
|
||||||
|
'pixabay', 'pixiv', 'polycount', 'reddit', 'shutterstock',
|
||||||
|
'tumblr', 'unsplash', 'zbrush central'
|
||||||
|
]
|
||||||
|
trending_list = [site for site in sites]
|
||||||
|
trending_list.extend(['trending on ' + site for site in sites])
|
||||||
|
trending_list.extend(['featured on ' + site for site in sites])
|
||||||
|
trending_list.extend([site + ' contest winner' for site in sites])
|
||||||
|
|
||||||
|
raw_artists = load_list(config.data_path, 'artists.txt')
|
||||||
|
artists = [f'by {a}' for a in raw_artists]
|
||||||
|
artists.extend([f'inspired by {a}' for a in raw_artists])
|
||||||
|
|
||||||
|
self._prepare_clip()
|
||||||
|
self.artists = LabelTable(artists, 'artists', self)
|
||||||
|
self.flavors = LabelTable(
|
||||||
|
load_list(config.data_path, 'flavors.txt'), 'flavors', self)
|
||||||
|
self.mediums = LabelTable(
|
||||||
|
load_list(config.data_path, 'mediums.txt'), 'mediums', self)
|
||||||
|
self.movements = LabelTable(
|
||||||
|
load_list(config.data_path, 'movements.txt'), 'movements', self)
|
||||||
|
self.trendings = LabelTable(trending_list, 'trendings', self)
|
||||||
|
self.negative = LabelTable(
|
||||||
|
load_list(config.data_path, 'negative.txt'), 'negative', self)
|
||||||
|
|
||||||
|
end_time = time.time()
|
||||||
|
if not config.quiet:
|
||||||
|
print(
|
||||||
|
f'Loaded CLIP model and data in {end_time-start_time:.2f} seconds.'
|
||||||
|
)
|
||||||
|
|
||||||
|
def chain(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
phrases: List[str],
|
||||||
|
best_prompt: str = '',
|
||||||
|
best_sim: float = 0,
|
||||||
|
min_count: int = 8,
|
||||||
|
max_count: int = 32,
|
||||||
|
desc='Chaining',
|
||||||
|
reverse: bool = False) -> str:
|
||||||
|
self._prepare_clip()
|
||||||
|
|
||||||
|
phrases = set(phrases)
|
||||||
|
if not best_prompt:
|
||||||
|
best_prompt = self.rank_top(
|
||||||
|
image_features, [f for f in phrases], reverse=reverse)
|
||||||
|
best_sim = self.similarity(image_features, best_prompt)
|
||||||
|
phrases.remove(best_prompt)
|
||||||
|
curr_prompt, curr_sim = best_prompt, best_sim
|
||||||
|
|
||||||
|
def check(addition: str, idx: int) -> bool:
|
||||||
|
nonlocal best_prompt, best_sim, curr_prompt, curr_sim
|
||||||
|
prompt = curr_prompt + ', ' + addition
|
||||||
|
sim = self.similarity(image_features, prompt)
|
||||||
|
if reverse:
|
||||||
|
sim = -sim
|
||||||
|
|
||||||
|
if sim > best_sim:
|
||||||
|
best_prompt, best_sim = prompt, sim
|
||||||
|
if sim > curr_sim or idx < min_count:
|
||||||
|
curr_prompt, curr_sim = prompt, sim
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
for idx in tqdm(
|
||||||
|
range(max_count), desc=desc, disable=self.config.quiet):
|
||||||
|
best = self.rank_top(
|
||||||
|
image_features, [f'{curr_prompt}, {f}' for f in phrases],
|
||||||
|
reverse=reverse)
|
||||||
|
flave = best[len(curr_prompt) + 2:]
|
||||||
|
if not check(flave, idx):
|
||||||
|
break
|
||||||
|
if _prompt_at_max_len(curr_prompt, self.tokenize):
|
||||||
|
break
|
||||||
|
phrases.remove(flave)
|
||||||
|
|
||||||
|
return best_prompt
|
||||||
|
|
||||||
|
def generate_caption(self, pil_image: Image) -> str:
|
||||||
|
assert self.caption_model is not None, 'No caption model loaded.'
|
||||||
|
self._prepare_caption()
|
||||||
|
inputs = self.caption_processor(
|
||||||
|
images=pil_image, return_tensors='pt').to(self.device)
|
||||||
|
if not self.config.caption_model_name.startswith('git-'):
|
||||||
|
inputs = inputs.to(self.dtype)
|
||||||
|
tokens = self.caption_model.generate(
|
||||||
|
**inputs, max_new_tokens=self.config.caption_max_length)
|
||||||
|
return self.caption_processor.batch_decode(
|
||||||
|
tokens, skip_special_tokens=True)[0].strip()
|
||||||
|
|
||||||
|
def image_to_features(self, image: Image) -> torch.Tensor:
|
||||||
|
self._prepare_clip()
|
||||||
|
images = self.clip_preprocess(image).unsqueeze(0).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
image_features = self.clip_model.encode_image(images)
|
||||||
|
image_features /= image_features.norm(dim=-1, keepdim=True)
|
||||||
|
return image_features
|
||||||
|
|
||||||
|
def interrogate_classic(self,
|
||||||
|
image: Image,
|
||||||
|
max_flavors: int = 3,
|
||||||
|
caption: Optional[str] = None) -> str:
|
||||||
|
"""Classic mode creates a prompt in a standard format first describing the image,
|
||||||
|
then listing the artist, trending, movement, and flavor text modifiers."""
|
||||||
|
caption = caption or self.generate_caption(image)
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
|
||||||
|
medium = self.mediums.rank(image_features, 1)[0]
|
||||||
|
artist = self.artists.rank(image_features, 1)[0]
|
||||||
|
trending = self.trendings.rank(image_features, 1)[0]
|
||||||
|
movement = self.movements.rank(image_features, 1)[0]
|
||||||
|
flaves = ', '.join(self.flavors.rank(image_features, max_flavors))
|
||||||
|
|
||||||
|
if caption.startswith(medium):
|
||||||
|
prompt = f'{caption} {artist}, {trending}, {movement}, {flaves}'
|
||||||
|
else:
|
||||||
|
prompt = f'{caption}, {medium} {artist}, {trending}, {movement}, {flaves}'
|
||||||
|
|
||||||
|
return _truncate_to_fit(prompt, self.tokenize)
|
||||||
|
|
||||||
|
def interrogate_fast(self,
|
||||||
|
image: Image,
|
||||||
|
max_flavors: int = 32,
|
||||||
|
caption: Optional[str] = None) -> str:
|
||||||
|
"""Fast mode simply adds the top ranked terms after a caption. It generally results in
|
||||||
|
better similarity between generated prompt and image than classic mode, but the prompts
|
||||||
|
are less readable."""
|
||||||
|
caption = caption or self.generate_caption(image)
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
merged = _merge_tables([
|
||||||
|
self.artists, self.flavors, self.mediums, self.movements,
|
||||||
|
self.trendings
|
||||||
|
], self)
|
||||||
|
tops = merged.rank(image_features, max_flavors)
|
||||||
|
return _truncate_to_fit(caption + ', ' + ', '.join(tops),
|
||||||
|
self.tokenize)
|
||||||
|
|
||||||
|
def interrogate_negative(self, image: Image, max_flavors: int = 32) -> str:
|
||||||
|
"""Negative mode chains together the most dissimilar terms to the image. It can be used
|
||||||
|
to help build a negative prompt to pair with the regular positive prompt and often
|
||||||
|
improve the results of generated images particularly with Stable Diffusion 2."""
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
flaves = self.flavors.rank(
|
||||||
|
image_features,
|
||||||
|
self.config.flavor_intermediate_count,
|
||||||
|
reverse=True)
|
||||||
|
flaves = flaves + self.negative.labels
|
||||||
|
return self.chain(
|
||||||
|
image_features,
|
||||||
|
flaves,
|
||||||
|
max_count=max_flavors,
|
||||||
|
reverse=True,
|
||||||
|
desc='Negative chain')
|
||||||
|
|
||||||
|
def interrogate(self,
|
||||||
|
image: Image,
|
||||||
|
min_flavors: int = 8,
|
||||||
|
max_flavors: int = 32,
|
||||||
|
caption: Optional[str] = None) -> str:
|
||||||
|
caption = caption or self.generate_caption(image)
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
|
||||||
|
merged = _merge_tables([
|
||||||
|
self.artists, self.flavors, self.mediums, self.movements,
|
||||||
|
self.trendings
|
||||||
|
], self)
|
||||||
|
flaves = merged.rank(image_features,
|
||||||
|
self.config.flavor_intermediate_count)
|
||||||
|
best_prompt, best_sim = caption, self.similarity(
|
||||||
|
image_features, caption)
|
||||||
|
best_prompt = self.chain(
|
||||||
|
image_features,
|
||||||
|
flaves,
|
||||||
|
best_prompt,
|
||||||
|
best_sim,
|
||||||
|
min_count=min_flavors,
|
||||||
|
max_count=max_flavors,
|
||||||
|
desc='Flavor chain')
|
||||||
|
|
||||||
|
fast_prompt = self.interrogate_fast(
|
||||||
|
image, max_flavors, caption=caption)
|
||||||
|
classic_prompt = self.interrogate_classic(
|
||||||
|
image, max_flavors, caption=caption)
|
||||||
|
candidates = [caption, classic_prompt, fast_prompt, best_prompt]
|
||||||
|
return candidates[np.argmax(
|
||||||
|
self.similarities(image_features, candidates))]
|
||||||
|
|
||||||
|
def rank_top(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
text_array: List[str],
|
||||||
|
reverse: bool = False) -> str:
|
||||||
|
self._prepare_clip()
|
||||||
|
text_tokens = self.tokenize([text
|
||||||
|
for text in text_array]).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = self.clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
similarity = text_features @ image_features.T
|
||||||
|
if reverse:
|
||||||
|
similarity = -similarity
|
||||||
|
return text_array[similarity.argmax().item()]
|
||||||
|
|
||||||
|
def similarity(self, image_features: torch.Tensor, text: str) -> float:
|
||||||
|
self._prepare_clip()
|
||||||
|
text_tokens = self.tokenize([text]).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = self.clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
similarity = text_features @ image_features.T
|
||||||
|
return similarity[0][0].item()
|
||||||
|
|
||||||
|
def similarities(self, image_features: torch.Tensor,
|
||||||
|
text_array: List[str]) -> List[float]:
|
||||||
|
self._prepare_clip()
|
||||||
|
text_tokens = self.tokenize([text
|
||||||
|
for text in text_array]).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = self.clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
similarity = text_features @ image_features.T
|
||||||
|
return similarity.T[0].tolist()
|
||||||
|
|
||||||
|
def _prepare_caption(self):
|
||||||
|
if self.config.clip_offload and not self.clip_offloaded:
|
||||||
|
self.clip_model = self.clip_model.to('cpu')
|
||||||
|
self.clip_offloaded = True
|
||||||
|
if self.caption_offloaded:
|
||||||
|
self.caption_model = self.caption_model.to(self.device)
|
||||||
|
self.caption_offloaded = False
|
||||||
|
|
||||||
|
def _prepare_clip(self):
|
||||||
|
if self.config.caption_offload and not self.caption_offloaded:
|
||||||
|
self.caption_model = self.caption_model.to('cpu')
|
||||||
|
self.caption_offloaded = True
|
||||||
|
if self.clip_offloaded:
|
||||||
|
self.clip_model = self.clip_model.to(self.device)
|
||||||
|
self.clip_offloaded = False
|
||||||
|
|
||||||
|
|
||||||
|
class LabelTable():
|
||||||
|
|
||||||
|
def __init__(self, labels: List[str], desc: str, ci: Interrogator):
|
||||||
|
clip_model, config = ci.clip_model, ci.config
|
||||||
|
self.chunk_size = config.chunk_size
|
||||||
|
self.config = config
|
||||||
|
self.device = config.device
|
||||||
|
self.embeds = []
|
||||||
|
self.labels = labels
|
||||||
|
self.tokenize = ci.tokenize
|
||||||
|
|
||||||
|
hash = hashlib.sha256(','.join(labels).encode()).hexdigest()
|
||||||
|
sanitized_name = self.config.clip_model_name.replace('/', '_').replace(
|
||||||
|
'@', '_')
|
||||||
|
self._load_cached(desc, hash, sanitized_name)
|
||||||
|
|
||||||
|
if len(self.labels) != len(self.embeds):
|
||||||
|
self.embeds = []
|
||||||
|
chunks = np.array_split(
|
||||||
|
self.labels, max(1,
|
||||||
|
len(self.labels) / config.chunk_size))
|
||||||
|
for chunk in tqdm(
|
||||||
|
chunks,
|
||||||
|
desc=f'Preprocessing {desc}' if desc else None,
|
||||||
|
disable=self.config.quiet):
|
||||||
|
text_tokens = self.tokenize(chunk).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
text_features = text_features.half().cpu().numpy()
|
||||||
|
for i in range(text_features.shape[0]):
|
||||||
|
self.embeds.append(text_features[i])
|
||||||
|
|
||||||
|
if desc and self.config.cache_path:
|
||||||
|
os.makedirs(self.config.cache_path, exist_ok=True)
|
||||||
|
cache_filepath = os.path.join(
|
||||||
|
self.config.cache_path,
|
||||||
|
f'{sanitized_name}_{desc}.safetensors')
|
||||||
|
tensors = {
|
||||||
|
'embeds': np.stack(self.embeds),
|
||||||
|
'hash': np.array([ord(c) for c in hash], dtype=np.int8)
|
||||||
|
}
|
||||||
|
save_file(tensors, cache_filepath)
|
||||||
|
|
||||||
|
if self.device == 'cpu' or self.device == torch.device('cpu'):
|
||||||
|
self.embeds = [e.astype(np.float32) for e in self.embeds]
|
||||||
|
|
||||||
|
def _load_cached(self, desc: str, hash: str, sanitized_name: str) -> bool:
|
||||||
|
if self.config.cache_path is None or desc is None:
|
||||||
|
return False
|
||||||
|
|
||||||
|
cached_safetensors = os.path.join(
|
||||||
|
self.config.cache_path, f'{sanitized_name}_{desc}.safetensors')
|
||||||
|
|
||||||
|
if os.path.exists(cached_safetensors):
|
||||||
|
try:
|
||||||
|
tensors = load_file(cached_safetensors)
|
||||||
|
except Exception as e:
|
||||||
|
print(f'Failed to load {cached_safetensors}')
|
||||||
|
print(e)
|
||||||
|
return False
|
||||||
|
if 'hash' in tensors and 'embeds' in tensors:
|
||||||
|
if np.array_equal(
|
||||||
|
tensors['hash'],
|
||||||
|
np.array([ord(c) for c in hash], dtype=np.int8)):
|
||||||
|
self.embeds = tensors['embeds']
|
||||||
|
if len(self.embeds.shape) == 2:
|
||||||
|
self.embeds = [
|
||||||
|
self.embeds[i] for i in range(self.embeds.shape[0])
|
||||||
|
]
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _rank(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
text_embeds: torch.Tensor,
|
||||||
|
top_count: int = 1,
|
||||||
|
reverse: bool = False) -> str:
|
||||||
|
top_count = min(top_count, len(text_embeds))
|
||||||
|
text_embeds = torch.stack([torch.from_numpy(t)
|
||||||
|
for t in text_embeds]).to(self.device)
|
||||||
|
with torch.cuda.amp.autocast():
|
||||||
|
similarity = image_features @ text_embeds.T
|
||||||
|
if reverse:
|
||||||
|
similarity = -similarity
|
||||||
|
_, top_labels = similarity.float().cpu().topk(top_count, dim=-1)
|
||||||
|
return [top_labels[0][i].numpy() for i in range(top_count)]
|
||||||
|
|
||||||
|
def rank(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
top_count: int = 1,
|
||||||
|
reverse: bool = False) -> List[str]:
|
||||||
|
if len(self.labels) <= self.chunk_size:
|
||||||
|
tops = self._rank(
|
||||||
|
image_features,
|
||||||
|
self.embeds,
|
||||||
|
top_count=top_count,
|
||||||
|
reverse=reverse)
|
||||||
|
return [self.labels[i] for i in tops]
|
||||||
|
|
||||||
|
num_chunks = int(math.ceil(len(self.labels) / self.chunk_size))
|
||||||
|
keep_per_chunk = int(self.chunk_size / num_chunks)
|
||||||
|
|
||||||
|
top_labels, top_embeds = [], []
|
||||||
|
for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet):
|
||||||
|
start = chunk_idx * self.chunk_size
|
||||||
|
stop = min(start + self.chunk_size, len(self.embeds))
|
||||||
|
tops = self._rank(
|
||||||
|
image_features,
|
||||||
|
self.embeds[start:stop],
|
||||||
|
top_count=keep_per_chunk,
|
||||||
|
reverse=reverse)
|
||||||
|
top_labels.extend([self.labels[start + i] for i in tops])
|
||||||
|
top_embeds.extend([self.embeds[start + i] for i in tops])
|
||||||
|
|
||||||
|
tops = self._rank(image_features, top_embeds, top_count=top_count)
|
||||||
|
return [top_labels[i] for i in tops]
|
||||||
|
|
||||||
|
|
||||||
|
def _download_file(url: str,
|
||||||
|
filepath: str,
|
||||||
|
chunk_size: int = 4 * 1024 * 1024,
|
||||||
|
quiet: bool = False):
|
||||||
|
r = requests.get(url, stream=True)
|
||||||
|
if r.status_code != 200:
|
||||||
|
return
|
||||||
|
|
||||||
|
file_size = int(r.headers.get('Content-Length', 0))
|
||||||
|
filename = url.split('/')[-1]
|
||||||
|
progress = tqdm(
|
||||||
|
total=file_size,
|
||||||
|
unit='B',
|
||||||
|
unit_scale=True,
|
||||||
|
desc=filename,
|
||||||
|
disable=quiet)
|
||||||
|
with open(filepath, 'wb') as f:
|
||||||
|
for chunk in r.iter_content(chunk_size=chunk_size):
|
||||||
|
if chunk:
|
||||||
|
f.write(chunk)
|
||||||
|
progress.update(len(chunk))
|
||||||
|
progress.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_tables(tables: List[LabelTable], ci: Interrogator) -> LabelTable:
|
||||||
|
m = LabelTable([], None, ci)
|
||||||
|
for table in tables:
|
||||||
|
m.labels.extend(table.labels)
|
||||||
|
m.embeds.extend(table.embeds)
|
||||||
|
return m
|
||||||
|
|
||||||
|
|
||||||
|
def _prompt_at_max_len(text: str, tokenize) -> bool:
|
||||||
|
tokens = tokenize([text])
|
||||||
|
return tokens[0][-1] != 0
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate_to_fit(text: str, tokenize) -> str:
|
||||||
|
parts = text.split(', ')
|
||||||
|
new_text = parts[0]
|
||||||
|
for part in parts[1:]:
|
||||||
|
if _prompt_at_max_len(new_text + part, tokenize):
|
||||||
|
break
|
||||||
|
new_text += ', ' + part
|
||||||
|
return new_text
|
||||||
|
|
||||||
|
|
||||||
|
def list_caption_models() -> List[str]:
|
||||||
|
return list(CAPTION_MODELS.keys())
|
||||||
|
|
||||||
|
|
||||||
|
def list_clip_models() -> List[str]:
|
||||||
|
return ['/'.join(x) for x in open_clip.list_pretrained()]
|
||||||
|
|
||||||
|
|
||||||
|
def load_list(data_path: str, filename: Optional[str] = None) -> List[str]:
|
||||||
|
"""Load a list of strings from a file."""
|
||||||
|
if filename is not None:
|
||||||
|
data_path = os.path.join(data_path, filename)
|
||||||
|
with open(data_path, 'r', encoding='utf-8', errors='replace') as f:
|
||||||
|
items = [line.strip() for line in f.readlines()]
|
||||||
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
@MODELS.register_module(
|
||||||
|
Tasks.image_captioning, module_name=Models.clip_interrogator)
|
||||||
|
class CLIP_Interrogator(TorchModel):
|
||||||
|
|
||||||
|
def __init__(self, model_dir, device='cuda', device_id=0, *args, **kwargs):
|
||||||
|
super().__init__(
|
||||||
|
model_dir=model_dir, device_id=device_id, *args, **kwargs)
|
||||||
|
self.device = device
|
||||||
|
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||||
|
cf = Config(clip_model_name='ViT-L-14/openai')
|
||||||
|
cf.data_path = os.path.join(model_dir, 'data')
|
||||||
|
cf.clip_model_path = model_dir
|
||||||
|
cf.cache_path = model_dir
|
||||||
|
self.ci = Interrogator(cf)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
image = transforms.ToPILImage()(inputs)
|
||||||
|
return {'caption': self.ci.interrogate(image)}
|
||||||
@@ -128,13 +128,13 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
                           local_transform,
                           s=None,
                           e=None):
-        video_mask = np.zeros(self.max_frames, dtype=np.long)
+        video_mask = np.zeros(self.max_frames, dtype=int)
         max_video_length = 0

         # T x 3 x H x W
         video = np.zeros((self.max_frames, 3, rawVideoExtractor.size,
                           rawVideoExtractor.size),
-                         dtype=np.float)
+                         dtype=float)

         if s is None:
             start_time, end_time = None, None
modelscope/models/multi_modal/mplug_owl/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig,
+                                      MplugOwlVisualAbstractorConfig)
+from .modeling_mplug_owl import MplugOwlForConditionalGeneration
@@ -0,0 +1,257 @@
|
|||||||
|
# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors.
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" MPLUG OWL model configuration """
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from transformers import PretrainedConfig
|
||||||
|
from transformers.models.auto import CONFIG_MAPPING
|
||||||
|
from transformers.utils import logging
|
||||||
|
|
||||||
|
from modelscope.utils.constant import Tasks
|
||||||
|
|
||||||
|
logger = logging.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class MplugOwlVisionConfig(PretrainedConfig):
|
||||||
|
r"""
|
||||||
|
Args:
|
||||||
|
hidden_size (`int`, *optional*, defaults to 768):
|
||||||
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
|
intermediate_size (`int`, *optional*, defaults to 3072):
|
||||||
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
|
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
image_size (`int`, *optional*, defaults to 224):
|
||||||
|
The size (resolution) of each image.
|
||||||
|
patch_size (`int`, *optional*, defaults to 32):
|
||||||
|
The size (resolution) of each patch.
|
||||||
|
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
||||||
|
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||||
|
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
|
||||||
|
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||||
|
The epsilon used by the layer normalization layers.
|
||||||
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
|
The dropout ratio for the attention probabilities.
|
||||||
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
initializer_factor (`float`, *optional*, defaults to 1):
|
||||||
|
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||||
|
testing).
|
||||||
|
```"""
|
||||||
|
|
||||||
|
model_type = 'mplug_owl_vision_model'
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size=1024,
|
||||||
|
intermediate_size=4096,
|
||||||
|
projection_dim=768,
|
||||||
|
num_hidden_layers=24,
|
||||||
|
num_attention_heads=16,
|
||||||
|
num_channels=3,
|
||||||
|
image_size=224,
|
||||||
|
patch_size=14,
|
||||||
|
hidden_act='quick_gelu',
|
||||||
|
layer_norm_eps=1e-6,
|
||||||
|
attention_dropout=0.0,
|
||||||
|
initializer_range=0.02,
|
||||||
|
initializer_factor=1.0,
|
||||||
|
use_flash_attn=False,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.projection_dim = projection_dim
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.num_channels = num_channels
|
||||||
|
self.patch_size = patch_size
|
||||||
|
self.image_size = image_size
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.initializer_factor = initializer_factor
|
||||||
|
self.attention_dropout = attention_dropout
|
||||||
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
self.hidden_act = hidden_act
|
||||||
|
self.use_flash_attn = use_flash_attn
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||||
|
os.PathLike],
|
||||||
|
**kwargs) -> 'PretrainedConfig':
|
||||||
|
config_dict, kwargs = cls.get_config_dict(
|
||||||
|
pretrained_model_name_or_path, **kwargs)
|
||||||
|
|
||||||
|
# get the vision config dict if we are loading from MplugOwlConfig
|
||||||
|
if config_dict.get('model_type') == 'mplug_owl':
|
||||||
|
config_dict = config_dict['vision_config']
|
||||||
|
|
||||||
|
if 'model_type' in config_dict and hasattr(
|
||||||
|
cls,
|
||||||
|
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||||
|
logger.warning(
|
||||||
|
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||||
|
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||||
|
)
|
||||||
|
|
||||||
|
return cls.from_dict(config_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class MplugOwlVisualAbstractorConfig(PretrainedConfig):
|
||||||
|
|
||||||
|
model_type = 'MPlugOwlVisualAbstractor'
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size=1024,
|
||||||
|
num_hidden_layers=6,
|
||||||
|
num_attention_heads=16,
|
||||||
|
intermediate_size=4096,
|
||||||
|
attention_probs_dropout_prob=0.1,
|
||||||
|
initializer_range=0.02,
|
||||||
|
layer_norm_eps=1e-6,
|
||||||
|
encoder_hidden_size=1024,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
self.encoder_hidden_size = encoder_hidden_size
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||||
|
os.PathLike],
|
||||||
|
**kwargs) -> 'PretrainedConfig':
|
||||||
|
config_dict, kwargs = cls.get_config_dict(
|
||||||
|
pretrained_model_name_or_path, **kwargs)
|
||||||
|
|
||||||
|
# get the qformer config dict if we are loading from MplugOwlConfig
|
||||||
|
if config_dict.get('model_type') == 'mplug_owl':
|
||||||
|
config_dict = config_dict['abstractor_config']
|
||||||
|
|
||||||
|
if 'model_type' in config_dict and hasattr(
|
||||||
|
cls,
|
||||||
|
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||||
|
logger.warning(
|
||||||
|
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||||
|
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||||
|
)
|
||||||
|
|
||||||
|
return cls.from_dict(config_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class MplugOwlConfig(PretrainedConfig):
|
||||||
|
r"""
|
||||||
|
Args:
|
||||||
|
vision_config (`dict`, *optional*):
|
||||||
|
Dictionary of configuration options used to initialize [`MplugOwlVisionConfig`].
|
||||||
|
qformer_config (`dict`, *optional*):
|
||||||
|
Dictionary of configuration options used to initialize [`MplugOwlVisualAbstractorConfig`].
|
||||||
|
text_config (`dict`, *optional*):
|
||||||
|
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
|
||||||
|
num_query_tokens (`int`, *optional*, defaults to 32):
|
||||||
|
The number of query tokens passed through the Transformer.
|
||||||
|
|
||||||
|
kwargs (*optional*):
|
||||||
|
Dictionary of keyword arguments.
|
||||||
|
"""
|
||||||
|
|
||||||
|
model_type = 'mplug_owl'
|
||||||
|
is_composition = True
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
task=Tasks.multimodal_dialogue,
|
||||||
|
vision_config=None,
|
||||||
|
visual_abstractor_config=None,
|
||||||
|
text_config=None,
|
||||||
|
num_query_tokens=64,
|
||||||
|
**kwargs):
|
||||||
|
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.task = task
|
||||||
|
if vision_config is None:
|
||||||
|
vision_config = MplugOwlVisionConfig().to_dict()
|
||||||
|
logger.info('vision_config is None.')
|
||||||
|
|
||||||
|
if visual_abstractor_config is None:
|
||||||
|
visual_abstractor_config = {}
|
||||||
|
logger.info('abstractor_config is None. ')
|
||||||
|
|
||||||
|
if text_config is None:
|
||||||
|
# we use LLAMA 7b by default
|
||||||
|
from transformers.models.llama.configuration_llama import \
|
||||||
|
LlamaConfig
|
||||||
|
text_config = LlamaConfig(pad_token_id=2).to_dict()
|
||||||
|
logger.info('text_config is None.')
|
||||||
|
|
||||||
|
self.vision_config = MplugOwlVisionConfig(**vision_config)
|
||||||
|
self.visual_abstractor_config = MplugOwlVisualAbstractorConfig(
|
||||||
|
**visual_abstractor_config)
|
||||||
|
text_model_type = text_config[
|
||||||
|
'model_type'] if 'model_type' in text_config else 'llama'
|
||||||
|
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
|
||||||
|
|
||||||
|
self.tie_word_embeddings = self.text_config.tie_word_embeddings
|
||||||
|
|
||||||
|
self.num_query_tokens = num_query_tokens
|
||||||
|
self.initializer_factor = 1.0
|
||||||
|
self.initializer_range = 0.02
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_vision_abstractor_text_configs(
|
||||||
|
cls,
|
||||||
|
vision_config: MplugOwlVisionConfig,
|
||||||
|
visual_abstractor_config: MplugOwlVisualAbstractorConfig,
|
||||||
|
text_config: PretrainedConfig,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
r"""
|
||||||
|
Returns:
|
||||||
|
[`MplugOwlConfig`]: An instance of a configuration object
|
||||||
|
"""
|
||||||
|
|
||||||
|
return cls(
|
||||||
|
vision_config=vision_config.to_dict(),
|
||||||
|
visual_abstractor_config=visual_abstractor_config.to_dict(),
|
||||||
|
text_config=text_config.to_dict(),
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
"""
|
||||||
|
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
|
||||||
|
"""
|
||||||
|
output = copy.deepcopy(self.__dict__)
|
||||||
|
output['vision_config'] = self.vision_config.to_dict()
|
||||||
|
tmp = self.visual_abstractor_config.to_dict()
|
||||||
|
output['visual_abstractor_config'] = tmp
|
||||||
|
output['text_config'] = self.text_config.to_dict()
|
||||||
|
output['model_type'] = self.__class__.model_type
|
||||||
|
return output
|
||||||
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py (new file, 1551 lines; diff suppressed because it is too large)
@@ -212,10 +212,10 @@ class ConstructBlockStrategy:
                        block_spans,
                        rng,
                        task='bert'):
-        position_ids = np.arange(len(tokens), dtype=np.long)
+        position_ids = np.arange(len(tokens), dtype=int)
         targets = copy.deepcopy(tokens)
         mask_id = self.tokenizer.get_command('MASK').Id
-        mlm_masks = np.zeros(len(tokens), dtype=np.long)
+        mlm_masks = np.zeros(len(tokens), dtype=int)
         for start, end in block_spans:
             for idx in range(start, end):
                 tokens[idx] = mask_id
@@ -231,7 +231,7 @@ class ConstructBlockStrategy:
                        rng,
                        task='bert'):
         text_length = len(tokens)
-        position_ids = np.ones(len(tokens), dtype=np.long)
+        position_ids = np.ones(len(tokens), dtype=int)
         for start, end in block_spans:
             position_ids[start + 1:end] = 0
         position_ids = np.cumsum(position_ids) - 1
@@ -270,7 +270,7 @@ class ConstructBlockStrategy:
                                           (end - start + 1))
             if self.block_position_encoding:
                 target_block_position_ids.append(
-                    np.arange(1, end - start + 2, dtype=np.long))
+                    np.arange(1, end - start + 2, dtype=int))
             else:
                 target_block_position_ids.append([1] * (end - start + 1))
         block_spans.sort(key=lambda x: x[0])
@@ -307,7 +307,7 @@ class ConstructBlockStrategy:
             target_tokens = target_tokens + [
                 self.tokenizer.get_command('eop').Id
             ]
-            loss_masks = np.ones(len(target_tokens), dtype=np.long)
+            loss_masks = np.ones(len(target_tokens), dtype=int)
             return source_tokens, target_tokens, loss_masks
         else:
             tokens = np.concatenate(source_tokens + target_tokens)
@@ -326,12 +326,12 @@ class ConstructBlockStrategy:
             for pos in mask_pos:
                 tokens[pos] = self.tokenizer.get_command('dBLOCK').Id
             targets = np.concatenate(source_tokens + targets)
-            loss_masks = np.ones(len(tokens), dtype=np.long)
+            loss_masks = np.ones(len(tokens), dtype=int)
             loss_masks[:source_length] = 0
             position_ids = np.concatenate(source_position_ids
                                           + target_position_ids)
             block_position_ids = np.concatenate(
-                [np.zeros(source_length, dtype=np.long)]
+                [np.zeros(source_length, dtype=int)]
                 + target_block_position_ids)
             position_ids = np.stack([position_ids, block_position_ids], axis=0)
             if attention_mask is not None:
@@ -539,22 +539,21 @@ class ConstructBlockStrategy:
                     (source_tokens, [self.generation_mask], target_tokens))
                 loss_masks = np.concatenate(
                     (np.zeros(len(source_tokens) + 1,
-                              dtype=np.long), target_masks))
+                              dtype=int), target_masks))
                 token_batch.append(tokens)
                 target_batch.append(targets)
                 loss_mask_batch.append(loss_masks)
                 position_ids = np.arange(
-                    len(source_tokens) + len(target_tokens) + 1,
-                    dtype=np.long)
+                    len(source_tokens) + len(target_tokens) + 1, dtype=int)
                 position_ids[len(source_tokens) + 1:] = len(source_tokens)
                 if self.block_position_encoding:
                     block_position_ids = np.concatenate(
-                        (np.zeros(len(source_tokens), dtype=np.long),
-                         np.arange(len(target_tokens) + 1, dtype=np.long)))
+                        (np.zeros(len(source_tokens), dtype=int),
+                         np.arange(len(target_tokens) + 1, dtype=int)))
                 else:
                     block_position_ids = np.concatenate(
-                        (np.zeros(len(source_tokens) + 1, dtype=np.long),
-                         np.ones(len(target_tokens) + 1, dtype=np.long)))
+                        (np.zeros(len(source_tokens) + 1, dtype=int),
+                         np.ones(len(target_tokens) + 1, dtype=int)))
                 position_id_batch.append(
                     np.stack([position_ids, block_position_ids], axis=0))
             else:
@@ -597,27 +596,25 @@ class ConstructBlockStrategy:
         max_length = max(seq_lengths)
         token_batch = [
             np.concatenate(
-                (tokens, np.zeros(max_length - len(tokens),
-                                  dtype=np.long)))
+                (tokens, np.zeros(max_length - len(tokens), dtype=int)))
             for tokens in token_batch
         ]
         target_batch = [
             np.concatenate(
-                (targets,
-                 np.zeros(max_length - len(targets), dtype=np.long)))
+                (targets, np.zeros(max_length - len(targets), dtype=int)))
             for targets in target_batch
         ]
         loss_mask_batch = [
             np.concatenate(
                 (loss_masks,
-                 np.zeros(max_length - len(loss_masks), dtype=np.long)))
+                 np.zeros(max_length - len(loss_masks), dtype=int)))
             for loss_masks in loss_mask_batch
         ]
         position_id_batch = [
-            np.concatenate((position_ids,
-                            np.zeros(
-                                (2, max_length - position_ids.shape[1]),
-                                dtype=np.long)),
+            np.concatenate(
+                (position_ids,
+                 np.zeros(
+                     (2, max_length - position_ids.shape[1]), dtype=int)),
                 axis=1) for position_ids in position_id_batch
         ]
         return token_batch, target_batch, loss_mask_batch, position_id_batch
@@ -583,8 +583,8 @@ class XLDataset(data.Dataset):
     def getidx(self, idx):
         tokens, targets, loss_masks = [], [], []
         attention_mask = np.concatenate(
-            (np.zeros((self.max_seq_len, self.mem_len), dtype=np.long),
-             np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)),
+            (np.zeros((self.max_seq_len, self.mem_len), dtype=int),
+             np.ones((self.max_seq_len, self.max_seq_len), dtype=int)),
            axis=1)
         sample_idx = bisect_right(self.indices, idx * self.max_seq_len)
         last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1]
@@ -28,7 +28,7 @@ def main():
     counts = np.array([0] * 10)
     for _ in range(10000):
         spans = strategy.sample_span_in_document(
-            np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1],
+            np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=int), [1, 1],
             random.Random())
         for start, end in spans:
             counts[start:end] += 1
@@ -17,7 +17,7 @@ def main():
         num_iters=300000,
         decay_style='cosine',
         decay_ratio=0.1)
-    steps = np.arange(0, 400000, 10, dtype=np.long)
+    steps = np.arange(0, 400000, 10, dtype=int)
     rates = []
     for step in steps:
         lr_scheduler.num_iters = step
@@ -5,12 +5,12 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule

 if TYPE_CHECKING:
-    from .configuration_unite import UniTEConfig
-    from .modeling_unite import UniTEForTranslationEvaluation
+    from .configuration import UniTEConfig
+    from .translation_evaluation import UniTEForTranslationEvaluation
 else:
     _import_structure = {
-        'configuration_unite': ['UniTEConfig'],
-        'modeling_unite': ['UniTEForTranslationEvaluation'],
+        'configuration': ['UniTEConfig'],
+        'translation_evaluation': ['UniTEForTranslationEvaluation'],
     }

     import sys
@@ -9,7 +9,7 @@ from modelscope.utils.config import Config
 logger = logging.get_logger()


-class EvaluationMode(Enum):
+class InputFormat(Enum):
     SRC = 'src'
     REF = 'ref'
     SRC_REF = 'src-ref'
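The rename above keeps the enum values ('src', 'ref', 'src-ref') and only changes the class name to reflect that it selects the metric's input format rather than an evaluation mode. A hedged sketch of how such an enum is typically consumed (the helper below is illustrative, not the pipeline's actual code):

from enum import Enum


class InputFormat(Enum):
    SRC = 'src'
    REF = 'ref'
    SRC_REF = 'src-ref'


def build_metric_inputs(hyp, src, ref, input_format: InputFormat):
    # Pick which signals accompany the hypothesis, mirroring the
    # src / ref / src-ref input formats of UniTE-style metrics.
    if input_format is InputFormat.SRC:
        return {'hyp': hyp, 'src': src}
    if input_format is InputFormat.REF:
        return {'hyp': hyp, 'ref': ref}
    return {'hyp': hyp, 'src': src, 'ref': ref}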
Some files were not shown because too many files have changed in this diff.