This commit is contained in:
xingjun.wang
2023-05-22 10:53:18 +08:00
parent 52aea36c12
commit 48c0d2a9af
468 changed files with 12942 additions and 7176 deletions

View File

@@ -108,9 +108,9 @@ Audio:
* [speech_charctc_kws_phone-xiaoyun](https://modelscope.cn/models/damo/speech_charctc_kws_phone-xiaoyun)
* [u2pp_conformer-asr-cn-16k-online](https://modelscope.cn/models/wenet/u2pp_conformer-asr-cn-16k-online)
* [speech_fsmn_vad_zh-cn-16k-common-pytorch](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
* [punc_ct-transformer_zh-cn-common-vocab272727-pytorch](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)
* [speech_frcrn_ans_cirm_16k](https://modelscope.cn/models/damo/speech_frcrn_ans_cirm_16k)

View File

@@ -1,13 +1,12 @@
import os
from dataclasses import dataclass, field

+from modelscope import MsDataset, TrainingArgs
from modelscope.metainfo import Trainers
-from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.trainers.builder import build_trainer
-from modelscope.trainers.training_args import TrainingArgs


-@dataclass
+@dataclass(init=False)
class ImageClassificationTrainingArgs(TrainingArgs):
    num_classes: int = field(
        default=None,
@@ -46,26 +45,35 @@ def create_dataset(name, split):
        dataset_name, namespace=namespace, subset_name='default', split=split)


-def train():
-    args = ImageClassificationTrainingArgs.from_cli(
-        model='damo/cv_vit-base_image-classification_ImageNet-labels',
-        max_epochs=1,
-        lr=1e-4,
-        optimizer='AdamW',
-        warmup_iters=1,
-        topk=(1, ))
-
-    if args.dataset_name is not None:
-        train_dataset = create_dataset(args.dataset_name, split='train')
-        val_dataset = create_dataset(args.dataset_name, split='validation')
-    else:
-        train_dataset = create_dataset(args.train_dataset_name, split='train')
-        val_dataset = create_dataset(args.val_dataset_name, split='validation')
+training_args = ImageClassificationTrainingArgs(
+    model='damo/cv_vit-base_image-classification_ImageNet-labels',
+    max_epochs=1,
+    lr=1e-4,
+    optimizer='AdamW',
+    warmup_iters=1,
+    topk=(1, )).parse_cli()
+
+config, args = training_args.to_config()
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    return cfg
+
+
+def train():
+    train_dataset = create_dataset(
+        training_args.train_dataset_name, split=training_args.train_split)
+    val_dataset = create_dataset(
+        training_args.val_dataset_name, split=training_args.val_split)

    kwargs = dict(
        model=args.model,  # model id
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset,  # validation dataset
-        cfg_modify_fn=args  # callback to modify configuration
+        cfg_modify_fn=cfg_modify_fn  # callback to modify configuration
    )

    # in distributed training, specify pytorch launcher
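All of the reworked example scripts in this commit follow the same flow: construct the TrainingArgs subclass with code-level defaults, let parse_cli() override them from the command line, split the result with to_config(), and hand a cfg_modify_fn to the trainer. Below is a minimal sketch of that flow under assumed names (the model id and the num_classes cfg_node binding are illustrative, not taken from this file):

from dataclasses import dataclass, field

from modelscope import TrainingArgs


@dataclass(init=False)
class MyTrainingArgs(TrainingArgs):
    # Hypothetical field: 'cfg_node' routes the CLI value into the
    # configuration tree produced by to_config().
    num_classes: int = field(
        default=None,
        metadata={
            'cfg_node': 'model.num_classes',
            'help': 'Number of target classes',
        })


# Code-level defaults first, then CLI overrides via parse_cli().
training_args = MyTrainingArgs(model='damo/some-model-id', max_epochs=1).parse_cli()
config, args = training_args.to_config()


def cfg_modify_fn(cfg):
    # --use_model_config true: start from the model's own configuration.json
    # and merge the CLI-derived values on top; otherwise use them directly.
    if args.use_model_config:
        cfg.merge_from_dict(config)
    else:
        cfg = config
    return cfg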

View File

@@ -2,4 +2,7 @@ PYTHONPATH=. python -m torch.distributed.launch --nproc_per_node=2 \
    examples/pytorch/image_classification/finetune_image_classification.py \
    --num_classes 2 \
    --train_dataset_name 'tany0699/cats_and_dogs' \
-    --val_dataset_name 'tany0699/cats_and_dogs'
+    --val_dataset_name 'tany0699/cats_and_dogs' \
+    --train_split train \
+    --val_split validation \
+    --use_model_config true \

View File

@@ -1,15 +1,13 @@
import os
from dataclasses import dataclass, field
-from functools import partial

+from modelscope import MsDataset, TrainingArgs
from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
-from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
-                                                set_flatten_value)
+from modelscope.trainers.training_args import set_flatten_value


-@dataclass
+@dataclass(init=False)
class MultiModalEmbeddingArguments(TrainingArgs):

    trainer: str = field(
@@ -17,6 +15,12 @@ class MultiModalEmbeddingArguments(TrainingArgs):
            'help': 'The trainer used',
        })

+    work_dir: str = field(
+        default='./tmp',
+        metadata={
+            'help': 'The working path for saving checkpoint',
+        })
+
    use_fp16: bool = field(
        default=None,
        metadata={
@@ -35,7 +39,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
        default=None,
        metadata={
            'cfg_node': 'train.optimizer_hparams',
-            'cfg_getter': partial(get_flatten_value, exclusions=['lr']),
            'cfg_setter': set_flatten_value,
            'help': 'The optimizer init params except `lr`',
        })
@@ -51,7 +54,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
        default=None,
        metadata={
            'cfg_node': 'dataset.column_map',
-            'cfg_getter': get_flatten_value,
            'cfg_setter': set_flatten_value,
            'help': 'The column map for dataset',
        })
@@ -67,7 +69,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
        default=None,
        metadata={
            'cfg_node': 'train.lr_scheduler_hook',
-            'cfg_getter': get_flatten_value,
            'cfg_setter': set_flatten_value,
            'help': 'The parameters for lr scheduler hook',
        })
@@ -76,7 +77,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
        default=None,
        metadata={
            'cfg_node': 'train.optimizer_hook',
-            'cfg_getter': get_flatten_value,
            'cfg_setter': set_flatten_value,
            'help': 'The parameters for optimizer hook',
        })
@@ -92,23 +92,28 @@ class MultiModalEmbeddingArguments(TrainingArgs):
            'help': 'The data parallel world size',
        })

-    def __call__(self, config):
-        config = super().__call__(config)
-        config.merge_from_dict({'pretrained_model.model_name': self.model})
-        if self.clip_clamp:
-            config.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
-        if self.world_size > 1:
-            config.train.launcher = 'pytorch'
-        return config
-
-
-args = MultiModalEmbeddingArguments.from_cli(task='multi-modal-embedding')
-print(args)
+config, args = MultiModalEmbeddingArguments().parse_cli().to_config()
+print(config, args)
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.merge_from_dict({'pretrained_model.model_name': args.model})
+    if args.clip_clamp:
+        cfg.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
+    if args.world_size > 1:
+        cfg.train.launcher = 'pytorch'
+    return cfg

train_dataset = MsDataset.load(
-    args.dataset_name, namespace='modelscope', split='train')
+    args.train_dataset_name, namespace='modelscope', split='train')
eval_dataset = MsDataset.load(
-    args.dataset_name, namespace='modelscope', split='validation')
+    args.train_dataset_name, namespace='modelscope', split='validation')
os.makedirs(args.work_dir, exist_ok=True)

kwargs = dict(
@@ -116,6 +121,6 @@ kwargs = dict(
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)

trainer = build_trainer(name=args.trainer, default_args=kwargs)
trainer.train()
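The `cfg_getter` callbacks are removed throughout this commit, and `set_flatten_value` remains as the setter that expands flat `key=value` CLI strings onto a configuration node. The sketch below is a hypothetical stand-in for that behaviour (not ModelScope's actual implementation), using the `--optimizer_hook` value from the launch script that follows:

import ast


def flatten_value_to_dict(value: str) -> dict:
    """Parse 'k1=v1,k2=v2' into a dict, guessing literal types where possible."""
    result = {}
    for pair in value.split(','):
        key, _, raw = pair.partition('=')
        try:
            result[key.strip()] = ast.literal_eval(raw.strip())
        except (ValueError, SyntaxError):
            result[key.strip()] = raw.strip()
    return result


# --optimizer_hook 'type=TorchAMPOptimizerHook,cumulative_iters=1,loss_keys=loss'
# would end up under cfg.train.optimizer_hook roughly as:
print(flatten_value_to_dict('type=TorchAMPOptimizerHook,cumulative_iters=1,loss_keys=loss'))
# {'type': 'TorchAMPOptimizerHook', 'cumulative_iters': 1, 'loss_keys': 'loss'}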

View File

@@ -6,14 +6,16 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
    --trainer 'clip-multi-modal-embedding' \
    --work_dir './workspace/ckpts/clip' \
    --model 'damo/multi-modal_clip-vit-base-patch16_zh' \
-    --dataset_name 'muge' \
+    --train_dataset_name 'muge' \
    --dataset_column_map 'img=image,text=query' \
    --max_epochs 1 \
    --use_fp16 true \
    --per_device_train_batch_size 180 \
+    --train_data_worker 0 \
    --train_shuffle true \
    --train_drop_last true \
    --per_device_eval_batch_size 128 \
+    --eval_data_worker 0 \
    --eval_shuffle true \
    --eval_drop_last true \
    --save_ckpt_best true \
@@ -33,3 +35,4 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
    --optimizer_hook 'type=TorchAMPOptimizerHook,cumulative_iters=1,loss_keys=loss' \
    --clip_clamp true \
    --world_size $DATA_PARALLEL_SIZE \
+    --use_model_config true \

View File

@@ -4,30 +4,32 @@ from modelscope.msdatasets import MsDataset
from modelscope.trainers import EpochBasedTrainer, build_trainer
from modelscope.trainers.training_args import TrainingArgs

-
-@dataclass
-class StableDiffusionArguments(TrainingArgs):
-
-    def __call__(self, config):
-        config = super().__call__(config)
-        config.train.lr_scheduler.T_max = self.max_epochs
-        config.model.inference = False
-        return config
-
-
-args = StableDiffusionArguments.from_cli(task='efficient-diffusion-tuning')
+training_args = TrainingArgs(task='efficient-diffusion-tuning').parse_cli()
+config, args = training_args.to_config()
print(args)

-dataset = MsDataset.load(args.dataset_name, namespace=args.namespace)
+dataset = MsDataset.load(
+    args.train_dataset_name, namespace=args.train_dataset_namespace)
train_dataset = dataset['train']
validation_dataset = dataset['validation']

+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.train.lr_scheduler.T_max = training_args.max_epochs
+    cfg.model.inference = False
+    return cfg
+
+
kwargs = dict(
-    model=args.model,
-    work_dir=args.work_dir,
+    model=training_args.model,
+    work_dir=training_args.work_dir,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)

trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
trainer.train()

View File

@@ -1,11 +1,12 @@
PYTHONPATH=. torchrun examples/pytorch/stable_diffusion/finetune_stable_diffusion.py \
    --model 'damo/multi-modal_efficient-diffusion-tuning-lora' \
    --work_dir './tmp/stable_diffusion_tuning' \
-    --namespace 'damo' \
-    --dataset_name 'buptwq/lora-stable-diffusion-finetune-dog' \
-    --max_epochs 150 \
+    --train_dataset_namespace 'damo' \
+    --train_dataset_name 'controlnet_dataset_condition_fill50k' \
+    --max_epochs 1 \
    --save_ckpt_strategy 'by_epoch' \
    --logging_interval 100 \
    --train.dataloader.workers_per_gpu 0 \
    --evaluation.dataloader.workers_per_gpu 0 \
-    --train.optimizer.lr 1e-4
+    --train.optimizer.lr 1e-5 \
+    --use_model_config true

View File

@@ -1,26 +1,18 @@
import os
from dataclasses import dataclass, field

-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.training_args import TrainingArgs
+from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
+                        build_dataset_from_file)
+from modelscope.trainers import build_trainer


-def get_labels(cfg, metadata):
-    label2id = cfg.safe_get(metadata['cfg_node'])
-    if label2id is not None:
-        return ','.join(label2id.keys())
-
-
-def set_labels(cfg, labels, metadata):
+def set_labels(labels):
    if isinstance(labels, str):
        labels = labels.split(',')
-    cfg.merge_from_dict(
-        {metadata['cfg_node']: {label: id
-                                for id, label in enumerate(labels)}})
+    return {label: id for id, label in enumerate(labels)}


-@dataclass
+@dataclass(init=False)
class TextClassificationArguments(TrainingArgs):

    first_sequence: str = field(
@@ -49,7 +41,6 @@ class TextClassificationArguments(TrainingArgs):
        metadata={
            'help': 'The labels of the dataset',
            'cfg_node': 'preprocessor.label2id',
-            'cfg_getter': get_labels,
            'cfg_setter': set_labels,
        })
@@ -60,30 +51,39 @@ class TextClassificationArguments(TrainingArgs):
            'cfg_node': 'preprocessor.type'
        })

-    def __call__(self, config):
-        config = super().__call__(config)
-        config.model['num_labels'] = len(self.labels)
-        if config.train.lr_scheduler.type == 'LinearLR':
-            config.train.lr_scheduler['total_iters'] = \
-                int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
-        return config
-
-
-args = TextClassificationArguments.from_cli(
-    task='text-classification', eval_metrics='seq-cls-metric')
-print(args)
+config, args = TextClassificationArguments().parse_cli().to_config()
+print(config, args)
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.model['num_labels'] = len(cfg.preprocessor.label2id)
+    if cfg.train.lr_scheduler.type == 'LinearLR':
+        cfg.train.lr_scheduler['total_iters'] = \
+            int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
+    return cfg

-dataset = MsDataset.load(args.dataset_name, subset_name=args.subset_name)
-train_dataset = dataset['train']
-validation_dataset = dataset['validation']
+if args.dataset_json_file is None:
+    dataset = MsDataset.load(
+        args.train_dataset_name, subset_name=args.train_subset_name)
+    train_dataset = dataset['train']
+    validation_dataset = dataset['validation']
+else:
+    train_dataset, validation_dataset = build_dataset_from_file(
+        args.dataset_json_file)

kwargs = dict(
    model=args.model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    seed=args.seed,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)

os.environ['LOCAL_RANK'] = str(args.local_rank)
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
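For reference, the new `set_labels` setter simply maps the comma-separated `--labels` value onto `preprocessor.label2id`, e.g.:

# '--labels 0,1,2' on the command line becomes:
print(set_labels('0,1,2'))  # {'0': 0, '1': 1, '2': 2}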

View File

@@ -1,12 +1,16 @@
PYTHONPATH=. python examples/pytorch/text_classification/finetune_text_classification.py \
+    --task 'text-classification' \
    --model 'damo/nlp_structbert_backbone_base_std' \
-    --dataset_name 'clue' \
-    --subset_name 'tnews' \
+    --train_dataset_name 'clue' \
+    --train_subset_name 'tnews' \
    --first_sequence 'sentence' \
    --preprocessor.label label \
    --model.num_labels 15 \
    --labels '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14' \
    --preprocessor 'sen-cls-tokenizer' \
+    --use_model_config True \
+    --max_epochs 1 \
    --train.dataloader.workers_per_gpu 0 \
    --evaluation.dataloader.workers_per_gpu 0 \
    --train.optimizer.lr 1e-5 \
+    --eval_metrics 'seq-cls-metric' \

View File

@@ -1,12 +1,11 @@
from dataclasses import dataclass, field

+from modelscope import EpochBasedTrainer, MsDataset, TrainingArgs
from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.training_args import TrainingArgs
+from modelscope.trainers import build_trainer


-@dataclass
+@dataclass(init=False)
class TextGenerationArguments(TrainingArgs):

    trainer: str = field(
@@ -67,30 +66,35 @@ class TextGenerationArguments(TrainingArgs):
            'help': 'Whether to use MegatronHook',
        })

-    def __call__(self, config):
-        config = super().__call__(config)
-        if config.train.lr_scheduler.type == 'noam':
-            config.train.lr_scheduler = {
-                'type': 'LambdaLR',
-                'lr_lambda': noam_lambda,
-                'options': {
-                    'by_epoch': False
-                }
-            }
-        if self.use_megatron:
-            config.train.hooks.append({'type': 'MegatronHook'})
-        return config

def noam_lambda(current_step: int):
    current_step += 1
    return min(current_step**(-0.5), current_step * 100**(-1.5))


-args = TextGenerationArguments.from_cli(task='text-generation')
-print(args)
+config, args = TextGenerationArguments().parse_cli().to_config()
+print(config, args)

-dataset = MsDataset.load(args.dataset_name)

+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    if cfg.train.lr_scheduler.type == 'noam':
+        cfg.train.lr_scheduler = {
+            'type': 'LambdaLR',
+            'lr_lambda': noam_lambda,
+            'options': {
+                'by_epoch': False
+            }
+        }
+    if args.use_megatron:
+        cfg.train.hooks.append({'type': 'MegatronHook'})
+    return cfg


+dataset = MsDataset.load(args.train_dataset_name)
train_dataset = dataset['train']
eval_dataset = dataset['validation' if 'validation' in dataset else 'test']
@@ -100,7 +104,7 @@ kwargs = dict(
    eval_dataset=eval_dataset,
    seed=args.seed,
    work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)

trainer: EpochBasedTrainer = build_trainer(
    name=args.trainer, default_args=kwargs)
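The 'noam' option is realised as a plain PyTorch LambdaLR driven by noam_lambda. A standalone sketch of the resulting schedule (optimizer and base learning rate here are placeholders):

import torch
from torch.optim.lr_scheduler import LambdaLR


def noam_lambda(current_step: int):
    current_step += 1
    return min(current_step**(-0.5), current_step * 100**(-1.5))


param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=1e-3)
scheduler = LambdaLR(optimizer, lr_lambda=noam_lambda)

for step in range(5):
    optimizer.step()
    scheduler.step()
    # The factor grows linearly for roughly the first 100 steps, then decays as step**-0.5.
    print(step, scheduler.get_last_lr())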

View File

@@ -8,7 +8,7 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
    --trainer 'nlp-gpt3-trainer' \
    --work_dir './tmp' \
    --model 'damo/nlp_gpt3_text-generation_1.3B' \
-    --dataset_name 'chinese-poetry-collection' \
+    --train_dataset_name 'chinese-poetry-collection' \
    --preprocessor 'text-gen-jieba-tokenizer' \
    --src_txt 'text1' \
    --tgt_txt 'text2' \
@@ -20,4 +20,5 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
    --world_size $WORLD_SIZE \
    --tensor_model_parallel_size $TENSOR_MODEL_PARALLEL_SIZE \
    --use_megatron true \
-    # --dataset_name 'DuReader_robust-QG' \ # input&output
+    --use_model_config true \
+    # --train_dataset_name 'DuReader_robust-QG' \ # input&output

View File

@@ -0,0 +1,13 @@
PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.py \
--trainer 'text-generation-trainer' \
--work_dir './tmp' \
--task 'text2text-generation' \
--model 'damo/nlp_mt5_zero-shot-augment_chinese-base' \
--train_dataset_name 'DuReader_robust-QG' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 1 \
--use_model_config True \
--per_device_train_batch_size 8 \
--lr 1e-3 \
--lr_scheduler 'noam' \

View File

@@ -2,10 +2,11 @@ PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.
    --trainer 'text-generation-trainer' \
    --work_dir './tmp' \
    --model 'damo/nlp_palm2.0_pretrained_chinese-base' \
-    --dataset_name 'DuReader_robust-QG' \
+    --train_dataset_name 'DuReader_robust-QG' \
    --src_txt 'text1' \
    --tgt_txt 'text2' \
-    --max_epochs 15 \
+    --max_epochs 1 \
+    --use_model_config True \
    --per_device_train_batch_size 8 \
    --lr 1e-3 \
    --lr_scheduler 'noam' \

View File

@@ -1,20 +1,22 @@
from dataclasses import dataclass, field

-from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import build_trainer
-from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
-                                                set_flatten_value)
+from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
+                        build_dataset_from_file)


-@dataclass
+@dataclass(init=False)
class TokenClassificationArguments(TrainingArgs):

    trainer: str = field(
-        default=Trainers.default, metadata={
+        default=None, metadata={
            'help': 'The trainer used',
        })

+    work_dir: str = field(
+        default='./tmp',
+        metadata={
+            'help': 'The working path for saving checkpoint',
+        })
+
    preprocessor: str = field(
        default=None,
        metadata={
@@ -29,60 +31,99 @@ class TokenClassificationArguments(TrainingArgs):
            'cfg_node': 'preprocessor.padding'
        })

-    train_dataset_params: str = field(
-        default=None,
-        metadata={
-            'cfg_node': 'dataset.train',
-            'cfg_getter': get_flatten_value,
-            'cfg_setter': set_flatten_value,
-            'help': 'The parameters for train dataset',
-        })
-
-    def __call__(self, config):
-        config = super().__call__(config)
-        if config.safe_get('dataset.train.label') == 'ner_tags':
-            ner_tags_labels = train_dataset['ner_tags'] + eval_dataset[
-                'ner_tags']
-            label_enumerate_values = self._get_label_list(ner_tags_labels)
-            config.merge_from_dict(
-                {'dataset.train.labels': label_enumerate_values})
-        if config.train.lr_scheduler.type == 'LinearLR':
-            config.train.lr_scheduler['total_iters'] = \
-                int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
-        return config
-
-    # TODO: Future performance optimization in MsDataset
-    @staticmethod
-    def _get_label_list(labels):
-        unique_labels = set()
-        for label in labels:
-            unique_labels = unique_labels | set(label)
-        label_list = list(unique_labels)
-        label_list.sort()
-        return label_list
-
-
-args = TokenClassificationArguments.from_cli(task='token-classification')
-print(args)
-
-# load dataset
-train_dataset = MsDataset.load(
-    args.dataset_name,
-    subset_name=args.subset_name,
-    split='train',
-    namespace='damo')['train']
-eval_dataset = MsDataset.load(
-    args.dataset_name,
-    subset_name=args.subset_name,
-    split='validation',
-    namespace='damo')['validation']
+    mode: str = field(
+        default='inference',
+        metadata={
+            'help': 'The preprocessor padding',
+            'cfg_node': 'preprocessor.mode'
+        })
+
+    first_sequence: str = field(
+        default=None,
+        metadata={
+            'cfg_node': 'preprocessor.first_sequence',
+            'help': 'The parameters for train dataset',
+        })
+
+    label: str = field(
+        default=None,
+        metadata={
+            'cfg_node': 'preprocessor.label',
+            'help': 'The parameters for train dataset',
+        })
+
+    sequence_length: int = field(
+        default=128,
+        metadata={
+            'cfg_node': 'preprocessor.sequence_length',
+            'help': 'The parameters for train dataset',
+        })
+
+
+training_args = TokenClassificationArguments().parse_cli()
+config, args = training_args.to_config()
+print(args)
+
+
+def get_label_list(labels):
+    unique_labels = set()
+    for label in labels:
+        unique_labels = unique_labels | set(label)
+    label_list = list(unique_labels)
+    label_list.sort()
+    return label_list
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    labels = train_dataset[training_args.label] + validation_dataset[
+        training_args.label]
+    label_enumerate_values = get_label_list(labels)
+    cfg.merge_from_dict({
+        'preprocessor.label2id':
+        {label: id
+         for id, label in enumerate(label_enumerate_values)}
+    })
+    cfg.merge_from_dict({'model.num_labels': len(label_enumerate_values)})
+    cfg.merge_from_dict({'preprocessor.use_fast': True})
+    cfg.merge_from_dict({
+        'evaluation.metrics': {
+            'type': 'token-cls-metric',
+            'label2id':
+            {label: id
+             for id, label in enumerate(label_enumerate_values)}
+        }
+    })
+    if cfg.train.lr_scheduler.type == 'LinearLR':
+        cfg.train.lr_scheduler['total_iters'] = \
+            int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
+    return cfg
+
+
+if args.dataset_json_file is None:
+    train_dataset = MsDataset.load(
+        args.train_dataset_name,
+        subset_name=args.train_subset_name,
+        split='train',
+        namespace=args.train_dataset_namespace)['train']
+    validation_dataset = MsDataset.load(
+        args.train_dataset_name,
+        subset_name=args.train_subset_name,
+        split='validation',
+        namespace=args.train_dataset_namespace)['validation']
+else:
+    train_dataset, validation_dataset = build_dataset_from_file(
+        args.dataset_json_file)

kwargs = dict(
    model=args.model,
    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
+    eval_dataset=validation_dataset,
    work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)

-trainer = build_trainer(name=args.trainer, default_args=kwargs)
+trainer = EpochBasedTrainer(**kwargs)
trainer.train()
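`get_label_list` flattens the per-example tag sequences into a sorted label set, which `cfg_modify_fn` then turns into `preprocessor.label2id`. A tiny illustration with made-up tags:

ner_tags = [['O', 'B-LOC', 'I-LOC'], ['O', 'B-PER']]  # hypothetical 'ner_tags' column

label_list = get_label_list(ner_tags)
print(label_list)  # ['B-LOC', 'B-PER', 'I-LOC', 'O']

label2id = {label: id for id, label in enumerate(label_list)}
print(label2id)    # {'B-LOC': 0, 'B-PER': 1, 'I-LOC': 2, 'O': 3}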

View File

@@ -1,15 +1,22 @@
-PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
+PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
+    --task 'token-classification' \
    --trainer 'nlp-base-trainer' \
    --work_dir './tmp' \
    --model 'damo/mgeo_backbone_chinese_base' \
-    --dataset_name 'GeoGLUE' \
-    --subset_name 'GeoETA' \
-    --train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
+    --train_dataset_name 'GeoGLUE' \
+    --train_subset_name 'GeoETA' \
+    --train_dataset_namespace 'damo' \
+    --first_sequence 'tokens' \
+    --eval_strategy by_step \
+    --eval_interval 10 \
+    --label 'ner_tags' \
+    --sequence_length 128 \
    --preprocessor 'token-cls-tokenizer' \
    --preprocessor_padding 'max_length' \
    --max_epochs 1 \
+    --mode 'inference' \
+    --use_model_config True \
    --per_device_train_batch_size 32 \
+    --train_data_worker 0 \
+    --eval_data_worker 0 \
    --lr 3e-5 \
-    --save_ckpt_strategy 'by_epoch' \
-    --logging_interval 100 \
-    --eval_strategy 'by_epoch' \

View File

@@ -1,16 +1,22 @@
-PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
+PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
+    --task 'token-classification' \
    --trainer 'nlp-base-trainer' \
    --work_dir './tmp' \
    --model 'damo/nlp_structbert_backbone_base_std' \
-    --dataset_name 'GeoGLUE' \
-    --subset_name 'GeoETA' \
-    --train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
+    --train_dataset_name 'GeoGLUE' \
+    --train_subset_name 'GeoETA' \
+    --train_dataset_namespace 'damo' \
+    --first_sequence 'tokens' \
+    --eval_strategy by_step \
+    --eval_interval 20 \
+    --label 'ner_tags' \
+    --sequence_length 128 \
    --preprocessor 'token-cls-tokenizer' \
    --preprocessor_padding 'max_length' \
    --max_epochs 2 \
+    --mode 'inference' \
+    --use_model_config True \
    --per_device_train_batch_size 32 \
+    --train_data_worker 0 \
+    --eval_data_worker 0 \
    --lr 3e-5 \
-    --save_ckpt_strategy 'by_epoch' \
-    --logging_interval 1 \
-    --eval_strategy 'by_step' \
-    --eval_interval 20 \

View File

@@ -1 +0,0 @@
{"framework":"pytorch","train":{"work_dir":"/tmp","max_epochs":10,"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0},"optimizer":{"type":"SGD","lr":0.001},"lr_scheduler":{"type":"StepLR","step_size":2},"hooks":[{"type":"CheckpointHook","interval":1}]},"evaluation":{"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0,"shuffle":false}}}

View File

@@ -5,11 +5,11 @@ from datasets import load_dataset
from transformers import (BertForSequenceClassification, BertTokenizerFast,
                          default_data_collator)

+from modelscope import TrainingArgs
from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.default_config import DEFAULT_CONFIG, TrainingArgs


-@dataclass
+@dataclass(init=False)
class TransformersArguments(TrainingArgs):

    num_labels: int = field(
@@ -17,13 +17,27 @@ class TransformersArguments(TrainingArgs):
            'help': 'The number of labels',
        })

+    sentence: str = field(
+        default=None, metadata={
+            'help': 'The sentence key',
+        })
+
+    label: str = field(
+        default=None, metadata={
+            'help': 'The label key',
+        })

-args = TransformersArguments.from_cli(
-    task='text-classification', eval_metrics='seq-cls-metric')
-
-print(args)
+training_args = TransformersArguments(
+    task='text-classification', eval_metrics='seq-cls-metric').parse_cli()
+config, args = training_args.to_config()
+print(config, args)

-dataset = load_dataset(args.dataset_name, args.subset_name)
+train_dataset = load_dataset(
+    args.train_dataset_name, args.train_subset_name, split=args.train_split)
+val_dataset = load_dataset(
+    args.val_dataset_name, args.val_subset_name, split=args.val_split)

model = BertForSequenceClassification.from_pretrained(
    args.model, num_labels=args.num_labels)
@@ -31,26 +45,30 @@ tokenizer = BertTokenizerFast.from_pretrained(args.model)

def tokenize_sentence(row):
-    return tokenizer(row['sentence'], padding='max_length', max_length=128)
+    return tokenizer(
+        row[training_args.sentence], padding='max_length', max_length=128)


# Extra columns, Rename columns
-dataset = dataset.map(tokenize_sentence).remove_columns(['sentence',
-                                                         'idx']).rename_column(
-                                                             'label', 'labels')
+train_dataset = train_dataset.map(tokenize_sentence)
+val_dataset = val_dataset.map(tokenize_sentence)
+if training_args.label != 'labels':
+    train_dataset = train_dataset.rename_columns(
+        {training_args.label: 'labels'})
+    val_dataset = val_dataset.rename_columns({training_args.label: 'labels'})

cfg_file = os.path.join(args.work_dir or './', 'configuration.json')
-DEFAULT_CONFIG.dump(cfg_file)
+config.dump(cfg_file)

kwargs = dict(
    model=model,
    cfg_file=cfg_file,
    # data_collator
    data_collator=default_data_collator,
-    train_dataset=dataset['train'],
-    eval_dataset=dataset['validation'],
-    seed=args.seed,
-    cfg_modify_fn=args)
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    remove_unused_data=True,
+    seed=args.seed)

os.environ['LOCAL_RANK'] = str(args.local_rank)
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
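The tokenize-then-rename step uses the standard `datasets` API; a small standalone sketch with a toy in-memory dataset (column names mirror the `--sentence sentence --label label` defaults in the launch script below):

from datasets import Dataset
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Toy two-row dataset standing in for the real training split.
ds = Dataset.from_dict({'sentence': ['a first example', 'a second example'],
                        'label': [0, 1]})

ds = ds.map(lambda row: tokenizer(row['sentence'], padding='max_length', max_length=128))
ds = ds.rename_columns({'label': 'labels'})
print(ds.column_names)  # now includes 'input_ids', 'attention_mask' and 'labels'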

View File

@@ -1,5 +1,14 @@
PYTHONPATH=. python examples/pytorch/transformers/finetune_transformers_model.py \
    --model bert-base-uncased \
    --num_labels 15 \
-    --dataset_name clue \
-    --subset_name tnews
+    --train_dataset_name clue \
+    --train_subset_name tnews \
+    --train_split train \
+    --val_dataset_name clue \
+    --val_subset_name tnews \
+    --train_split train \
+    --val_split validation \
+    --sentence sentence \
+    --label label \
+    --eval_strategy by_step \
+    --eval_interval 100

View File

@@ -1,4 +1,79 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
-from .version import __release_datetime__, __version__
+from typing import TYPE_CHECKING

-__all__ = ['__version__', '__release_datetime__']
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .version import __release_datetime__, __version__
+    from .trainers import EpochBasedTrainer, TrainingArgs, build_dataset_from_file
+    from .trainers import Hook, Priority
+    from .exporters import Exporter
+    from .exporters import TfModelExporter
+    from .exporters import TorchModelExporter
+    from .hub.api import HubApi
+    from .hub.snapshot_download import snapshot_download
+    from .hub.push_to_hub import push_to_hub, push_to_hub_async
+    from .hub.check_model import check_model_is_id, check_local_model_is_latest
+    from .metrics import AudioNoiseMetric, Metric, task_default_metrics, ImageColorEnhanceMetric, ImageDenoiseMetric, \
+        ImageInstanceSegmentationCOCOMetric, ImagePortraitEnhancementMetric, SequenceClassificationMetric, \
+        TextGenerationMetric, TokenClassificationMetric, VideoSummarizationMetric, MovieSceneSegmentationMetric, \
+        AccuracyMetric, BleuMetric, ImageInpaintingMetric, ReferringVideoObjectSegmentationMetric, \
+        VideoFrameInterpolationMetric, VideoStabilizationMetric, VideoSuperResolutionMetric, PplMetric, \
+        ImageQualityAssessmentDegradationMetric, ImageQualityAssessmentMosMetric, TextRankingMetric, \
+        LossMetric, ImageColorizationMetric, OCRRecognitionMetric
+    from .models import Model, TorchModel
+    from .preprocessors import Preprocessor
+    from .pipelines import Pipeline, pipeline
+    from .utils.hub import read_config, create_model_if_not_exist
+    from .utils.logger import get_logger
+    from .msdatasets import MsDataset
+
+else:
+    _import_structure = {
+        'version': ['__release_datetime__', '__version__'],
+        'trainers': [
+            'EpochBasedTrainer', 'TrainingArgs', 'Hook', 'Priority',
+            'build_dataset_from_file'
+        ],
+        'exporters': [
+            'Exporter',
+            'TfModelExporter',
+            'TorchModelExporter',
+        ],
+        'hub.api': ['HubApi'],
+        'hub.snapshot_download': ['snapshot_download'],
+        'hub.push_to_hub': ['push_to_hub', 'push_to_hub_async'],
+        'hub.check_model':
+        ['check_model_is_id', 'check_local_model_is_latest'],
+        'metrics': [
+            'AudioNoiseMetric', 'Metric', 'task_default_metrics',
+            'ImageColorEnhanceMetric', 'ImageDenoiseMetric',
+            'ImageInstanceSegmentationCOCOMetric',
+            'ImagePortraitEnhancementMetric', 'SequenceClassificationMetric',
+            'TextGenerationMetric', 'TokenClassificationMetric',
+            'VideoSummarizationMetric', 'MovieSceneSegmentationMetric',
+            'AccuracyMetric', 'BleuMetric', 'ImageInpaintingMetric',
+            'ReferringVideoObjectSegmentationMetric',
+            'VideoFrameInterpolationMetric', 'VideoStabilizationMetric',
+            'VideoSuperResolutionMetric', 'PplMetric',
+            'ImageQualityAssessmentDegradationMetric',
+            'ImageQualityAssessmentMosMetric', 'TextRankingMetric',
+            'LossMetric', 'ImageColorizationMetric', 'OCRRecognitionMetric'
+        ],
+        'models': ['Model', 'TorchModel'],
+        'preprocessors': ['Preprocessor'],
+        'pipelines': ['Pipeline', 'pipeline'],
+        'utils.hub': ['read_config', 'create_model_if_not_exist'],
+        'utils.logger': ['get_logger'],
+        'msdatasets': ['MsDataset']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
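ModelScope's LazyImportModule defers the heavy submodule imports listed in `_import_structure`. A generic, minimal sketch of the same idea using a PEP 562 module-level `__getattr__` (illustrative only, not the actual implementation):

# lazy_pkg/__init__.py -- hypothetical package showing the lazy-import pattern.
import importlib
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Static analyzers and IDEs still see the real symbols.
    from .trainers import EpochBasedTrainer

_import_structure = {'trainers': ['EpochBasedTrainer']}
_attr_to_module = {attr: mod
                   for mod, attrs in _import_structure.items()
                   for attr in attrs}


def __getattr__(name):
    # Import the owning submodule only when the attribute is first accessed.
    module_name = _attr_to_module.get(name)
    if module_name is None:
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
    module = importlib.import_module(f'.{module_name}', __name__)
    return getattr(module, name)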

View File

@@ -122,10 +122,11 @@ class ${pipeline_name}(Pipeline):
    # Tips: usr_config_path is the temporary save configuration location after upload modelscope hub, it is the model_id
    usr_config_path = '${configuration_path}'
    config = Config({
-        'framework': 'pytorch',
-        'task': '${task_name}',
-        'model': {'type': 'my-custom-model'},
-        "pipeline": {"type": "my-custom-pipeline"}
+        "framework": 'pytorch',
+        "task": '${task_name}',
+        "model": {'type': 'my-custom-model'},
+        "pipeline": {"type": "my-custom-pipeline"},
+        "allow_remote": True
    })
    config.dump('${configuration_path}' + 'configuration.json')

View File

@@ -1,14 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
-    from .human_wholebody_keypoint import HumanWholeBodyKeypoint
+    from .ans_dfsmn_exporter import ANSDFSMNExporter

else:
    _import_structure = {
-        'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
+        'ans_dfsmn_exporter': ['ANSDFSMNExporter'],
    }

    import sys

View File

@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

import torch

from modelscope.exporters.builder import EXPORTERS
from modelscope.exporters.torch_model_exporter import TorchModelExporter
from modelscope.metainfo import Models
from modelscope.utils.constant import ModelFile, Tasks

INPUT_NAME = 'input'
OUTPUT_NAME = 'output'


@EXPORTERS.register_module(
    Tasks.acoustic_noise_suppression, module_name=Models.speech_dfsmn_ans)
class ANSDFSMNExporter(TorchModelExporter):

    def export_onnx(self, output_dir: str, opset=9, **kwargs):
        """Export the model as onnx format files.

        Args:
            output_dir: The output dir.
            opset: The version of the ONNX operator set to use.
            kwargs:
                device: The device used to forward.

        Returns:
            A dict containing the model key - model file path pairs.
        """
        model = self.model if 'model' not in kwargs else kwargs.pop('model')
        device_name = 'cpu' if 'device' not in kwargs else kwargs.pop('device')
        model_bin_file = os.path.join(model.model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        if os.path.exists(model_bin_file):
            checkpoint = torch.load(model_bin_file, map_location='cpu')
            model.load_state_dict(checkpoint)
        onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE)

        with torch.no_grad():
            model.eval()
            device = torch.device(device_name)
            model.to(device)
            model_script = torch.jit.script(model)
            fbank_input = torch.zeros((1, 3, 120), dtype=torch.float32)
            torch.onnx.export(
                model_script,
                fbank_input,
                onnx_file,
                opset_version=opset,
                input_names=[INPUT_NAME],
                output_names=[OUTPUT_NAME],
                dynamic_axes={
                    INPUT_NAME: {
                        0: 'batch_size',
                        1: 'number_of_frame'
                    },
                    OUTPUT_NAME: {
                        0: 'batch_size',
                        1: 'number_of_frame'
                    }
                })
        return {'model': onnx_file}
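A quick way to sanity-check the exported file is to run it with onnxruntime (assumed installed); the path below is a placeholder, and the input/output names and dynamic axes follow the export code above:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('./export_dir/model.onnx',  # placeholder path
                            providers=['CPUExecutionProvider'])

# Axes 0 and 1 were exported as dynamic, so any batch size / frame count works.
fbank = np.zeros((1, 10, 120), dtype=np.float32)
outputs = sess.run(['output'], {'input': fbank})
print(outputs[0].shape)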

View File

@@ -6,6 +6,7 @@ import functools
import os
import pickle
import platform
+import re
import shutil
import tempfile
import uuid
@@ -15,10 +16,10 @@ from http.cookiejar import CookieJar
from os.path import expanduser
from typing import Dict, List, Optional, Tuple, Union

+import requests
from requests import Session
from requests.adapters import HTTPAdapter, Retry

-from modelscope import __version__
from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT,
                                      API_RESPONSE_FIELD_DATA,
                                      API_RESPONSE_FIELD_EMAIL,
@@ -45,7 +46,7 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                       MASTER_MODEL_BRANCH, DatasetFormations,
                                       DatasetMetaFormats,
                                       DatasetVisibilityMap, DownloadChannel,
-                                       ModelFile)
+                                       ModelFile, VirgoDatasetConfig)
from modelscope.utils.logger import get_logger
from .utils.utils import (get_endpoint, get_release_datetime,
                          model_id_to_group_owner_name)
@@ -160,6 +161,7 @@ class HubApi:
            'Visibility': visibility,  # server check
            'License': license,
            'OriginalModelId': original_model_id,
+            'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', ''),
        }
        r = self.session.post(
            path, json=body, cookies=cookies, headers=self.headers)
@@ -236,8 +238,10 @@ class HubApi:
                   license: Optional[str] = Licenses.APACHE_V2,
                   chinese_name: Optional[str] = None,
                   commit_message: Optional[str] = 'upload model',
+                   tag: Optional[str] = None,
                   revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
-                   original_model_id: Optional[str] = None):
+                   original_model_id: Optional[str] = None,
+                   ignore_file_pattern: Optional[Union[List[str], str]] = None):
        """Upload model from a given directory to given repository. A valid model directory
        must contain a configuration.json file.
@@ -268,10 +272,13 @@ class HubApi:
                chinese name of the new created model.
            commit_message(`str`, *optional*, defaults to `None`):
                commit message of the push request.
+            tag(`str`, *optional*, defaults to `None`):
+                The tag on this commit
            revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION):
                which branch to push. If the branch is not exists, It will create a new
                branch and push to it.
            original_model_id (str, optional): The base model id which this model is trained from
+            ignore_file_pattern (`Union[List[str], str]`, optional): The file pattern to ignore uploading

        Raises:
            InvalidParameter: Parameter invalid.
@@ -292,6 +299,10 @@ class HubApi:
        if cookies is None:
            raise NotLoginException('Must login before upload!')
        files_to_save = os.listdir(model_dir)
+        if ignore_file_pattern is None:
+            ignore_file_pattern = []
+        if isinstance(ignore_file_pattern, str):
+            ignore_file_pattern = [ignore_file_pattern]
        try:
            self.get_model(model_id=model_id)
        except Exception:
@@ -325,6 +336,8 @@ class HubApi:
                    shutil.rmtree(src, ignore_errors=True)
                for f in files_to_save:
                    if f[0] != '.':
+                        if any([re.search(pattern, f) is not None for pattern in ignore_file_pattern]):
+                            continue
                        src = os.path.join(model_dir, f)
                        if os.path.isdir(src):
                            shutil.copytree(src, os.path.join(tmp_dir, f))
@@ -338,6 +351,8 @@ class HubApi:
                    commit_message=commit_message,
                    local_branch=revision,
                    remote_branch=revision)
+                if tag is not None:
+                    repo.tag_and_push(tag, tag)
            except Exception:
                raise
            finally:
@@ -581,6 +596,17 @@ class HubApi:
            file_list = file_list['Files']
        return file_list

+    @staticmethod
+    def dump_datatype_file(dataset_type: int, meta_cache_dir: str):
+        """
+        Dump the data_type as a local file, in order to get the dataset formation without calling the datahub.
+        More details, please refer to the class `modelscope.utils.constant.DatasetFormations`.
+        """
+        dataset_type_file_path = os.path.join(meta_cache_dir,
+            f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
+        with open(dataset_type_file_path, 'w') as fp:
+            fp.write('*** Automatically-generated file, do not modify ***')
+
    def get_dataset_meta_files_local_paths(self, dataset_name: str,
                                           namespace: str,
                                           revision: str,
@@ -591,10 +617,7 @@
        cookies = ModelScopeConfig.get_cookies()

        # Dump the data_type as a local file
-        dataset_type_file_path = os.path.join(meta_cache_dir,
-            f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
-        with open(dataset_type_file_path, 'w') as fp:
-            fp.write('*** Automatically-generated file, do not modify ***')
+        HubApi.dump_datatype_file(dataset_type=dataset_type, meta_cache_dir=meta_cache_dir)

        for file_info in file_list:
            file_path = file_info['Path']
@@ -661,7 +684,6 @@ class HubApi:
            cookies = self._check_cookie(use_cookies=True)
        else:
            cookies = ModelScopeConfig.get_cookies()
-        r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers)

        r = self.session.get(
            url=datahub_url, cookies=cookies, headers=self.headers)
@@ -669,6 +691,31 @@ class HubApi:
        raise_on_error(resp)
        return resp['Data']

+    def get_virgo_meta(self, dataset_id: str, version: int = 1) -> dict:
+        """
+        Get virgo dataset meta info.
+        """
+        virgo_endpoint = os.environ.get(VirgoDatasetConfig.env_virgo_endpoint, '')
+        if not virgo_endpoint:
+            raise RuntimeError(f'Virgo endpoint is not set in env: {VirgoDatasetConfig.env_virgo_endpoint}')
+
+        virgo_dataset_url = f'{virgo_endpoint}/data/set/download'
+        cookies = requests.utils.dict_from_cookiejar(ModelScopeConfig.get_cookies())
+        dataset_info = dict(
+            dataSetId=dataset_id,
+            dataSetVersion=version
+        )
+        data = dict(
+            data=dataset_info,
+        )
+        r = self.session.post(url=virgo_dataset_url, json=data, cookies=cookies, headers=self.headers, timeout=900)
+        resp = r.json()
+        if resp['code'] != 0:
+            raise RuntimeError(f'Failed to get virgo dataset: {resp}')
+
+        return resp['data']
+
    def get_dataset_access_config_for_unzipped(self,
                                               dataset_name: str,
                                               namespace: str,
@@ -895,6 +942,7 @@ class ModelScopeConfig:
        if MODELSCOPE_CLOUD_USERNAME in os.environ:
            user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]

+        from modelscope import __version__
        ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
            __version__,
            platform.python_version(),

View File

@@ -2,6 +2,7 @@
from http import HTTPStatus

+import requests
from requests.exceptions import HTTPError

from modelscope.utils.logger import get_logger
@@ -57,13 +58,22 @@ def is_ok(rsp):
    return rsp['Code'] == HTTPStatus.OK and rsp['Success']


+def _decode_response_error(response: requests.Response):
+    if 'application/json' in response.headers.get('content-type', ''):
+        message = response.json()
+    else:
+        message = response.content.decode('utf-8')
+    return message
+
+
def handle_http_post_error(response, url, request_body):
    try:
        response.raise_for_status()
    except HTTPError as error:
        logger.error('Request %s with body: %s exception' %
                     (url, request_body))
-        logger.error('Response details: %s' % response.content)
+        message = _decode_response_error(response)
+        logger.error('Response details: %s' % message)
        raise error
@@ -75,7 +85,8 @@ def handle_http_response(response, logger, cookies, model_id):
            logger.error(
                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
                private. Please login first.')
-        logger.error('Response details: %s' % response.content)
+        message = _decode_response_error(response)
+        logger.error('Response details: %s' % message)
        raise error

View File

@@ -12,7 +12,6 @@ import requests
from requests.adapters import Retry
from tqdm import tqdm

-from modelscope import __version__
from modelscope.hub.api import HubApi, ModelScopeConfig
from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE,
                                      API_FILE_DOWNLOAD_RETRY_TIMES,

View File

@@ -55,16 +55,10 @@ class GitCommandWrapper(metaclass=Singleton):
            response.check_returncode()
            return response
        except subprocess.CalledProcessError as error:
-            if response.returncode == 1:
-                logger.info('Nothing to commit.')
-                return response
-            else:
-                logger.error(
-                    'There are error run git command, you may need to login first.'
-                )
-                raise GitError('stdout: %s, stderr: %s' %
-                               (response.stdout.decode('utf8'),
-                                error.stderr.decode('utf8')))
+            logger.error('There are error run git command.')
+            raise GitError(
+                'stdout: %s, stderr: %s' %
+                (response.stdout.decode('utf8'), error.stderr.decode('utf8')))

    def config_auth_token(self, repo_dir, auth_token):
        url = self.get_repo_remote_url(repo_dir)
@@ -199,8 +193,11 @@ class GitCommandWrapper(metaclass=Singleton):
        else:
            return ['/'.join(line.split('/')[1:]) for line in info[1:]]

-    def pull(self, repo_dir: str):
-        cmds = ['-C', repo_dir, 'pull']
+    def pull(self,
+             repo_dir: str,
+             remote: str = 'origin',
+             branch: str = 'master'):
+        cmds = ['-C', repo_dir, 'pull', remote, branch]
        return self._run_git_command(*cmds)

    def push(self,

View File

@@ -4,8 +4,8 @@ import concurrent.futures
import os

from modelscope.hub.api import HubApi
-from modelscope.hub.constants import Licenses, ModelVisibility
-from modelscope.hub.errors import NotExistError
+from modelscope.hub.constants import ModelVisibility
+from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
from modelscope.utils.logger import get_logger

logger = get_logger()
@@ -18,7 +18,10 @@ def _api_push_to_hub(repo_name,
                     token,
                     private=True,
                     commit_message='',
-                     source_repo=''):
+                     tag=None,
+                     source_repo='',
+                     ignore_file_pattern=None,
+                     revision=DEFAULT_REPOSITORY_REVISION):
    try:
        api = HubApi()
        api.login(token)
@@ -29,7 +32,10 @@ def _api_push_to_hub(repo_name,
            if not private else ModelVisibility.PRIVATE,
            chinese_name=repo_name,
            commit_message=commit_message,
-            original_model_id=source_repo)
+            tag=tag,
+            original_model_id=source_repo,
+            ignore_file_pattern=ignore_file_pattern,
+            revision=revision)
        commit_message = commit_message or 'No commit message'
        logger.info(
            f'Successfully upload the model to {repo_name} with message: {commit_message}'
@@ -48,7 +54,10 @@ def push_to_hub(repo_name,
                private=True,
                retry=3,
                commit_message='',
-                source_repo=''):
+                tag=None,
+                source_repo='',
+                ignore_file_pattern=None,
+                revision=DEFAULT_REPOSITORY_REVISION):
    """
    Args:
        repo_name: The repo name for the modelhub repo
@@ -57,13 +66,18 @@ def push_to_hub(repo_name,
        private: If is a private repo, default True
        retry: Retry times if something error in uploading, default 3
        commit_message: The commit message
+        tag: The tag of this commit
        source_repo: The source repo (model id) which this model comes from
+        ignore_file_pattern: The file pattern to be ignored in uploading.
+        revision: The branch to commit to

    Returns:
        The boolean value to represent whether the model is uploaded.
    """
    if token is None:
        token = os.environ.get('MODELSCOPE_API_TOKEN')
+    if ignore_file_pattern is None:
+        ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
+    assert repo_name is not None
    assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
    assert os.path.isdir(output_dir)
    assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
@@ -73,7 +87,8 @@ def push_to_hub(repo_name,
        f'Uploading {output_dir} to {repo_name} with message {commit_message}')
    for i in range(retry):
        if _api_push_to_hub(repo_name, output_dir, token, private,
-                            commit_message, source_repo):
+                            commit_message, tag, source_repo,
+                            ignore_file_pattern, revision):
            return True
    return False
@@ -83,7 +98,10 @@ def push_to_hub_async(repo_name,
                      token=None,
                      private=True,
                      commit_message='',
-                      source_repo=''):
+                      tag=None,
+                      source_repo='',
+                      ignore_file_pattern=None,
+                      revision=DEFAULT_REPOSITORY_REVISION):
    """
    Args:
        repo_name: The repo name for the modelhub repo
@@ -91,13 +109,18 @@ def push_to_hub_async(repo_name,
        token: The user api token, function will check the `MODELSCOPE_API_TOKEN` variable if this argument is None
        private: If is a private repo, default True
        commit_message: The commit message
+        tag: The tag of this commit
        source_repo: The source repo (model id) which this model comes from
+        ignore_file_pattern: The file pattern to be ignored in uploading
+        revision: The branch to commit to

    Returns:
        A handler to check the result and the status
    """
    if token is None:
        token = os.environ.get('MODELSCOPE_API_TOKEN')
+    if ignore_file_pattern is None:
+        ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
+    assert repo_name is not None
    assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
    assert os.path.isdir(output_dir)
    assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
@@ -106,4 +129,5 @@ def push_to_hub_async(repo_name,
    logger.info(
        f'Uploading {output_dir} to {repo_name} with message {commit_message}')
    return _executor.submit(_api_push_to_hub, repo_name, output_dir, token,
-                            private, commit_message, source_repo)
+                            private, commit_message, tag, source_repo,
+                            ignore_file_pattern, revision)
View File
@@ -88,6 +88,26 @@ class Repository:
remote = None remote = None
return remote return remote
def pull(self, remote: str = 'origin', branch: str = 'master'):
"""Pull remote branch
Args:
remote (str, optional): The remote name. Defaults to 'origin'.
branch (str, optional): The remote branch. Defaults to 'master'.
"""
self.git_wrapper.pull(self.model_dir, remote=remote, branch=branch)
def add_lfs_type(self, file_name_suffix: str):
"""Add file suffix to lfs list.
Args:
file_name_suffix (str): The file name suffix.
examples '*.safetensors'
"""
os.system(
"printf '%s filter=lfs diff=lfs merge=lfs -text\n'>>%s" %
(file_name_suffix, os.path.join(self.model_dir, '.gitattributes')))
def push(self, def push(self,
commit_message: str, commit_message: str,
local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION, local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION,
@@ -120,7 +140,6 @@ class Repository:
self.model_repo_name) self.model_repo_name)
url = self.git_wrapper.get_repo_remote_url(self.model_dir) url = self.git_wrapper.get_repo_remote_url(self.model_dir)
self.git_wrapper.pull(self.model_dir)
self.git_wrapper.add(self.model_dir, all_files=True) self.git_wrapper.add(self.model_dir, all_files=True)
self.git_wrapper.commit(self.model_dir, commit_message) self.git_wrapper.commit(self.model_dir, commit_message)
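Since the implicit pull inside push() is removed above, a typical update flow now pulls explicitly and can register extra git-lfs suffixes first. A rough sketch, assuming the Repository constructor arguments and ids shown here (they are placeholders):

from modelscope.hub.repository import Repository  # assumed module path

repo = Repository(
    model_dir='./my_model',            # local working copy (argument names are assumptions)
    clone_from='my-namespace/my-model')
repo.pull()                            # sync with the remote first; push() no longer pulls implicitly
repo.add_lfs_type('*.safetensors')     # track large weight files with git-lfs
repo.push('add safetensors weights')   # commit and push local changes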
View File
@@ -116,15 +116,9 @@ class Models(object):
bad_image_detecting = 'bad-image-detecting' bad_image_detecting = 'bad-image-detecting'
controllable_image_generation = 'controllable-image-generation' controllable_image_generation = 'controllable-image-generation'
longshortnet = 'longshortnet' longshortnet = 'longshortnet'
fastinst = 'fastinst'
pedestrian_attribute_recognition = 'pedestrian-attribute-recognition' pedestrian_attribute_recognition = 'pedestrian-attribute-recognition'
# EasyCV models
yolox = 'YOLOX'
segformer = 'Segformer'
hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
image_object_detection_auto = 'image-object-detection-auto'
dino = 'DINO'
# nlp models # nlp models
bert = 'bert' bert = 'bert'
palm = 'palm-v2' palm = 'palm-v2'
@@ -177,6 +171,7 @@ class Models(object):
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_dfsmn_ans = 'speech_dfsmn_ans' speech_dfsmn_ans = 'speech_dfsmn_ans'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield' speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield'
speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k' speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k'
kws_kwsbp = 'kws-kwsbp' kws_kwsbp = 'kws-kwsbp'
@@ -187,6 +182,9 @@ class Models(object):
generic_sv = 'generic-sv' generic_sv = 'generic-sv'
ecapa_tdnn_sv = 'ecapa-tdnn-sv' ecapa_tdnn_sv = 'ecapa-tdnn-sv'
campplus_sv = 'cam++-sv' campplus_sv = 'cam++-sv'
eres2net_sv = 'eres2net-sv'
scl_sd = 'scl-sd'
rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv'
generic_lm = 'generic-lm' generic_lm = 'generic-lm'
# multi-modal models # multi-modal models
@@ -205,6 +203,8 @@ class Models(object):
hitea = 'hitea' hitea = 'hitea'
soonet = 'soonet' soonet = 'soonet'
efficient_diffusion_tuning = 'efficient-diffusion-tuning' efficient_diffusion_tuning = 'efficient-diffusion-tuning'
mplug_owl = 'mplug-owl'
clip_interrogator = 'clip-interrogator'
# science models # science models
unifold = 'unifold' unifold = 'unifold'
@@ -255,6 +255,7 @@ class Pipelines(object):
should use task name for this pipeline. should use task name for this pipeline.
For pipeline which suuport only one model, we should use ${Model}-${Task} as its name. For pipeline which suuport only one model, we should use ${Model}-${Task} as its name.
""" """
pipeline_template = 'pipeline-template'
# vision tasks # vision tasks
portrait_matting = 'unet-image-matting' portrait_matting = 'unet-image-matting'
universal_matting = 'unet-universal-matting' universal_matting = 'unet-universal-matting'
@@ -277,8 +278,6 @@ class Pipelines(object):
tbs_detection = 'tbs-detection' tbs_detection = 'tbs-detection'
object_detection = 'vit-object-detection' object_detection = 'vit-object-detection'
abnormal_object_detection = 'abnormal-object-detection' abnormal_object_detection = 'abnormal-object-detection'
easycv_detection = 'easycv-detection'
easycv_segmentation = 'easycv-segmentation'
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment' face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
salient_detection = 'u2net-salient-detection' salient_detection = 'u2net-salient-detection'
salient_boudary_detection = 'res2net-salient-detection' salient_boudary_detection = 'res2net-salient-detection'
@@ -347,7 +346,6 @@ class Pipelines(object):
video_single_object_tracking_procontext = 'procontext-vitb-video-single-object-tracking' video_single_object_tracking_procontext = 'procontext-vitb-video-single-object-tracking'
video_multi_object_tracking = 'video-multi-object-tracking' video_multi_object_tracking = 'video-multi-object-tracking'
image_panoptic_segmentation = 'image-panoptic-segmentation' image_panoptic_segmentation = 'image-panoptic-segmentation'
image_panoptic_segmentation_easycv = 'image-panoptic-segmentation-easycv'
video_summarization = 'googlenet_pgl_video_summarization' video_summarization = 'googlenet_pgl_video_summarization'
language_guided_video_summarization = 'clip-it-video-summarization' language_guided_video_summarization = 'clip-it-video-summarization'
image_semantic_segmentation = 'image-semantic-segmentation' image_semantic_segmentation = 'image-semantic-segmentation'
@@ -402,7 +400,7 @@ class Pipelines(object):
nerf_recon_acc = 'nerf-recon-acc' nerf_recon_acc = 'nerf-recon-acc'
bad_image_detecting = 'bad-image-detecting' bad_image_detecting = 'bad-image-detecting'
controllable_image_generation = 'controllable-image-generation' controllable_image_generation = 'controllable-image-generation'
fast_instance_segmentation = 'fast-instance-segmentation'
image_quality_assessment_mos = 'image-quality-assessment-mos' image_quality_assessment_mos = 'image-quality-assessment-mos'
image_quality_assessment_man = 'image-quality-assessment-man' image_quality_assessment_man = 'image-quality-assessment-man'
image_quality_assessment_degradation = 'image-quality-assessment-degradation' image_quality_assessment_degradation = 'image-quality-assessment-degradation'
@@ -485,6 +483,9 @@ class Pipelines(object):
speaker_diarization_inference = 'speaker-diarization-inference' speaker_diarization_inference = 'speaker-diarization-inference'
vad_inference = 'vad-inference' vad_inference = 'vad-inference'
speaker_verification = 'speaker-verification' speaker_verification = 'speaker-verification'
speaker_verification_rdino = 'speaker-verification-rdino'
speaker_verification_eres2net = 'speaker-verification-eres2net'
speaker_change_locating = 'speaker-change-locating'
lm_inference = 'language-score-prediction' lm_inference = 'language-score-prediction'
speech_timestamp_inference = 'speech-timestamp-inference' speech_timestamp_inference = 'speech-timestamp-inference'
@@ -514,6 +515,7 @@ class Pipelines(object):
gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding' gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
soonet_video_temporal_grounding = 'soonet-video-temporal-grounding' soonet_video_temporal_grounding = 'soonet-video-temporal-grounding'
efficient_diffusion_tuning = 'efficient-diffusion-tuning' efficient_diffusion_tuning = 'efficient-diffusion-tuning'
multimodal_dialogue = 'multimodal-dialogue'
# science tasks # science tasks
protein_structure = 'unifold-protein-structure' protein_structure = 'unifold-protein-structure'
@@ -881,6 +883,7 @@ class NLPTrainers(object):
document_grounded_dialog_rerank_trainer = 'document-grounded-dialog-rerank-trainer' document_grounded_dialog_rerank_trainer = 'document-grounded-dialog-rerank-trainer'
document_grounded_dialog_retrieval_trainer = 'document-grounded-dialog-retrieval-trainer' document_grounded_dialog_retrieval_trainer = 'document-grounded-dialog-retrieval-trainer'
siamese_uie_trainer = 'siamese-uie-trainer' siamese_uie_trainer = 'siamese-uie-trainer'
translation_evaluation_trainer = 'translation-evaluation-trainer'
class MultiModalTrainers(object): class MultiModalTrainers(object):
@@ -911,7 +914,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
""" """
default = 'trainer' default = 'trainer'
easycv = 'easycv'
tinynas_damoyolo = 'tinynas-damoyolo' tinynas_damoyolo = 'tinynas-damoyolo'
@staticmethod @staticmethod
@@ -933,8 +935,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
return Fields.multi_modal return Fields.multi_modal
elif attribute_or_value == Trainers.default: elif attribute_or_value == Trainers.default:
return Trainers.default return Trainers.default
elif attribute_or_value == Trainers.easycv:
return Trainers.easycv
else: else:
return 'unknown' return 'unknown'
@@ -1034,6 +1034,8 @@ class Preprocessors(object):
vldoc_preprocessor = 'vldoc-preprocessor' vldoc_preprocessor = 'vldoc-preprocessor'
hitea_tasks_preprocessor = 'hitea-tasks-preprocessor' hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor' diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor'
mplug_owl_preprocessor = 'mplug-owl-preprocessor'
image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor'
# science preprocessor # science preprocessor
unifold_preprocessor = 'unifold-preprocessor' unifold_preprocessor = 'unifold-preprocessor'
@@ -1098,6 +1100,8 @@ class Metrics(object):
# metric for image-colorization task # metric for image-colorization task
image_colorization_metric = 'image-colorization-metric' image_colorization_metric = 'image-colorization-metric'
ocr_recognition_metric = 'ocr-recognition-metric' ocr_recognition_metric = 'ocr-recognition-metric'
# metric for translation evaluation
translation_evaluation_metric = 'translation-evaluation-metric'
class Optimizers(object): class Optimizers(object):
@@ -1165,14 +1169,6 @@ class LR_Schedulers(object):
class CustomDatasets(object): class CustomDatasets(object):
""" Names for different datasets. """ Names for different datasets.
""" """
ClsDataset = 'ClsDataset'
Face2dKeypointsDataset = 'FaceKeypointDataset'
HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
HumanWholeBodyKeypointDataset = 'WholeBodyCocoTopDownDataset'
SegDataset = 'SegDataset'
DetDataset = 'DetDataset'
DetImagesMixDataset = 'DetImagesMixDataset'
PanopticDataset = 'PanopticDataset'
PairedDataset = 'PairedDataset' PairedDataset = 'PairedDataset'
SiddDataset = 'SiddDataset' SiddDataset = 'SiddDataset'
GoproDataset = 'GoproDataset' GoproDataset = 'GoproDataset'
View File
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
from .loss_metric import LossMetric from .loss_metric import LossMetric
from .image_colorization_metric import ImageColorizationMetric from .image_colorization_metric import ImageColorizationMetric
from .ocr_recognition_metric import OCRRecognitionMetric from .ocr_recognition_metric import OCRRecognitionMetric
from .translation_evaluation_metric import TranslationEvaluationMetric
else: else:
_import_structure = { _import_structure = {
'audio_noise_metric': ['AudioNoiseMetric'], 'audio_noise_metric': ['AudioNoiseMetric'],
@@ -62,7 +63,8 @@ else:
'text_ranking_metric': ['TextRankingMetric'], 'text_ranking_metric': ['TextRankingMetric'],
'loss_metric': ['LossMetric'], 'loss_metric': ['LossMetric'],
'image_colorization_metric': ['ImageColorizationMetric'], 'image_colorization_metric': ['ImageColorizationMetric'],
'ocr_recognition_metric': ['OCRRecognitionMetric'] 'ocr_recognition_metric': ['OCRRecognitionMetric'],
'translation_evaluation_metric': ['TranslationEvaluationMetric']
} }
import sys import sys
View File
@@ -42,6 +42,7 @@ class MetricKeys(object):
NDCG = 'ndcg' NDCG = 'ndcg'
AR = 'AR' AR = 'AR'
Colorfulness = 'colorfulness' Colorfulness = 'colorfulness'
Kendall_Tau_Correlation = 'kendall_tau_correlation'
task_default_metrics = { task_default_metrics = {
@@ -76,6 +77,7 @@ task_default_metrics = {
Tasks.bad_image_detecting: [Metrics.accuracy], Tasks.bad_image_detecting: [Metrics.accuracy],
Tasks.ocr_recognition: [Metrics.ocr_recognition_metric], Tasks.ocr_recognition: [Metrics.ocr_recognition_metric],
Tasks.efficient_diffusion_tuning: [Metrics.loss_metric], Tasks.efficient_diffusion_tuning: [Metrics.loss_metric],
Tasks.translation_evaluation: [Metrics.translation_evaluation_metric]
} }
View File
@@ -0,0 +1,174 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import importlib
from typing import Dict, List, Union
from pandas import DataFrame
from modelscope.metainfo import Metrics
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.models.nlp.unite.configuration import InputFormat
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import default_group
logger = get_logger()
@METRICS.register_module(
group_key=default_group, module_name=Metrics.translation_evaluation_metric)
class TranslationEvaluationMetric(Metric):
r"""The metric class for translation evaluation.
"""
def __init__(self, gap_threshold: float = 25.0):
r"""Build a translation evaluation metric, following the designed
Kendall's tau correlation from WMT Metrics Shared Task competitions.
Args:
gap_threshold: The score gap denoting the available hypothesis pair.
Returns:
A metric for translation evaluation.
"""
self.gap_threshold = gap_threshold
self.lp = list()
self.segment_id = list()
self.raw_score = list()
self.score = list()
self.input_format = list()
def clear(self) -> None:
r"""Clear all the stored variables.
"""
self.lp.clear()
self.segment_id.clear()
self.raw_score.clear()
self.input_format.clear()
self.score.clear()
return
def add(self, outputs: Dict[str, List[float]],
inputs: Dict[str, List[Union[float, int]]]) -> None:
r"""Collect the related results for processing.
Args:
            outputs: Dict containing the predicted 'score' list
            inputs: Dict containing the 'lp', 'segment_id', 'raw_score' and 'input_format' lists
"""
self.lp += inputs['lp']
self.segment_id += inputs['segment_id']
self.raw_score += inputs['raw_score']
self.input_format += inputs['input_format']
self.score += outputs['score']
return
def evaluate(self) -> Dict[str, Dict[str, float]]:
r"""Compute the Kendall's tau correlation.
Returns:
            A dict of Kendall's tau correlations, keyed by input format and language pair.
"""
data = {
'lp': self.lp,
'segment_id': self.segment_id,
'raw_score': self.raw_score,
'input_format': self.input_format,
'score': self.score
}
data = DataFrame(data=data)
correlation = dict()
for input_format in data.input_format.unique():
logger.info('Evaluation results for %s input format'
% input_format.value)
input_format_data = data[data.input_format == input_format]
temp_correlation = dict()
for lp in sorted(input_format_data.lp.unique()):
sub_data = input_format_data[input_format_data.lp == lp]
temp_correlation[input_format.value + '_'
+ lp] = self.compute_kendall_tau(sub_data)
logger.info(
'\t%s: %f' %
(lp,
temp_correlation[input_format.value + '_' + lp] * 100))
avg_correlation = sum(
temp_correlation.values()) / len(temp_correlation)
correlation[input_format.value + '_avg'] = avg_correlation
logger.info('Average evaluation result for %s input format: %f' %
(input_format.value, avg_correlation))
logger.info('')
correlation.update(temp_correlation)
return correlation
def merge(self, other: 'TranslationEvaluationMetric') -> None:
r"""Merge the predictions from other TranslationEvaluationMetric objects.
Args:
other: Another TranslationEvaluationMetric object.
"""
self.lp += other.lp
        self.segment_id += other.segment_id
self.raw_score += other.raw_score
self.input_format += other.input_format
self.score += other.score
return
def compute_kendall_tau(self, csv_data: DataFrame) -> float:
r"""Compute kendall's tau correlation.
Args:
csv_data: The pandas dataframe.
Returns:
float: THe kendall's Tau correlation.
"""
concor = discor = 0
        for segment_id in sorted(csv_data.segment_id.unique()):
            group_csv_data = csv_data[csv_data.segment_id == segment_id]
            examples = group_csv_data.to_dict('records')
            for i in range(0, len(examples)):
                for j in range(i + 1, len(examples)):
                    # compare the two hypotheses of this segment,
                    # not the global score lists
                    if examples[i]['raw_score'] - examples[j][
                            'raw_score'] >= self.gap_threshold:
                        if examples[i]['score'] > examples[j]['score']:
                            concor += 1
                        elif examples[i]['score'] < examples[j]['score']:
                            discor += 1
                    elif examples[i]['raw_score'] - examples[j][
                            'raw_score'] <= -self.gap_threshold:
                        if examples[i]['score'] < examples[j]['score']:
                            concor += 1
                        elif examples[i]['score'] > examples[j]['score']:
                            discor += 1
if concor + discor == 0:
logger.warning(
                'No available hypothesis pairs were found during evaluation. '
                'Marking the Kendall\'s tau correlation as the lowest value (-1.0).'
)
return -1.0
else:
return (concor - discor) / (concor + discor)
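A small self-contained sketch of the pairwise counting above, with made-up scores: only hypothesis pairs whose human (raw) score gap reaches gap_threshold are counted, and a counted pair is concordant when the metric ranks it the same way as the human scores.

raw_scores = [80.0, 52.0, 40.0]    # human scores for three hypotheses of one segment
model_scores = [0.70, 0.65, 0.30]  # metric scores for the same hypotheses
gap_threshold = 25.0

concordant = discordant = 0
for i in range(len(raw_scores)):
    for j in range(i + 1, len(raw_scores)):
        gap = raw_scores[i] - raw_scores[j]
        if abs(gap) < gap_threshold:
            continue                 # the pair is too close to be reliable, skip it
        better, worse = (i, j) if gap > 0 else (j, i)
        if model_scores[better] > model_scores[worse]:
            concordant += 1
        elif model_scores[better] < model_scores[worse]:
            discordant += 1

tau = (concordant - discordant) / (concordant + discordant)  # 1.0 here: both counted pairs agree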
View File
@@ -39,7 +39,7 @@ class ConvSTFT(nn.Module):
super(ConvSTFT, self).__init__() super(ConvSTFT, self).__init__()
if fft_len is None: if fft_len is None:
self.fft_len = np.int(2**np.ceil(np.log2(win_len))) self.fft_len = int(2**np.ceil(np.log2(win_len)))
else: else:
self.fft_len = fft_len self.fft_len = fft_len
@@ -78,7 +78,7 @@ class ConviSTFT(nn.Module):
fix=True): fix=True):
super(ConviSTFT, self).__init__() super(ConviSTFT, self).__init__()
if fft_len is None: if fft_len is None:
self.fft_len = np.int(2**np.ceil(np.log2(win_len))) self.fft_len = int(2**np.ceil(np.log2(win_len)))
else: else:
self.fft_len = fft_len self.fft_len = fft_len
kernel, window = init_kernels( kernel, window = init_kernels(
View File
@@ -45,27 +45,5 @@ class GenericAutomaticSpeechRecognition(Model):
def forward(self) -> Dict[str, Any]: def forward(self) -> Dict[str, Any]:
"""preload model and return the info of the model """preload model and return the info of the model
""" """
if self.model_cfg['model_config']['type'] == Frameworks.tf:
from easyasr import asr_inference_paraformer_tf
if hasattr(asr_inference_paraformer_tf, 'preload'):
model_workspace = self.model_cfg['model_workspace']
model_path = os.path.join(model_workspace,
self.model_cfg['am_model'])
vocab_path = os.path.join(
model_workspace,
self.model_cfg['model_config']['vocab_file'])
sampled_ids = 'seq2seq/sampled_ids'
sampled_lengths = 'seq2seq/sampled_lengths'
if 'sampled_ids' in self.model_cfg['model_config']:
sampled_ids = self.model_cfg['model_config']['sampled_ids']
if 'sampled_lengths' in self.model_cfg['model_config']:
sampled_lengths = self.model_cfg['model_config'][
'sampled_lengths']
asr_inference_paraformer_tf.preload(
ngpu=1,
asr_model_file=model_path,
vocab_file=vocab_path,
sampled_ids=sampled_ids,
sampled_lengths=sampled_lengths)
return self.model_cfg return self.model_cfg
View File
@@ -0,0 +1,233 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F
from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear
from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32
class DFSMNUnit(nn.Module):
""" one multi-channel deep fsmn unit
Args:
dimin: input dimension
dimexpand: feature expansion dimension
dimout: output dimension
        lorder: left order
rorder: right order
"""
def __init__(self,
dimin=64,
dimexpand=128,
dimout=64,
lorder=10,
rorder=1):
super(DFSMNUnit, self).__init__()
self.expand = AffineTransform(dimin, dimexpand)
self.shrink = LinearTransform(dimexpand, dimout)
self.fsmn = Fsmn(dimout, dimout, lorder, rorder, 1, 1)
self.debug = False
self.dataout = None
def forward(self, input):
"""
Args:
input: [batch, time, feature]
"""
out1 = F.relu(self.expand(input))
out2 = self.shrink(out1)
out3 = self.fsmn(out2)
# add skip connection for matched data
if input.shape[-1] == out3.shape[-1]:
out3 = input + out3
if self.debug:
self.dataout = out3
return out3
def print_model(self):
self.expand.printModel()
self.shrink.printModel()
self.fsmn.printModel()
def to_kaldi_nnet(self):
re_str = self.expand.toKaldiNNet()
relu = RectifiedLinear(self.expand.linear.out_features,
self.expand.linear.out_features)
re_str += relu.toKaldiNNet()
        re_str += self.shrink.toKaldiNNet()
re_str += self.fsmn.toKaldiNNet()
return re_str
class FSMNSeleNetV3(nn.Module):
""" Deep FSMN model with channel selection performs multi-channel kws.
Zhang, Shiliang, et al. "Deep-FSMN for large vocabulary continuous speech
recognition." 2018 IEEE International Conference on Acoustics, Speech and
Signal Processing (ICASSP). IEEE, 2018.
Args:
input_dim: input dimension
linear_dim: fsmn input dimension
proj_dim: fsmn projection dimension
lorder: fsmn left order
rorder: fsmn right order
num_syn: output dimension
fsmn_layers: no. of fsmn units
"""
def __init__(self,
input_dim=120,
linear_dim=128,
proj_dim=64,
lorder=10,
rorder=1,
num_syn=5,
fsmn_layers=5):
super(FSMNSeleNetV3, self).__init__()
self.mem = []
# the first unit, mapping input dim to proj dim
unit = DFSMNUnit(input_dim, linear_dim, proj_dim, lorder, rorder)
self.mem.append(unit)
self.add_module('mem_{:d}'.format(0), unit)
# deep fsmn layers with skip connection
for i in range(1, fsmn_layers):
unit = DFSMNUnit(proj_dim, linear_dim, proj_dim, lorder, rorder)
self.mem.append(unit)
self.add_module('mem_{:d}'.format(i), unit)
self.expand2 = AffineTransform(proj_dim, linear_dim)
self.decision = AffineTransform(linear_dim, num_syn)
def forward(self, input):
# multi-channel temp space, [batch, time, channel, feature]
        # allocate the buffer on the same device as the input to avoid a
        # CPU/CUDA mismatch when CUDA is available but the input is on CPU
        x = torch.zeros(
            input.shape[0], input.shape[1], input.shape[2],
            self.expand2.linear.out_features,
            device=input.device)
for n in range(input.shape[2]):
chin = input[:, :, n, :]
for unit in self.mem:
chout = unit(chin)
chin = chout
x[:, :, n, :] = F.relu(self.expand2(chout))
# perform max pooling
pool = nn.MaxPool2d((x.shape[2], 1), stride=(x.shape[2], 1))
y = pool(x)
# remove channel dimension
y = torch.squeeze(y, -2)
z = self.decision(y)
return z
def print_model(self):
for unit in self.mem:
unit.print_model()
self.expand2.printModel()
self.decision.printModel()
def print_header(self):
""" get DFSMN params
"""
input_dim = self.mem[0].expand.linear.in_features
linear_dim = self.mem[0].expand.linear.out_features
proj_dim = self.mem[0].shrink.linear.out_features
lorder = self.mem[0].fsmn.conv_left.kernel_size[0]
rorder = 0
if self.mem[0].fsmn.conv_right is not None:
rorder = self.mem[0].fsmn.conv_right.kernel_size[0]
num_syn = self.decision.linear.out_features
fsmn_layers = len(self.mem)
# no. of output channels, 0.0 means the same as numins
numouts = 1.0
#
# write total header
#
header = [0.0] * HEADER_BLOCK_SIZE * 5
# numins
header[0] = 0.0
# numouts
header[1] = numouts
# dimins
header[2] = input_dim
# dimouts
header[3] = num_syn
# numlayers
header[4] = 4
#
# write each layer's header
#
hidx = 1
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DFSMN.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = proj_dim
header[HEADER_BLOCK_SIZE * hidx + 5] = lorder
header[HEADER_BLOCK_SIZE * hidx + 6] = rorder
header[HEADER_BLOCK_SIZE * hidx + 7] = fsmn_layers
hidx += 1
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = proj_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_RELU.value)
hidx += 1
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_MAX_POOLING.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
hidx += 1
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
LayerType.LAYER_DENSE.value)
header[HEADER_BLOCK_SIZE * hidx + 1] = numouts
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
ActivationType.ACTIVATION_SOFTMAX.value)
for h in header:
print(f32ToI32(h))
def to_kaldi_nnet(self):
re_str = '<Nnet>\n'
for unit in self.mem:
re_str += unit.to_kaldi_nnet()
        re_str += self.expand2.toKaldiNNet()
relu = RectifiedLinear(self.expand2.linear.out_features,
self.expand2.linear.out_features)
re_str += relu.toKaldiNNet()
re_str += self.decision.toKaldiNNet()
re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features,
self.decision.linear.out_features)
re_str += '<!EndOfComponent>\n'
re_str += '</Nnet>\n'
return re_str
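A minimal forward-pass sketch for the network above; the import path is an assumption. The input layout is [batch, time, channel, feature], and the channel dimension is max-pooled away before the final decision layer.

import torch

# assumed import path for the module defined above
from modelscope.models.audio.kws.farfield.fsmn_sele_v3 import FSMNSeleNetV3

model = FSMNSeleNetV3(input_dim=120, linear_dim=128, proj_dim=64,
                      lorder=10, rorder=1, num_syn=5, fsmn_layers=5)
feats = torch.randn(2, 50, 3, 120)  # [batch, time, channel, feature]: 3 microphone channels
logits = model(feats)               # channel dimension is pooled away -> shape [2, 50, 5]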
View File
@@ -11,6 +11,7 @@ from modelscope.models.builder import MODELS
from modelscope.utils.audio.audio_utils import update_conf from modelscope.utils.audio.audio_utils import update_conf
from modelscope.utils.constant import Tasks from modelscope.utils.constant import Tasks
from .fsmn_sele_v2 import FSMNSeleNetV2 from .fsmn_sele_v2 import FSMNSeleNetV2
from .fsmn_sele_v3 import FSMNSeleNetV3
@MODELS.register_module( @MODELS.register_module(
@@ -18,6 +19,7 @@ from .fsmn_sele_v2 import FSMNSeleNetV2
class FSMNSeleNetV2Decorator(TorchModel): class FSMNSeleNetV2Decorator(TorchModel):
r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """ r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """
MODEL_CLASS = FSMNSeleNetV2
MODEL_TXT = 'model.txt' MODEL_TXT = 'model.txt'
SC_CONFIG = 'sound_connect.conf' SC_CONFIG = 'sound_connect.conf'
@@ -33,7 +35,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
""" """
super().__init__(model_dir, *args, **kwargs) super().__init__(model_dir, *args, **kwargs)
if training: if training:
self.model = FSMNSeleNetV2(*args, **kwargs) self.model = self.MODEL_CLASS(*args, **kwargs)
else: else:
sc_config_file = os.path.join(model_dir, self.SC_CONFIG) sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
model_txt_file = os.path.join(model_dir, self.MODEL_TXT) model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
@@ -42,7 +44,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
self._sc = None self._sc = None
if os.path.exists(model_txt_file): if os.path.exists(model_txt_file):
conf_dict = dict(mode=56542, kws_model=model_txt_file) conf_dict = dict(kws_model=model_txt_file)
update_conf(sc_config_file, new_config_file, conf_dict) update_conf(sc_config_file, new_config_file, conf_dict)
import py_sound_connect import py_sound_connect
self._sc = py_sound_connect.SoundConnect(new_config_file) self._sc = py_sound_connect.SoundConnect(new_config_file)
@@ -50,8 +52,8 @@ class FSMNSeleNetV2Decorator(TorchModel):
self.size_out = self._sc.bytesPerBlockOut() self.size_out = self._sc.bytesPerBlockOut()
else: else:
raise Exception( raise Exception(
f'Invalid model directory! Failed to load model file: {model_txt_file}.' f'Invalid model directory! Failed to load model file:'
) f' {model_txt_file}.')
def __del__(self): def __del__(self):
if hasattr(self, 'tmp_dir'): if hasattr(self, 'tmp_dir'):
@@ -73,3 +75,24 @@ class FSMNSeleNetV2Decorator(TorchModel):
'confidence': self._sc.kwsConfidence() 'confidence': self._sc.kwsConfidence()
} }
return result return result
@MODELS.register_module(
Tasks.keyword_spotting,
module_name=Models.speech_dfsmn_kws_char_farfield_iot)
class FSMNSeleNetV3Decorator(FSMNSeleNetV2Decorator):
r""" A decorator of FSMNSeleNetV3 for integrating into modelscope framework """
MODEL_CLASS = FSMNSeleNetV3
def __init__(self,
model_dir: str,
training: Optional[bool] = False,
*args,
**kwargs):
"""initialize the dfsmn model from the `model_dir` path.
Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, training, *args, **kwargs)
View File
@@ -76,11 +76,13 @@ class CAMPPlus(nn.Module):
bn_size=4, bn_size=4,
init_channels=128, init_channels=128,
config_str='batchnorm-relu', config_str='batchnorm-relu',
memory_efficient=True): memory_efficient=True,
output_level='segment'):
super(CAMPPlus, self).__init__() super(CAMPPlus, self).__init__()
self.head = FCM(feat_dim=feat_dim) self.head = FCM(feat_dim=feat_dim)
channels = self.head.out_channels channels = self.head.out_channels
self.output_level = output_level
self.xvector = nn.Sequential( self.xvector = nn.Sequential(
OrderedDict([ OrderedDict([
@@ -118,10 +120,14 @@ class CAMPPlus(nn.Module):
self.xvector.add_module('out_nonlinear', self.xvector.add_module('out_nonlinear',
get_nonlinear(config_str, channels)) get_nonlinear(config_str, channels))
self.xvector.add_module('stats', StatsPool()) if self.output_level == 'segment':
self.xvector.add_module( self.xvector.add_module('stats', StatsPool())
'dense', self.xvector.add_module(
DenseLayer(channels * 2, embedding_size, config_str='batchnorm_')) 'dense',
DenseLayer(
channels * 2, embedding_size, config_str='batchnorm_'))
else:
assert self.output_level == 'frame', '`output_level` should be set to \'segment\' or \'frame\'. '
for m in self.modules(): for m in self.modules():
if isinstance(m, (nn.Conv1d, nn.Linear)): if isinstance(m, (nn.Conv1d, nn.Linear)):
@@ -133,6 +139,8 @@ class CAMPPlus(nn.Module):
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
x = self.head(x) x = self.head(x)
x = self.xvector(x) x = self.xvector(x)
if self.output_level == 'frame':
x = x.transpose(1, 2)
return x return x
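A short sketch of the new output_level switch, assuming the constructor keyword names that appear in the class body above (feat_dim, embedding_size): 'segment' returns one embedding per utterance, while 'frame' skips the statistics pooling and returns frame-level features.

import torch

from modelscope.models.audio.sv.DTDNN import CAMPPlus  # import path as used elsewhere in this commit

feats = torch.randn(1, 200, 80)  # (B, T, F) fbank features
seg_model = CAMPPlus(feat_dim=80, embedding_size=192, output_level='segment')
frame_model = CAMPPlus(feat_dim=80, embedding_size=192, output_level='frame')
print(seg_model(feats).shape)    # utterance level: (B, embedding_size)
print(frame_model(feats).shape)  # frame level: (B, T', C) after the final transpose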
View File
@@ -0,0 +1,344 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve the performance. The local feature
fusion (LFF) fuses the features within one single residual block to extract the local signal.
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
"""
import math
import os
from typing import Any, Dict, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi
import modelscope.models.audio.sv.pooling_layers as pooling_layers
from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.models.audio.sv.fusion import AFF
from modelscope.utils.constant import Tasks
class ReLU(nn.Hardtanh):
def __init__(self, inplace=False):
super(ReLU, self).__init__(0, 20, inplace)
def __repr__(self):
inplace_str = 'inplace' if self.inplace else ''
return self.__class__.__name__ + ' (' \
+ inplace_str + ')'
def conv1x1(in_planes, out_planes, stride=1):
'1x1 convolution without padding'
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=1,
stride=stride,
padding=0,
bias=False)
def conv3x3(in_planes, out_planes, stride=1):
'3x3 convolution with padding'
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
class BasicBlockRes2Net(nn.Module):
expansion = 2
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
super(BasicBlockRes2Net, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = conv1x1(in_planes, width * scale, stride)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
convs = []
bns = []
for i in range(self.nums):
convs.append(conv3x3(width, width))
bns.append(nn.BatchNorm2d(width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.relu = ReLU(inplace=True)
self.conv3 = conv1x1(width * scale, planes * self.expansion)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(
in_planes,
self.expansion * planes,
kernel_size=1,
stride=stride,
bias=False), nn.BatchNorm2d(self.expansion * planes))
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = sp + spx[i]
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class BasicBlockRes2Net_diff_AFF(nn.Module):
expansion = 2
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
super(BasicBlockRes2Net_diff_AFF, self).__init__()
width = int(math.floor(planes * (baseWidth / 64.0)))
self.conv1 = conv1x1(in_planes, width * scale, stride)
self.bn1 = nn.BatchNorm2d(width * scale)
self.nums = scale
convs = []
fuse_models = []
bns = []
for i in range(self.nums):
convs.append(conv3x3(width, width))
bns.append(nn.BatchNorm2d(width))
for j in range(self.nums - 1):
fuse_models.append(AFF(channels=width))
self.convs = nn.ModuleList(convs)
self.bns = nn.ModuleList(bns)
self.fuse_models = nn.ModuleList(fuse_models)
self.relu = ReLU(inplace=True)
self.conv3 = conv1x1(width * scale, planes * self.expansion)
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(
in_planes,
self.expansion * planes,
kernel_size=1,
stride=stride,
bias=False), nn.BatchNorm2d(self.expansion * planes))
self.stride = stride
self.width = width
self.scale = scale
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
spx = torch.split(out, self.width, 1)
for i in range(self.nums):
if i == 0:
sp = spx[i]
else:
sp = self.fuse_models[i - 1](sp, spx[i])
sp = self.convs[i](sp)
sp = self.relu(self.bns[i](sp))
if i == 0:
out = sp
else:
out = torch.cat((out, sp), 1)
out = self.conv3(out)
out = self.bn3(out)
residual = self.shortcut(x)
out += residual
out = self.relu(out)
return out
class ERes2Net(nn.Module):
def __init__(self,
block=BasicBlockRes2Net,
block_fuse=BasicBlockRes2Net_diff_AFF,
num_blocks=[3, 4, 6, 3],
m_channels=32,
feat_dim=80,
embed_dim=192,
pooling_func='TSTP',
two_emb_layer=False):
super(ERes2Net, self).__init__()
self.in_planes = m_channels
self.feat_dim = feat_dim
self.embed_dim = embed_dim
self.stats_dim = int(feat_dim / 8) * m_channels * 8
self.two_emb_layer = two_emb_layer
self.conv1 = nn.Conv2d(
1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(m_channels)
self.layer1 = self._make_layer(
block, m_channels, num_blocks[0], stride=1)
self.layer2 = self._make_layer(
block, m_channels * 2, num_blocks[1], stride=2)
self.layer3 = self._make_layer(
block_fuse, m_channels * 4, num_blocks[2], stride=2)
self.layer4 = self._make_layer(
block_fuse, m_channels * 8, num_blocks[3], stride=2)
# downsampling
self.layer1_downsample = nn.Conv2d(
m_channels * 2,
m_channels * 4,
kernel_size=3,
stride=2,
padding=1,
bias=False)
self.layer2_downsample = nn.Conv2d(
m_channels * 4,
m_channels * 8,
kernel_size=3,
padding=1,
stride=2,
bias=False)
self.layer3_downsample = nn.Conv2d(
m_channels * 8,
m_channels * 16,
kernel_size=3,
padding=1,
stride=2,
bias=False)
# bottom-up fusion
self.fuse_mode12 = AFF(channels=m_channels * 4)
self.fuse_mode123 = AFF(channels=m_channels * 8)
self.fuse_mode1234 = AFF(channels=m_channels * 16)
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2
self.pool = getattr(pooling_layers, pooling_func)(
in_dim=self.stats_dim * block.expansion)
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
embed_dim)
if self.two_emb_layer:
self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False)
self.seg_2 = nn.Linear(embed_dim, embed_dim)
else:
self.seg_bn_1 = nn.Identity()
self.seg_2 = nn.Identity()
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)
def forward(self, x):
x = x.permute(0, 2, 1)
x = x.unsqueeze_(1)
out = F.relu(self.bn1(self.conv1(x)))
out1 = self.layer1(out)
# bottom-up fusion
out2 = self.layer2(out1)
out1_downsample = self.layer1_downsample(out1)
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
out3 = self.layer3(out2)
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
out4 = self.layer4(out3)
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
stats = self.pool(fuse_out1234)
embed_a = self.seg_1(stats)
if self.two_emb_layer:
out = F.relu(embed_a)
out = self.seg_bn_1(out)
embed_b = self.seg_2(out)
return embed_b
else:
return embed_a
@MODELS.register_module(
Tasks.speaker_verification, module_name=Models.eres2net_sv)
class SpeakerVerificationERes2Net(TorchModel):
r"""Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthen the local information
interaction. GFF fuses multi-scale feature maps in bottom-up pathway to obtain global information.
Args:
model_dir: A model dir.
model_config: The model config.
"""
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
**kwargs):
super().__init__(model_dir, model_config, *args, **kwargs)
self.model_config = model_config
self.other_config = kwargs
self.feature_dim = 80
self.embedding_model = ERes2Net()
pretrained_model_name = kwargs['pretrained_model']
self.__load_check_point(pretrained_model_name)
self.embedding_model.eval()
def forward(self, audio):
assert len(audio.shape) == 2 and audio.shape[
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
# audio shape: [1, T]
feature = self.__extract_feature(audio)
embedding = self.embedding_model(feature)
return embedding
def __extract_feature(self, audio):
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
feature = feature - feature.mean(dim=0, keepdim=True)
feature = feature.unsqueeze(0)
return feature
def __load_check_point(self, pretrained_model_name, device=None):
if not device:
device = torch.device('cpu')
self.embedding_model.load_state_dict(
torch.load(
os.path.join(self.model_dir, pretrained_model_name),
map_location=device),
strict=True)
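A minimal embedding-extraction sketch for the backbone above; the import path is an assumption, and within ModelScope the registered model class and pipeline would normally handle feature extraction and loading. With the defaults, (B, T, 80) fbank features map to a (B, 192) speaker embedding.

import torch

# assumed import path for the module defined above
from modelscope.models.audio.sv.ERes2Net import ERes2Net

model = ERes2Net(feat_dim=80, embed_dim=192)
model.eval()
feats = torch.randn(1, 200, 80)  # (B, T, F): about 2 s of 80-dim fbank features
with torch.no_grad():
    emb = model(feats)           # (1, 192) utterance-level speaker embedding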
View File
@@ -0,0 +1,32 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
class AFF(nn.Module):
def __init__(self, channels=64, r=4):
super(AFF, self).__init__()
inter_channels = int(channels // r)
self.local_att = nn.Sequential(
nn.Conv2d(
channels * 2,
inter_channels,
kernel_size=1,
stride=1,
padding=0),
nn.BatchNorm2d(inter_channels),
nn.SiLU(inplace=True),
nn.Conv2d(
inter_channels, channels, kernel_size=1, stride=1, padding=0),
nn.BatchNorm2d(channels),
)
def forward(self, x, ds_y):
xa = torch.cat((x, ds_y), dim=1)
x_att = self.local_att(xa)
x_att = 1.0 + torch.tanh(x_att)
xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
return xo
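The fusion gate above is 1 + tanh(.), which lies in (0, 2), so each position mixes the two feature maps with complementary weights w and 2 - w. A small shape sketch, using the import path referenced by the ERes2Net file above:

import torch

from modelscope.models.audio.sv.fusion import AFF  # path as imported by the ERes2Net model above

aff = AFF(channels=64)
x = torch.randn(2, 64, 20, 50)     # (B, C, F, T) feature map from the current stage
y_ds = torch.randn(2, 64, 20, 50)  # downsampled feature map from an earlier stage, same shape
fused = aff(x, y_ds)               # same shape as the inputs: (2, 64, 20, 50)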
View File
@@ -0,0 +1,107 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker.
"""
import torch
import torch.nn as nn
class TAP(nn.Module):
"""
Temporal average pooling, only first-order mean is considered
"""
def __init__(self, **kwargs):
super(TAP, self).__init__()
def forward(self, x):
pooling_mean = x.mean(dim=-1)
        # To be compatible with 2D input
pooling_mean = pooling_mean.flatten(start_dim=1)
return pooling_mean
class TSDP(nn.Module):
"""
Temporal standard deviation pooling, only second-order std is considered
"""
def __init__(self, **kwargs):
super(TSDP, self).__init__()
def forward(self, x):
# The last dimension is the temporal axis
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
pooling_std = pooling_std.flatten(start_dim=1)
return pooling_std
class TSTP(nn.Module):
"""
Temporal statistics pooling, concatenate mean and std, which is used in
x-vector
Comment: simple concatenation can not make full use of both statistics
"""
def __init__(self, **kwargs):
super(TSTP, self).__init__()
def forward(self, x):
# The last dimension is the temporal axis
pooling_mean = x.mean(dim=-1)
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
pooling_mean = pooling_mean.flatten(start_dim=1)
pooling_std = pooling_std.flatten(start_dim=1)
stats = torch.cat((pooling_mean, pooling_std), 1)
return stats
class ASTP(nn.Module):
""" Attentive statistics pooling: Channel- and context-dependent
statistics pooling, first used in ECAPA_TDNN.
"""
def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
super(ASTP, self).__init__()
self.global_context_att = global_context_att
# Use Conv1d with stride == 1 rather than Linear, then we don't
# need to transpose inputs.
if global_context_att:
self.linear1 = nn.Conv1d(
in_dim * 3, bottleneck_dim,
kernel_size=1) # equals W and b in the paper
else:
self.linear1 = nn.Conv1d(
in_dim, bottleneck_dim,
kernel_size=1) # equals W and b in the paper
self.linear2 = nn.Conv1d(
bottleneck_dim, in_dim,
kernel_size=1) # equals V and k in the paper
def forward(self, x):
"""
x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
or a 4-dimensional tensor in resnet architecture (B,C,F,T)
0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
"""
if len(x.shape) == 4:
x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
assert len(x.shape) == 3
if self.global_context_att:
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
context_std = torch.sqrt(
torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
x_in = torch.cat((x, context_mean, context_std), dim=1)
else:
x_in = x
        # DON'T use ReLU here! ReLU may make training hard to converge.
alpha = torch.tanh(
self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
alpha = torch.softmax(self.linear2(alpha), dim=2)
mean = torch.sum(alpha * x, dim=2)
var = torch.sum(alpha * (x**2), dim=2) - mean**2
std = torch.sqrt(var.clamp(min=1e-10))
return torch.cat([mean, std], dim=1)
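All of the poolers above collapse the time axis of a (B, F, T) feature map; a quick shape sketch, using the module path imported by the ERes2Net file above:

import torch

import modelscope.models.audio.sv.pooling_layers as pooling_layers  # path as imported above

x = torch.randn(2, 256, 100)                     # (B, F, T) frame-level features
print(pooling_layers.TAP()(x).shape)             # (2, 256): temporal mean only
print(pooling_layers.TSDP()(x).shape)            # (2, 256): temporal std only
print(pooling_layers.TSTP()(x).shape)            # (2, 512): mean and std concatenated
print(pooling_layers.ASTP(in_dim=256)(x).shape)  # (2, 512): attention-weighted mean and std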
View File
@@ -0,0 +1,573 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
The RDINOHead implementation is adapted from the DINO framework.
"""
import math
import os
from typing import Any, Dict, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi
from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.utils.constant import Tasks
def length_to_mask(length, max_len=None, dtype=None, device=None):
assert len(length.shape) == 1
if max_len is None:
max_len = length.max().long().item()
mask = torch.arange(
max_len, device=length.device, dtype=length.dtype).expand(
len(length), max_len) < length.unsqueeze(1)
if dtype is None:
dtype = length.dtype
if device is None:
device = length.device
mask = torch.as_tensor(mask, dtype=dtype, device=device)
return mask
def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
if stride > 1:
n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
L_out = stride * (n_steps - 1) + kernel_size * dilation
padding = [kernel_size // 2, kernel_size // 2]
else:
L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
return padding
class Conv1d(nn.Module):
def __init__(
self,
out_channels,
kernel_size,
in_channels,
stride=1,
dilation=1,
padding='same',
groups=1,
bias=True,
padding_mode='reflect',
):
super().__init__()
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
self.padding = padding
self.padding_mode = padding_mode
self.conv = nn.Conv1d(
in_channels,
out_channels,
self.kernel_size,
stride=self.stride,
dilation=self.dilation,
padding=0,
groups=groups,
bias=bias,
)
def forward(self, x):
if self.padding == 'same':
x = self._manage_padding(x, self.kernel_size, self.dilation,
self.stride)
elif self.padding == 'causal':
num_pad = (self.kernel_size - 1) * self.dilation
x = F.pad(x, (num_pad, 0))
elif self.padding == 'valid':
pass
else:
raise ValueError(
"Padding must be 'same', 'valid' or 'causal'. Got "
+ self.padding)
wx = self.conv(x)
return wx
def _manage_padding(
self,
x,
kernel_size: int,
dilation: int,
stride: int,
):
L_in = x.shape[-1]
padding = get_padding_elem(L_in, stride, kernel_size, dilation)
x = F.pad(x, padding, mode=self.padding_mode)
return x
class BatchNorm1d(nn.Module):
def __init__(
self,
input_size,
eps=1e-05,
momentum=0.1,
):
super().__init__()
self.norm = nn.BatchNorm1d(
input_size,
eps=eps,
momentum=momentum,
)
def forward(self, x):
return self.norm(x)
class TDNNBlock(nn.Module):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
dilation,
activation=nn.ReLU,
groups=1,
):
super(TDNNBlock, self).__init__()
self.conv = Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
dilation=dilation,
groups=groups,
)
self.activation = activation()
self.norm = BatchNorm1d(input_size=out_channels)
def forward(self, x):
return self.norm(self.activation(self.conv(x)))
class Res2NetBlock(torch.nn.Module):
def __init__(self,
in_channels,
out_channels,
scale=8,
kernel_size=3,
dilation=1):
super(Res2NetBlock, self).__init__()
assert in_channels % scale == 0
assert out_channels % scale == 0
in_channel = in_channels // scale
hidden_channel = out_channels // scale
self.blocks = nn.ModuleList([
TDNNBlock(
in_channel,
hidden_channel,
kernel_size=kernel_size,
dilation=dilation,
) for i in range(scale - 1)
])
self.scale = scale
def forward(self, x):
y = []
for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
if i == 0:
y_i = x_i
elif i == 1:
y_i = self.blocks[i - 1](x_i)
else:
y_i = self.blocks[i - 1](x_i + y_i)
y.append(y_i)
y = torch.cat(y, dim=1)
return y
class SEBlock(nn.Module):
def __init__(self, in_channels, se_channels, out_channels):
super(SEBlock, self).__init__()
self.conv1 = Conv1d(
in_channels=in_channels, out_channels=se_channels, kernel_size=1)
self.relu = torch.nn.ReLU(inplace=True)
self.conv2 = Conv1d(
in_channels=se_channels, out_channels=out_channels, kernel_size=1)
self.sigmoid = torch.nn.Sigmoid()
def forward(self, x, lengths=None):
L = x.shape[-1]
if lengths is not None:
mask = length_to_mask(lengths * L, max_len=L, device=x.device)
mask = mask.unsqueeze(1)
total = mask.sum(dim=2, keepdim=True)
s = (x * mask).sum(dim=2, keepdim=True) / total
else:
s = x.mean(dim=2, keepdim=True)
s = self.relu(self.conv1(s))
s = self.sigmoid(self.conv2(s))
return s * x
class AttentiveStatisticsPooling(nn.Module):
def __init__(self, channels, attention_channels=128, global_context=True):
super().__init__()
self.eps = 1e-12
self.global_context = global_context
if global_context:
self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
else:
self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
self.tanh = nn.Tanh()
self.conv = Conv1d(
in_channels=attention_channels,
out_channels=channels,
kernel_size=1)
def forward(self, x, lengths=None):
L = x.shape[-1]
def _compute_statistics(x, m, dim=2, eps=self.eps):
mean = (m * x).sum(dim)
std = torch.sqrt(
(m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
return mean, std
if lengths is None:
lengths = torch.ones(x.shape[0], device=x.device)
# Make binary mask of shape [N, 1, L]
mask = length_to_mask(lengths * L, max_len=L, device=x.device)
mask = mask.unsqueeze(1)
# Expand the temporal context of the pooling layer by allowing the
# self-attention to look at global properties of the utterance.
if self.global_context:
# torch.std is unstable for backward computation
# https://github.com/pytorch/pytorch/issues/4320
total = mask.sum(dim=2, keepdim=True).float()
mean, std = _compute_statistics(x, mask / total)
mean = mean.unsqueeze(2).repeat(1, 1, L)
std = std.unsqueeze(2).repeat(1, 1, L)
attn = torch.cat([x, mean, std], dim=1)
else:
attn = x
# Apply layers
attn = self.conv(self.tanh(self.tdnn(attn)))
# Filter out zero-paddings
attn = attn.masked_fill(mask == 0, float('-inf'))
attn = F.softmax(attn, dim=2)
mean, std = _compute_statistics(x, attn)
# Append mean and std of the batch
pooled_stats = torch.cat((mean, std), dim=1)
pooled_stats = pooled_stats.unsqueeze(2)
return pooled_stats
class SERes2NetBlock(nn.Module):
def __init__(
self,
in_channels,
out_channels,
res2net_scale=8,
se_channels=128,
kernel_size=1,
dilation=1,
activation=torch.nn.ReLU,
groups=1,
):
super().__init__()
self.out_channels = out_channels
self.tdnn1 = TDNNBlock(
in_channels,
out_channels,
kernel_size=1,
dilation=1,
activation=activation,
groups=groups,
)
self.res2net_block = Res2NetBlock(out_channels, out_channels,
res2net_scale, kernel_size, dilation)
self.tdnn2 = TDNNBlock(
out_channels,
out_channels,
kernel_size=1,
dilation=1,
activation=activation,
groups=groups,
)
self.se_block = SEBlock(out_channels, se_channels, out_channels)
self.shortcut = None
if in_channels != out_channels:
self.shortcut = Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
)
def forward(self, x, lengths=None):
residual = x
if self.shortcut:
residual = self.shortcut(x)
x = self.tdnn1(x)
x = self.res2net_block(x)
x = self.tdnn2(x)
x = self.se_block(x, lengths)
return x + residual
class ECAPA_TDNN(nn.Module):
"""An implementation of the speaker embedding model in a paper.
"ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
"""
def __init__(
self,
input_size,
device='cpu',
lin_neurons=512,
activation=torch.nn.ReLU,
channels=[512, 512, 512, 512, 1536],
kernel_sizes=[5, 3, 3, 3, 1],
dilations=[1, 2, 3, 4, 1],
attention_channels=128,
res2net_scale=8,
se_channels=128,
global_context=True,
groups=[1, 1, 1, 1, 1],
):
super().__init__()
assert len(channels) == len(kernel_sizes)
assert len(channels) == len(dilations)
self.channels = channels
self.blocks = nn.ModuleList()
# The initial TDNN layer
self.blocks.append(
TDNNBlock(
input_size,
channels[0],
kernel_sizes[0],
dilations[0],
activation,
groups[0],
))
# SE-Res2Net layers
for i in range(1, len(channels) - 1):
self.blocks.append(
SERes2NetBlock(
channels[i - 1],
channels[i],
res2net_scale=res2net_scale,
se_channels=se_channels,
kernel_size=kernel_sizes[i],
dilation=dilations[i],
activation=activation,
groups=groups[i],
))
# Multi-layer feature aggregation
self.mfa = TDNNBlock(
channels[-1],
channels[-1],
kernel_sizes[-1],
dilations[-1],
activation,
groups=groups[-1],
)
# Attentive Statistical Pooling
self.asp = AttentiveStatisticsPooling(
channels[-1],
attention_channels=attention_channels,
global_context=global_context,
)
self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
# Final linear transformation
self.fc = Conv1d(
in_channels=channels[-1] * 2,
out_channels=lin_neurons,
kernel_size=1,
)
def forward(self, x, lengths=None):
"""Returns the embedding vector.
Arguments
---------
x : torch.Tensor
Tensor of shape (batch, time, channel).
"""
x = x.transpose(1, 2)
xl = []
for layer in self.blocks:
try:
x = layer(x, lengths=lengths)
except TypeError:
x = layer(x)
xl.append(x)
# Multi-layer feature aggregation
x = torch.cat(xl[1:], dim=1)
x = self.mfa(x)
# Attentive Statistical Pooling
x = self.asp(x, lengths=lengths)
x = self.asp_bn(x)
# Final linear transformation
x = self.fc(x)
x = x.transpose(1, 2).squeeze(1)
return x
class RDINOHead(nn.Module):
def __init__(self,
in_dim,
out_dim,
use_bn=False,
norm_last_layer=True,
nlayers=3,
hidden_dim=2048,
bottleneck_dim=256,
add_dim=8192):
super().__init__()
nlayers = max(nlayers, 1)
if nlayers == 1:
self.mlp = nn.Linear(in_dim, bottleneck_dim)
else:
layers = [nn.Linear(in_dim, hidden_dim)]
if use_bn:
layers.append(nn.BatchNorm1d(hidden_dim))
layers.append(nn.GELU())
for _ in range(nlayers - 2):
layers.append(nn.Linear(hidden_dim, hidden_dim))
if use_bn:
layers.append(nn.BatchNorm1d(hidden_dim))
layers.append(nn.GELU())
layers.append(nn.Linear(hidden_dim, add_dim))
self.mlp = nn.Sequential(*layers)
self.add_layer = nn.Linear(add_dim, bottleneck_dim)
self.apply(self._init_weights)
self.last_layer = nn.utils.weight_norm(
nn.Linear(bottleneck_dim, out_dim, bias=False))
self.last_layer.weight_g.data.fill_(1)
if norm_last_layer:
self.last_layer.weight_g.requires_grad = False
def _init_weights(self, m):
if isinstance(m, nn.Linear):
torch.nn.init.trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
def forward(self, x):
vicr_out = self.mlp(x)
x = self.add_layer(vicr_out)
x = nn.functional.normalize(x, dim=-1, p=2)
x = self.last_layer(x)
return vicr_out, x
class Combine(nn.Module):
def __init__(self, backbone, head):
super(Combine, self).__init__()
self.backbone = backbone
self.head = head
def forward(self, x):
x = self.backbone(x)
output = self.head(x)
return output
@MODELS.register_module(
Tasks.speaker_verification, module_name=Models.rdino_tdnn_sv)
class SpeakerVerification_RDINO(TorchModel):
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
**kwargs):
super().__init__(model_dir, model_config, *args, **kwargs)
self.model_config = model_config
self.other_config = kwargs
if self.model_config['channel'] != 1024:
raise ValueError(
'modelscope error: Currently only 1024-channel ecapa tdnn is supported.'
)
self.feature_dim = 80
channels_config = [1024, 1024, 1024, 1024, 3072]
self.embedding_model = ECAPA_TDNN(
self.feature_dim, channels=channels_config)
self.embedding_model = Combine(self.embedding_model,
RDINOHead(512, 65536, True))
pretrained_model_name = kwargs['pretrained_model']
self.__load_check_point(pretrained_model_name)
self.embedding_model.eval()
def forward(self, audio):
assert len(audio.shape) == 2 and audio.shape[
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
# audio shape: [1, T]
feature = self.__extract_feature(audio)
embedding = self.embedding_model.backbone(feature)
return embedding
def __extract_feature(self, audio):
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
feature = feature - feature.mean(dim=0, keepdim=True)
feature = feature.unsqueeze(0)
return feature
def __load_check_point(self, pretrained_model_name, device=None):
if not device:
device = torch.device('cpu')
state_dict = torch.load(
os.path.join(self.model_dir, pretrained_model_name),
map_location=device)
state_dict_tea = {
k.replace('module.', ''): v
for k, v in state_dict['teacher'].items()
}
self.embedding_model.load_state_dict(state_dict_tea, strict=True)
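For verification, SpeakerVerification_RDINO above only runs the ECAPA-TDNN backbone to produce embeddings, and two utterances can then be compared by cosine similarity. A rough sketch with random features; the import path is an assumption.

import torch
import torch.nn.functional as F

# assumed import path for the module defined above
from modelscope.models.audio.sv.rdino import ECAPA_TDNN

backbone = ECAPA_TDNN(input_size=80, lin_neurons=512,
                      channels=[1024, 1024, 1024, 1024, 3072])  # the 1024-channel config used above
backbone.eval()
feats_a = torch.randn(1, 200, 80)  # (B, T, F) fbank features of utterance A
feats_b = torch.randn(1, 200, 80)  # (B, T, F) fbank features of utterance B
with torch.no_grad():
    emb_a, emb_b = backbone(feats_a), backbone(feats_b)  # each (1, 512)
score = F.cosine_similarity(emb_a, emb_b).item()         # higher score -> more likely the same speaker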
View File
@@ -0,0 +1,319 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from collections import OrderedDict
from typing import Any, Dict, Union
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi
from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.models.audio.sv.DTDNN import CAMPPlus
from modelscope.utils.constant import Tasks
class MultiHeadSelfAttention(nn.Module):
def __init__(self, n_units, h=8, dropout=0.1):
super(MultiHeadSelfAttention, self).__init__()
self.linearQ = nn.Linear(n_units, n_units)
self.linearK = nn.Linear(n_units, n_units)
self.linearV = nn.Linear(n_units, n_units)
self.linearO = nn.Linear(n_units, n_units)
self.d_k = n_units // h
self.h = h
self.dropout = nn.Dropout(p=dropout)
self.att = None
def forward(self, x, batch_size):
# x: (BT, F)
q = self.linearQ(x).reshape(batch_size, -1, self.h, self.d_k)
k = self.linearK(x).reshape(batch_size, -1, self.h, self.d_k)
v = self.linearV(x).reshape(batch_size, -1, self.h, self.d_k)
scores = torch.matmul(q.transpose(1, 2), k.permute(
0, 2, 3, 1)) / np.sqrt(self.d_k)
# scores: (B, h, T, T)
self.att = F.softmax(scores, dim=3)
p_att = self.dropout(self.att)
# v : (B, T, h, d_k)
# p_att : (B, h, T, T)
x = torch.matmul(p_att, v.transpose(1, 2))
# x : (B, h, T, d_k)
x = x.transpose(1, 2).reshape(-1, self.h * self.d_k)
return self.linearO(x)
class PositionwiseFeedForward(nn.Module):
def __init__(self, n_units, d_units, dropout):
super(PositionwiseFeedForward, self).__init__()
self.linear1 = nn.Linear(n_units, d_units)
self.linear2 = nn.Linear(d_units, n_units)
self.dropout = nn.Dropout(p=dropout)
def forward(self, x):
return self.linear2(self.dropout(F.relu(self.linear1(x))))
class PosEncoding(nn.Module):
def __init__(self, max_seq_len, d_word_vec):
super(PosEncoding, self).__init__()
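# Sinusoidal table: PE[pos, 2j] = sin(pos / 10000^(2j / d)) and
# PE[pos, 2j + 1] = cos(pos / 10000^(2j / d)); a zero row is prepended so
# index 0 serves as padding.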
pos_enc = np.array([[
pos / np.power(10000, 2.0 * (j // 2) / d_word_vec)
for j in range(d_word_vec)
] for pos in range(max_seq_len)])
pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2])
pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2])
pad_row = np.zeros([1, d_word_vec])
pos_enc = np.concatenate([pad_row, pos_enc]).astype(np.float32)
self.pos_enc = torch.nn.Embedding(max_seq_len + 1, d_word_vec)
self.pos_enc.weight = torch.nn.Parameter(
torch.from_numpy(pos_enc), requires_grad=False)
def forward(self, input_len):
max_len = torch.max(input_len)
input_pos = torch.LongTensor([
list(range(1, length + 1)) + [0] * (max_len - length)
for length in input_len
])
return self.pos_enc(input_pos)
class TransformerEncoder(nn.Module):
def __init__(self,
idim,
n_units=256,
n_layers=2,
e_units=512,
h=4,
dropout=0.1):
super(TransformerEncoder, self).__init__()
self.linear_in = nn.Linear(idim, n_units)
self.lnorm_in = nn.LayerNorm(n_units)
self.n_layers = n_layers
self.dropout = nn.Dropout(p=dropout)
for i in range(n_layers):
setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
setattr(self, '{}{:d}'.format('self_att_', i),
MultiHeadSelfAttention(n_units, h))
setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
setattr(self, '{}{:d}'.format('ff_', i),
PositionwiseFeedForward(n_units, e_units, dropout))
self.lnorm_out = nn.LayerNorm(n_units)
def forward(self, x):
# x: [B, num_anchors, T, n_in]
bs, num, tframe, dim = x.size()
x = x.reshape(bs * num, tframe, -1) # [B*num_anchors, T, dim]
# x: (B, T, F) ... batch, time, (mel)freq
B_size, T_size, _ = x.shape
# e: (BT, F)
e = self.linear_in(x.reshape(B_size * T_size, -1))
# Encoder stack
for i in range(self.n_layers):
# layer normalization
e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
# self-attention
s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
# residual
e = e + self.dropout(s)
# layer normalization
e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
# positionwise feed-forward
s = getattr(self, '{}{:d}'.format('ff_', i))(e)
# residual
e = e + self.dropout(s)
# final layer normalization
# output: (BT, F)
# output: (B, T, F)
output = self.lnorm_out(e).reshape(B_size, T_size, -1)
output = output.reshape(bs, num, tframe,
-1) # [B, num_anchors, T, dim]
return output
class TransformerEncoder_out(nn.Module):
def __init__(self,
idim,
n_units=256,
n_layers=2,
e_units=512,
h=4,
dropout=0.1):
super(TransformerEncoder_out, self).__init__()
self.linear_in = nn.Linear(idim, n_units)
self.lnorm_in = nn.LayerNorm(n_units)
self.n_layers = n_layers
self.dropout = nn.Dropout(p=dropout)
for i in range(n_layers):
setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
setattr(self, '{}{:d}'.format('self_att_', i),
MultiHeadSelfAttention(n_units, h))
setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
setattr(self, '{}{:d}'.format('ff_', i),
PositionwiseFeedForward(n_units, e_units, dropout))
self.lnorm_out = nn.LayerNorm(n_units)
def forward(self, x):
# x: (B, T, F)
B_size, T_size, _ = x.shape
# e: (BT, F)
e = self.linear_in(x.reshape(B_size * T_size, -1))
# Encoder stack
for i in range(self.n_layers):
# layer normalization
e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
# self-attention
s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
# residual
e = e + self.dropout(s)
# layer normalization
e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
# positionwise feed-forward
s = getattr(self, '{}{:d}'.format('ff_', i))(e)
# residual
e = e + self.dropout(s)
# final layer normalization
# output: (BT, F)
# output: (B, T, F)
output = self.lnorm_out(e).reshape(B_size, T_size, -1)
return output
class OutLayer(nn.Module):
def __init__(self, n_units=256, num_anchors=2):
super(OutLayer, self).__init__()
self.combine = TransformerEncoder_out(num_anchors * n_units, n_units)
self.out_linear = nn.Linear(n_units // num_anchors, 1)
def forward(self, input):
# input: [B, num_anchors, T, dim]
bs, num, tframe, dim = input.size()
output = input.permute(0, 2, 1,
3).reshape(bs, tframe,
-1) # [Bs, t, num_anchors*dim]
output = self.combine(output) # [Bs, t, n_units]
output = output.reshape(
bs, tframe, num, -1) # [Bs, t, num_anchors, n_units//num_anchors]
output = self.out_linear(output).squeeze(-1) # [Bs, t, num_anchors]
return output
class TransformerDetector(nn.Module):
def __init__(self,
frame_dim=512,
anchor_dim=192,
hidden_dim=256,
max_seq_len=1000):
super(TransformerDetector, self).__init__()
self.detection = TransformerEncoder(
idim=frame_dim + anchor_dim, n_units=hidden_dim)
self.output = OutLayer(n_units=hidden_dim)
self.pos_enc = PosEncoding(max_seq_len, hidden_dim)
def forward(self, feats, anchors):
# feats: [1, t, fdim]
num_frames = feats.shape[1]
num_anchors = anchors.shape[1]
bs = feats.shape[0]
feats = feats.unsqueeze(1).repeat(
1, num_anchors, 1, 1) # shape: [Bs, num_anchors, t, fdim]
anchors = anchors.unsqueeze(2).repeat(
1, 1, num_frames, 1) # shape: [Bs, num_anchors, t, xdim]
sd_in = torch.cat((feats, anchors),
dim=-1) # shape: [Bs, num_anchors, t, fdim+xdim]
sd_out = self.detection(sd_in) # shape: [Bs, num_anchors, t, sd_dim]
# pos
pos_emb = self.pos_enc(torch.tensor([num_frames] * (bs * num_anchors)))
pos_emb = pos_emb.reshape(bs, num_anchors, num_frames, -1)
sd_out += pos_emb
# output
output = self.output(sd_out) # shape: [Bs, t, num_anchors]
return output
@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.scl_sd)
class SpeakerChangeLocatorTransformer(TorchModel):
r"""A speaekr change locator using the transformer architecture as the backbone.
Args:
model_dir: The model directory containing the pretrained encoder and backend checkpoints.
model_config: The model config dict; requires 'fbank_dim', 'frame_size' and 'anchor_size'.
"""
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
**kwargs):
super().__init__(model_dir, model_config, *args, **kwargs)
self.model_config = model_config
self.feature_dim = self.model_config['fbank_dim']
frame_size = self.model_config['frame_size']
anchor_size = self.model_config['anchor_size']
self.encoder = CAMPPlus(self.feature_dim, output_level='frame')
self.backend = TransformerDetector(
frame_dim=frame_size, anchor_dim=anchor_size)
pretrained_encoder = kwargs['pretrained_encoder']
pretrained_backend = kwargs['pretrained_backend']
self.__load_check_point(pretrained_encoder, pretrained_backend)
self.encoder.eval()
self.backend.eval()
def forward(self, audio, anchors):
assert len(audio.shape) == 2 and audio.shape[
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
assert len(
anchors.shape
) == 3 and anchors.shape[0] == 1 and anchors.shape[
1] == 2, 'modelscope error: the shape of input anchors to model needs to be [1, 2, D]'
# audio shape: [1, T]
feature = self.__extract_feature(audio)
frame_state = self.encoder(feature)
output = self.backend(frame_state, anchors)
output = output.squeeze(0).detach().cpu().sigmoid()
time_scale_factor = int(np.ceil(feature.shape[1] / output.shape[0]))
output = output.unsqueeze(1).expand(-1, time_scale_factor,
-1).reshape(-1, output.shape[-1])
return output
def __extract_feature(self, audio):
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
feature = feature - feature.mean(dim=0, keepdim=True)
feature = feature.unsqueeze(0)
return feature
def __load_check_point(self,
pretrained_encoder,
pretrained_backend,
device=None):
if not device:
device = torch.device('cpu')
self.encoder.load_state_dict(
torch.load(
os.path.join(self.model_dir, pretrained_encoder),
map_location=device))
self.backend.load_state_dict(
torch.load(
os.path.join(self.model_dir, pretrained_backend),
map_location=device))
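# Minimal usage sketch; the directory and checkpoint names below are
# placeholders, and the config values mirror the keys read in __init__:
import torch

model = SpeakerChangeLocatorTransformer(
    model_dir='/path/to/model_dir',                               # hypothetical path
    model_config={'fbank_dim': 80, 'frame_size': 512, 'anchor_size': 192},
    pretrained_encoder='campplus_encoder.bin',                    # hypothetical file name
    pretrained_backend='transformer_backend.bin')                 # hypothetical file name
wav = torch.randn(1, 16000)                                       # [1, T] waveform
anchors = torch.randn(1, 2, 192)                                  # [1, 2, D] speaker anchor embeddings
scores = model(wav, anchors)                                      # frame-level scores, one column per anchor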

View File

@@ -17,11 +17,9 @@ from kantts.train.trainer import GAN_Trainer, Sambert_Trainer, distributed_init
from kantts.utils.ling_unit.ling_unit import KanTtsLinguisticUnit from kantts.utils.ling_unit.ling_unit import KanTtsLinguisticUnit
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from modelscope import __version__
from modelscope.utils.audio.audio_utils import TtsCustomParams from modelscope.utils.audio.audio_utils import TtsCustomParams
from modelscope.utils.audio.tts_exceptions import ( from modelscope.utils.audio.tts_exceptions import (
TtsModelConfigurationException, TtsModelNotExistsException) TtsModelConfigurationException, TtsModelNotExistsException)
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger from modelscope.utils.logger import get_logger
logger = get_logger() logger = get_logger()
@@ -394,6 +392,7 @@ class Voice:
logger.info(f'TRAINING steps: {train_max_steps}') logger.info(f'TRAINING steps: {train_max_steps}')
config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S', config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
time.localtime()) time.localtime())
from modelscope import __version__
config['modelscope_version'] = __version__ config['modelscope_version'] = __version__
with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f: with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
@@ -558,6 +557,7 @@ class Voice:
logger.info(f'resume from: {resume_from}') logger.info(f'resume from: {resume_from}')
config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S', config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
time.localtime()) time.localtime())
from modelscope import __version__
config['modelscope_version'] = __version__ config['modelscope_version'] = __version__
with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f: with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:

View File

@@ -4,9 +4,8 @@
from . import (action_recognition, animal_recognition, bad_image_detecting, from . import (action_recognition, animal_recognition, bad_image_detecting,
body_2d_keypoints, body_3d_keypoints, cartoon, body_2d_keypoints, body_3d_keypoints, cartoon,
cmdssl_video_embedding, controllable_image_generation, cmdssl_video_embedding, controllable_image_generation,
crowd_counting, face_2d_keypoints, face_detection, crowd_counting, face_detection, face_generation,
face_generation, face_reconstruction, human_reconstruction, face_reconstruction, human_reconstruction, image_classification,
human_wholebody_keypoint, image_classification,
image_color_enhance, image_colorization, image_defrcn_fewshot, image_color_enhance, image_colorization, image_defrcn_fewshot,
image_denoise, image_inpainting, image_instance_segmentation, image_denoise, image_inpainting, image_instance_segmentation,
image_matching, image_mvs_depth_estimation, image_matching, image_mvs_depth_estimation,

View File

@@ -72,7 +72,7 @@ class PoseHighResolutionNetV2(TorchModel):
self.stage4, pre_stage_channels = self._make_stage( self.stage4, pre_stage_channels = self._make_stage(
self.stage4_cfg, num_channels, multi_scale_output=True) self.stage4_cfg, num_channels, multi_scale_output=True)
"""final four layers""" """final four layers"""
last_inp_channels = np.int(np.sum(pre_stage_channels)) last_inp_channels = int(np.sum(pre_stage_channels))
self.final_layer = nn.Sequential( self.final_layer = nn.Sequential(
nn.Conv2d( nn.Conv2d(
in_channels=last_inp_channels, in_channels=last_inp_channels,

View File

@@ -81,7 +81,7 @@ class FaceLandmark:
bbox[2] = center[0] + one_edge // 2 bbox[2] = center[0] + one_edge // 2
bbox[3] = center[1] + one_edge // 2 bbox[3] = center[1] + one_edge // 2
bbox = bbox.astype(np.int) bbox = bbox.astype(int)
crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :] crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
h, w, _ = crop_image.shape h, w, _ = crop_image.shape
crop_image = cv2.resize( crop_image = cv2.resize(

View File

@@ -356,7 +356,7 @@ class HighResolutionNet(nn.Module):
num_channels) num_channels)
self.stage3, pre_stage_channels = self._make_stage( self.stage3, pre_stage_channels = self._make_stage(
self.stage3_cfg, num_channels) self.stage3_cfg, num_channels)
last_inp_channels = np.int(np.sum(pre_stage_channels)) + 256 last_inp_channels = int(np.sum(pre_stage_channels)) + 256
self.redc_layer = nn.Sequential( self.redc_layer = nn.Sequential(
nn.Conv2d( nn.Conv2d(
in_channels=last_inp_channels, in_channels=last_inp_channels,

View File

@@ -1,25 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.base import BaseModel
from easycv.utils.ms_utils import EasyCVMeta
from modelscope.models.base import TorchModel
class EasyCVBaseModel(BaseModel, TorchModel):
"""Base model for EasyCV."""
def __init__(self, model_dir=None, args=(), kwargs={}):
kwargs.pop(EasyCVMeta.ARCH, None) # pop useless keys
BaseModel.__init__(self)
TorchModel.__init__(self, model_dir=model_dir)
def forward(self, img, mode='train', **kwargs):
if self.training:
losses = self.forward_train(img, **kwargs)
loss, log_vars = self._parse_losses(losses)
return dict(loss=loss, log_vars=log_vars)
else:
return self.forward_test(img, **kwargs)
def __call__(self, *args, **kwargs):
return self.forward(*args, **kwargs)

View File

@@ -1,20 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .face_2d_keypoints_align import Face2DKeypoints
else:
_import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.face.face_keypoint import FaceKeypoint
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
FaceKeypoint.__init__(self, *args, **kwargs)

View File

@@ -82,7 +82,7 @@ class FaceLandmark:
bbox[2] = center[0] + one_edge // 2 bbox[2] = center[0] + one_edge // 2
bbox[3] = center[1] + one_edge // 2 bbox[3] = center[1] + one_edge // 2
bbox = bbox.astype(np.int) bbox = bbox.astype(int)
crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :] crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
h, w, _ = crop_image.shape h, w, _ = crop_image.shape
crop_image = cv2.resize(crop_image, crop_image = cv2.resize(crop_image,

View File

@@ -1,20 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .hand_2d_keypoints import Hand2dKeyPoints
else:
_import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose import TopDown
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
TopDown.__init__(self, *args, **kwargs)

View File

@@ -31,7 +31,7 @@ class human_segmenter(object):
img = np.dstack((img, img, img)) img = np.dstack((img, img, img))
elif img.shape[2] == 4: elif img.shape[2] == 4:
img = img[:, :, :3] img = img[:, :, :3]
img = img.astype(np.float) img = img.astype(float)
return img return img
def run(self, img): def run(self, img):

View File

@@ -69,8 +69,8 @@ def eval_grid(coords,
num_samples=512 * 512 * 512): num_samples=512 * 512 * 512):
resolution = coords.shape[1:4] resolution = coords.shape[1:4]
sdf = np.zeros(resolution) sdf = np.zeros(resolution)
dirty = np.ones(resolution, dtype=np.bool) dirty = np.ones(resolution, dtype=bool)
grid_mask = np.zeros(resolution, dtype=np.bool) grid_mask = np.zeros(resolution, dtype=bool)
reso = resolution[0] // init_resolution reso = resolution[0] // init_resolution
while reso > 0: while reso > 0:

View File

@@ -1,17 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose.top_down import TopDown
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.human_wholebody_keypoint,
module_name=Models.human_wholebody_keypoint)
class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
TopDown.__init__(self, *args, **kwargs)

View File

@@ -163,7 +163,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
os.path.join(split_dir, os.path.join(split_dir,
'box_{}shot_{}_train.txt'.format(shot, 'box_{}shot_{}_train.txt'.format(shot,
cls))) as f: cls))) as f:
fileids_ = np.loadtxt(f, dtype=np.str).tolist() fileids_ = np.loadtxt(f, dtype=np.str_).tolist()
if isinstance(fileids_, str): if isinstance(fileids_, str):
fileids_ = [fileids_] fileids_ = [fileids_]
fileids_ = [ fileids_ = [
@@ -219,7 +219,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
with PathManager.open( with PathManager.open(
os.path.join(root, dirname, 'ImageSets', 'Main', os.path.join(root, dirname, 'ImageSets', 'Main',
split + '.txt')) as f: split + '.txt')) as f:
fileids = np.loadtxt(f, dtype=np.str) fileids = np.loadtxt(f, dtype=np.str_)
for fileid in fileids: for fileid in fileids:
anno_file = os.path.join(root, dirname, 'Annotations', anno_file = os.path.join(root, dirname, 'Annotations',

View File

@@ -8,10 +8,12 @@ if TYPE_CHECKING:
from .maskdino_swin import MaskDINOSwin from .maskdino_swin import MaskDINOSwin
from .model import CascadeMaskRCNNSwinModel from .model import CascadeMaskRCNNSwinModel
from .maskdino_model import MaskDINOSwinModel from .maskdino_model import MaskDINOSwinModel
from .fastinst_model import FastInst
from .postprocess_utils import get_img_ins_seg_result, get_maskdino_ins_seg_result from .postprocess_utils import get_img_ins_seg_result, get_maskdino_ins_seg_result
else: else:
_import_structure = { _import_structure = {
'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'], 'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
'fastinst_model': ['FastInst'],
'maskdino_swin': ['MaskDINOSwin'], 'maskdino_swin': ['MaskDINOSwin'],
'model': ['CascadeMaskRCNNSwinModel'], 'model': ['CascadeMaskRCNNSwinModel'],
'maskdino_model': ['MaskDINOSwinModel'], 'maskdino_model': ['MaskDINOSwinModel'],

View File

@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING: if TYPE_CHECKING:
from .swin_transformer import SwinTransformer from .swin_transformer import SwinTransformer
from .swin_transformer import D2SwinTransformer from .swin_transformer import D2SwinTransformer
from .resnet import build_resnet_backbone
else: else:
_import_structure = { _import_structure = {
'swin_transformer': ['SwinTransformer', 'D2SwinTransformer'], 'swin_transformer': ['SwinTransformer', 'D2SwinTransformer'],
'resnet': ['build_resnet_backbone']
} }
import sys import sys

View File

@@ -0,0 +1,114 @@
# Part of the implementation is borrowed and modified from Detectron2, publicly available at
# https://github.com/facebookresearch/detectron2/blob/main/projects/DeepLab/deeplab/resnet.py
import torch.nn.functional as F
from torch import nn
from modelscope.models.cv.image_human_parsing.backbone.deeplab_resnet import (
BottleneckBlock, DeeplabResNet, get_norm)
from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
Conv2d
class BasicStem(nn.Module):
"""
The standard ResNet stem (layers before the first residual block),
with a conv, relu and max_pool.
"""
def __init__(self, in_channels=3, out_channels=64, norm='BN'):
"""
Args:
norm (str or callable): norm after the first conv layer.
See :func:`layers.get_norm` for supported format.
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.stride = 4
self.conv1 = Conv2d(
in_channels,
out_channels,
kernel_size=7,
stride=2,
padding=3,
bias=False,
norm=get_norm(norm, out_channels),
)
def forward(self, x):
x = self.conv1(x)
x = F.relu_(x)
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
return x
def build_resnet_backbone(out_features, depth, num_groups, width_per_group,
norm, stem_out_channels, res2_out_channels,
stride_in_1x1, res4_dilation, res5_dilation,
res5_multi_grid, input_shape):
stem = BasicStem(
in_channels=input_shape['channels'],
out_channels=stem_out_channels,
norm=norm)
bottleneck_channels = num_groups * width_per_group
in_channels = stem_out_channels
out_channels = res2_out_channels
assert res4_dilation in {
1, 2
}, 'res4_dilation cannot be {}.'.format(res4_dilation)
assert res5_dilation in {
1, 2, 4
}, 'res5_dilation cannot be {}.'.format(res5_dilation)
if res4_dilation == 2:
# Always dilate res5 if res4 is dilated.
assert res5_dilation == 4
num_blocks_per_stage = {
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3]
}[depth]
stages = []
out_stage_idx = [{
'res2': 2,
'res3': 3,
'res4': 4,
'res5': 5
}[f] for f in out_features]
max_stage_idx = max(out_stage_idx)
for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
if stage_idx == 4:
dilation = res4_dilation
elif stage_idx == 5:
dilation = res5_dilation
else:
dilation = 1
first_stride = 1 if idx == 0 or dilation > 1 else 2
stride_per_block = [first_stride]
stride_per_block += [1] * (num_blocks_per_stage[idx] - 1)
stage_kargs = {
'num_blocks': num_blocks_per_stage[idx],
'stride_per_block': stride_per_block,
'in_channels': in_channels,
'out_channels': out_channels,
'norm': norm,
'bottleneck_channels': bottleneck_channels,
'stride_in_1x1': stride_in_1x1,
'dilation': dilation,
'num_groups': num_groups,
'block_class': BottleneckBlock
}
if stage_idx == 5:
stage_kargs.pop('dilation')
stage_kargs['dilation_per_block'] = [
dilation * mg for mg in res5_multi_grid
]
blocks = DeeplabResNet.make_stage(**stage_kargs)
in_channels = out_channels
out_channels *= 2
bottleneck_channels *= 2
stages.append(blocks)
return DeeplabResNet(stem, stages, out_features=out_features)
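# Illustrative ResNet-50 configuration for build_resnet_backbone (the values
# below are Detectron2-style defaults, not taken from a ModelScope config):
backbone = build_resnet_backbone(
    out_features=['res3', 'res4', 'res5'],
    depth=50,
    num_groups=1,
    width_per_group=64,
    norm='BN',
    stem_out_channels=64,
    res2_out_channels=256,
    stride_in_1x1=True,
    res4_dilation=1,
    res5_dilation=1,
    res5_multi_grid=[1, 2, 4],
    input_shape={'channels': 3})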

View File

@@ -0,0 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

View File

@@ -0,0 +1,351 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import math
import torch
from torch import nn
from torch.nn import functional as F
from modelscope.models.cv.image_colorization.ddcolor.utils.transformer_utils import (
MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer)
class QueryProposal(nn.Module):
def __init__(self, num_features, num_queries, num_classes):
super().__init__()
self.topk = num_queries
self.num_classes = num_classes
self.conv_proposal_cls_logits = nn.Sequential(
nn.Conv2d(
num_features, num_features, kernel_size=3, stride=1,
padding=1),
nn.ReLU(inplace=True),
nn.Conv2d(
num_features,
num_classes + 1,
kernel_size=1,
stride=1,
padding=0),
)
@torch.no_grad()
def compute_coordinates(self, x):
h, w = x.size(2), x.size(3)
y_loc = torch.linspace(0, 1, h, device=x.device)
x_loc = torch.linspace(0, 1, w, device=x.device)
y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
locations = torch.stack([x_loc, y_loc], 0).unsqueeze(0)
return locations
def seek_local_maximum(self, x, epsilon=1e-6):
"""
inputs:
x: torch.tensor, shape [b, c, h, w]
return:
torch.tensor, shape [b, c, h, w]
"""
x_pad = F.pad(x, (1, 1, 1, 1), 'constant', 0)
# top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
maximum = (x >= x_pad[:, :, :-2, 1:-1]) & \
(x >= x_pad[:, :, 2:, 1:-1]) & \
(x >= x_pad[:, :, 1:-1, :-2]) & \
(x >= x_pad[:, :, 1:-1, 2:]) & \
(x >= x_pad[:, :, :-2, :-2]) & \
(x >= x_pad[:, :, :-2, 2:]) & \
(x >= x_pad[:, :, 2:, :-2]) & \
(x >= x_pad[:, :, 2:, 2:]) & \
(x >= epsilon)
return maximum.to(x)
def forward(self, x, pos_embeddings):
proposal_cls_logits = self.conv_proposal_cls_logits(x) # b, c, h, w
proposal_cls_probs = proposal_cls_logits.softmax(dim=1) # b, c, h, w
proposal_cls_one_hot = F.one_hot(
proposal_cls_probs[:, :-1, :, :].max(1)[1],
num_classes=self.num_classes + 1).permute(0, 3, 1, 2) # b, c, h, w
proposal_cls_probs = proposal_cls_probs.mul(proposal_cls_one_hot)
proposal_local_maximum_map = self.seek_local_maximum(
proposal_cls_probs) # b, c, h, w
proposal_cls_probs = proposal_cls_probs + proposal_local_maximum_map # b, c, h, w
# top-k indices
topk_indices = torch.topk(
proposal_cls_probs[:, :-1, :, :].flatten(2).max(1)[0],
self.topk,
dim=1)[1] # b, q
topk_indices = topk_indices.unsqueeze(1) # b, 1, q
# topk queries
topk_proposals = torch.gather(
x.flatten(2), dim=2, index=topk_indices.repeat(1, x.shape[1],
1)) # b, c, q
pos_embeddings = pos_embeddings.repeat(x.shape[0], 1, 1, 1).flatten(2)
topk_pos_embeddings = torch.gather(
pos_embeddings,
dim=2,
index=topk_indices.repeat(1, pos_embeddings.shape[1],
1)) # b, c, q
if self.training:
locations = self.compute_coordinates(x).repeat(x.shape[0], 1, 1, 1)
topk_locations = torch.gather(
locations.flatten(2),
dim=2,
index=topk_indices.repeat(1, locations.shape[1], 1))
topk_locations = topk_locations.transpose(-1, -2) # b, q, 2
else:
topk_locations = None
return topk_proposals, topk_pos_embeddings, topk_locations, proposal_cls_logits
class FastInstDecoder(nn.Module):
def __init__(self, in_channels, *, num_classes: int, hidden_dim: int,
num_queries: int, num_aux_queries: int, nheads: int,
dim_feedforward: int, dec_layers: int, pre_norm: bool,
mask_dim: int):
"""
Args:
in_channels: channels of the input features
num_classes: number of classes
hidden_dim: Transformer feature dimension
num_queries: number of queries
num_aux_queries: number of auxiliary queries
nheads: number of heads
dim_feedforward: feature dimension in feedforward network
dec_layers: number of Transformer decoder layers
pre_norm: whether to use pre-LayerNorm or not
mask_dim: mask feature dimension
"""
super().__init__()
self.num_heads = nheads
self.num_layers = dec_layers
self.num_queries = num_queries
self.num_aux_queries = num_aux_queries
self.num_classes = num_classes
meta_pos_size = int(round(math.sqrt(self.num_queries)))
self.meta_pos_embed = nn.Parameter(
torch.empty(1, hidden_dim, meta_pos_size, meta_pos_size))
if num_aux_queries > 0:
self.empty_query_features = nn.Embedding(num_aux_queries,
hidden_dim)
self.empty_query_pos_embed = nn.Embedding(num_aux_queries,
hidden_dim)
self.query_proposal = QueryProposal(hidden_dim, num_queries,
num_classes)
self.transformer_query_cross_attention_layers = nn.ModuleList()
self.transformer_query_self_attention_layers = nn.ModuleList()
self.transformer_query_ffn_layers = nn.ModuleList()
self.transformer_mask_cross_attention_layers = nn.ModuleList()
self.transformer_mask_ffn_layers = nn.ModuleList()
for idx in range(self.num_layers):
self.transformer_query_cross_attention_layers.append(
CrossAttentionLayer(
d_model=hidden_dim,
nhead=nheads,
dropout=0.0,
normalize_before=pre_norm))
self.transformer_query_self_attention_layers.append(
SelfAttentionLayer(
d_model=hidden_dim,
nhead=nheads,
dropout=0.0,
normalize_before=pre_norm))
self.transformer_query_ffn_layers.append(
FFNLayer(
d_model=hidden_dim,
dim_feedforward=dim_feedforward,
dropout=0.0,
normalize_before=pre_norm))
self.transformer_mask_cross_attention_layers.append(
CrossAttentionLayer(
d_model=hidden_dim,
nhead=nheads,
dropout=0.0,
normalize_before=pre_norm))
self.transformer_mask_ffn_layers.append(
FFNLayer(
d_model=hidden_dim,
dim_feedforward=dim_feedforward,
dropout=0.0,
normalize_before=pre_norm))
self.decoder_query_norm_layers = nn.ModuleList()
self.class_embed_layers = nn.ModuleList()
self.mask_embed_layers = nn.ModuleList()
self.mask_features_layers = nn.ModuleList()
for idx in range(self.num_layers + 1):
self.decoder_query_norm_layers.append(nn.LayerNorm(hidden_dim))
self.class_embed_layers.append(
MLP(hidden_dim, hidden_dim, num_classes + 1, 3))
self.mask_embed_layers.append(
MLP(hidden_dim, hidden_dim, mask_dim, 3))
self.mask_features_layers.append(nn.Linear(hidden_dim, mask_dim))
def forward(self, x, mask_features, targets=None):
bs = x[0].shape[0]
proposal_size = x[1].shape[-2:]
pixel_feature_size = x[2].shape[-2:]
pixel_pos_embeds = F.interpolate(
self.meta_pos_embed,
size=pixel_feature_size,
mode='bilinear',
align_corners=False)
proposal_pos_embeds = F.interpolate(
self.meta_pos_embed,
size=proposal_size,
mode='bilinear',
align_corners=False)
pixel_features = x[2].flatten(2).permute(2, 0, 1)
pixel_pos_embeds = pixel_pos_embeds.flatten(2).permute(2, 0, 1)
query_features, query_pos_embeds, query_locations, proposal_cls_logits = self.query_proposal(
x[1], proposal_pos_embeds)
query_features = query_features.permute(2, 0, 1)
query_pos_embeds = query_pos_embeds.permute(2, 0, 1)
if self.num_aux_queries > 0:
aux_query_features = self.empty_query_features.weight.unsqueeze(
1).repeat(1, bs, 1)
aux_query_pos_embed = self.empty_query_pos_embed.weight.unsqueeze(
1).repeat(1, bs, 1)
query_features = torch.cat([query_features, aux_query_features],
dim=0)
query_pos_embeds = torch.cat(
[query_pos_embeds, aux_query_pos_embed], dim=0)
outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
query_features,
pixel_features,
pixel_feature_size,
-1,
return_attn_mask=True)
predictions_class = [outputs_class]
predictions_mask = [outputs_mask]
predictions_matching_index = [None]
query_feature_memory = [query_features]
pixel_feature_memory = [pixel_features]
for i in range(self.num_layers):
query_features, pixel_features = self.forward_one_layer(
query_features, pixel_features, query_pos_embeds,
pixel_pos_embeds, attn_mask, i)
if i < self.num_layers - 1:
outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
query_features,
pixel_features,
pixel_feature_size,
i,
return_attn_mask=True,
)
else:
outputs_class, outputs_mask, _, matching_indices, gt_attn_mask = self.forward_prediction_heads(
query_features,
pixel_features,
pixel_feature_size,
i,
)
predictions_class.append(outputs_class)
predictions_mask.append(outputs_mask)
predictions_matching_index.append(None)
query_feature_memory.append(query_features)
pixel_feature_memory.append(pixel_features)
out = {
'proposal_cls_logits':
proposal_cls_logits,
'query_locations':
query_locations,
'pred_logits':
predictions_class[-1],
'pred_masks':
predictions_mask[-1],
'pred_indices':
predictions_matching_index[-1],
'aux_outputs':
self._set_aux_loss(predictions_class, predictions_mask,
predictions_matching_index, query_locations)
}
return out
def forward_one_layer(self, query_features, pixel_features,
query_pos_embeds, pixel_pos_embeds, attn_mask, i):
pixel_features = self.transformer_mask_cross_attention_layers[i](
pixel_features,
query_features,
query_pos=pixel_pos_embeds,
pos=query_pos_embeds)
pixel_features = self.transformer_mask_ffn_layers[i](pixel_features)
query_features = self.transformer_query_cross_attention_layers[i](
query_features,
pixel_features,
memory_mask=attn_mask,
query_pos=query_pos_embeds,
pos=pixel_pos_embeds)
query_features = self.transformer_query_self_attention_layers[i](
query_features, query_pos=query_pos_embeds)
query_features = self.transformer_query_ffn_layers[i](query_features)
return query_features, pixel_features
def forward_prediction_heads(self,
query_features,
pixel_features,
pixel_feature_size,
idx_layer,
return_attn_mask=False,
return_gt_attn_mask=False,
targets=None,
query_locations=None):
decoder_query_features = self.decoder_query_norm_layers[idx_layer + 1](
query_features[:self.num_queries])
decoder_query_features = decoder_query_features.transpose(0, 1)
if idx_layer + 1 == self.num_layers:
outputs_class = self.class_embed_layers[idx_layer + 1](
decoder_query_features)
else:
outputs_class = None
outputs_mask_embed = self.mask_embed_layers[idx_layer + 1](
decoder_query_features)
outputs_mask_features = self.mask_features_layers[idx_layer + 1](
pixel_features.transpose(0, 1))
outputs_mask = torch.einsum('bqc,blc->bql', outputs_mask_embed,
outputs_mask_features)
outputs_mask = outputs_mask.reshape(-1, self.num_queries,
*pixel_feature_size)
if return_attn_mask:
# outputs_mask.shape: b, q, h, w
attn_mask = F.pad(outputs_mask,
(0, 0, 0, 0, 0, self.num_aux_queries),
'constant', 1)
attn_mask = (attn_mask < 0.).flatten(2) # b, q, hw
invalid_query = attn_mask.all(-1, keepdim=True) # b, q, 1
attn_mask = (~invalid_query) & attn_mask # b, q, hw
attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1,
1).flatten(0, 1)
attn_mask = attn_mask.detach()
else:
attn_mask = None
matching_indices = None
gt_attn_mask = None
return outputs_class, outputs_mask, attn_mask, matching_indices, gt_attn_mask
@torch.jit.unused
def _set_aux_loss(self, outputs_class, outputs_seg_masks, output_indices,
output_query_locations):
return [{
'query_locations': output_query_locations,
'pred_logits': a,
'pred_masks': b,
'pred_matching_indices': c
} for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1],
output_indices[:-1])]
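# Self-contained illustration of QueryProposal.seek_local_maximum: a cell is
# kept when it is >= its eight spatial neighbours (zero-padded border) and
# >= epsilon; adding the resulting map boosts local peaks so torch.topk
# prefers them when selecting query proposals.
import torch
import torch.nn.functional as F


def local_maximum(x, epsilon=1e-6):
    x_pad = F.pad(x, (1, 1, 1, 1), 'constant', 0)
    keep = ((x >= x_pad[:, :, :-2, 1:-1]) & (x >= x_pad[:, :, 2:, 1:-1])
            & (x >= x_pad[:, :, 1:-1, :-2]) & (x >= x_pad[:, :, 1:-1, 2:])
            & (x >= x_pad[:, :, :-2, :-2]) & (x >= x_pad[:, :, :-2, 2:])
            & (x >= x_pad[:, :, 2:, :-2]) & (x >= x_pad[:, :, 2:, 2:])
            & (x >= epsilon))
    return keep.to(x)


probs = torch.rand(1, 1, 5, 5)       # toy per-class probability map
peaks = local_maximum(probs)         # 1.0 at local maxima, 0.0 elsewhere
boosted = probs + peaks              # peak cells gain +1, ranking first in topk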

View File

@@ -0,0 +1,180 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import logging
from typing import Callable, Optional, Union
import torch
from torch import nn
from torch.nn import functional as F
from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
Conv2d
# This is a modified FPN decoder.
class BaseFPN(nn.Module):
def __init__(
self,
input_shape,
*,
convs_dim: int,
mask_dim: int,
norm: Optional[Union[str, Callable]] = None,
):
"""
Args:
input_shape: shapes (channels and stride) of the input features
convs_dim: number of output channels for the intermediate conv layers.
mask_dim: number of output channels for the final conv layer.
norm (str or callable): normalization for all conv layers
"""
super().__init__()
input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride'])
self.in_features = [k for k, v in input_shape
] # starting from "res3" to "res5"
feature_channels = [v['channels'] for k, v in input_shape]
lateral_convs = []
output_convs = []
use_bias = norm == ''
for idx, in_channels in enumerate(feature_channels):
lateral_norm = nn.GroupNorm(32, convs_dim)
output_norm = nn.GroupNorm(32, convs_dim)
lateral_conv = Conv2d(
in_channels,
convs_dim,
kernel_size=1,
bias=use_bias,
norm=lateral_norm)
output_conv = Conv2d(
convs_dim,
convs_dim,
kernel_size=3,
stride=1,
padding=1,
bias=use_bias,
norm=output_norm,
activation=F.relu,
)
self.add_module('adapter_{}'.format(idx + 1), lateral_conv)
self.add_module('layer_{}'.format(idx + 1), output_conv)
lateral_convs.append(lateral_conv)
output_convs.append(output_conv)
# Place convs into top-down order (from low to high resolution)
# to make the top-down computation in forward clearer.
self.lateral_convs = lateral_convs[::-1]
self.output_convs = output_convs[::-1]
self.convs_dim = convs_dim
self.num_feature_levels = 3 # always use 3 scales
def forward_features(self, features):
multi_scale_features = []
num_cur_levels = 0
# Reverse feature maps into top-down order (from low to high resolution)
for idx, f in enumerate(self.in_features[::-1]):
x = features[f]
lateral_conv = self.lateral_convs[idx]
output_conv = self.output_convs[idx]
if idx == 0:
y = lateral_conv(x)
else:
cur_fpn = lateral_conv(x)
y = cur_fpn + F.interpolate(
y,
size=cur_fpn.shape[-2:],
mode='bilinear',
align_corners=False)
y = output_conv(y)
if num_cur_levels < self.num_feature_levels:
multi_scale_features.append(y)
num_cur_levels += 1
return None, multi_scale_features
def forward(self, features, targets=None):
logger = logging.getLogger(__name__)
logger.warning(
'Calling forward() may cause unpredictable behavior of the PixelDecoder module.'
)
return self.forward_features(features)
class PyramidPoolingModule(nn.Module):
def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)):
super().__init__()
self.stages = []
self.stages = nn.ModuleList(
[self._make_stage(in_channels, channels, size) for size in sizes])
self.bottleneck = Conv2d(in_channels + len(sizes) * channels,
in_channels, 1)
def _make_stage(self, features, out_features, size):
prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
conv = Conv2d(features, out_features, 1)
return nn.Sequential(prior, conv)
def forward(self, feats):
h, w = feats.size(2), feats.size(3)
priors = [
F.interpolate(
input=F.relu_(stage(feats)),
size=(h, w),
mode='bilinear',
align_corners=False) for stage in self.stages
] + [feats]
out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
return out
class PyramidPoolingModuleFPN(BaseFPN):
def __init__(
self,
input_shape,
*,
convs_dim: int,
mask_dim: int,
norm: Optional[Union[str, Callable]] = None,
):
"""
NOTE: this interface is experimental.
Args:
input_shape: shapes (channels and stride) of the input features
convs_dim: number of output channels for the intermediate conv layers.
mask_dim: number of output channels for the final conv layer.
norm (str or callable): normalization for all conv layers
"""
super().__init__(
input_shape, convs_dim=convs_dim, mask_dim=mask_dim, norm=norm)
self.ppm = PyramidPoolingModule(convs_dim, convs_dim // 4)
def forward_features(self, features):
multi_scale_features = []
num_cur_levels = 0
# Reverse feature maps into top-down order (from low to high resolution)
for idx, f in enumerate(self.in_features[::-1]):
x = features[f]
lateral_conv = self.lateral_convs[idx]
output_conv = self.output_convs[idx]
if idx == 0:
y = self.ppm(lateral_conv(x))
else:
cur_fpn = lateral_conv(x)
y = cur_fpn + F.interpolate(
y,
size=cur_fpn.shape[-2:],
mode='bilinear',
align_corners=False)
y = output_conv(y)
if num_cur_levels < self.num_feature_levels:
multi_scale_features.append(y)
num_cur_levels += 1
return None, multi_scale_features
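# Minimal sketch of the encoder's expected inputs; the channel/stride values
# are illustrative and would normally come from the backbone's output_shape():
import torch

shapes = {
    'res3': {'channels': 512, 'stride': 8},
    'res4': {'channels': 1024, 'stride': 16},
    'res5': {'channels': 2048, 'stride': 32},
}
encoder = PyramidPoolingModuleFPN(
    input_shape=shapes, convs_dim=256, mask_dim=256, norm='GN')
feats = {
    'res3': torch.randn(1, 512, 64, 64),
    'res4': torch.randn(1, 1024, 32, 32),
    'res5': torch.randn(1, 2048, 16, 16),
}
_, multi_scale = encoder.forward_features(feats)  # three maps, lowest resolution first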

View File

@@ -0,0 +1,221 @@
# Part of implementation is borrowed and modified from Mask2Former, publicly available at
# https://github.com/facebookresearch/Mask2Former.
import os
from typing import Any, Dict, List
import torch
import torch.nn as nn
import torch.nn.functional as F
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.image_instance_segmentation.maskdino_swin import \
ImageList
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .backbones import build_resnet_backbone
from .fastinst.fastinst_decoder import FastInstDecoder
from .fastinst.fastinst_encoder import PyramidPoolingModuleFPN
logger = get_logger()
@MODELS.register_module(Tasks.image_segmentation, module_name=Models.fastinst)
class FastInst(TorchModel):
def __init__(self,
model_dir,
backbone=None,
encoder=None,
decoder=None,
pretrained=None,
classes=None,
**kwargs):
"""
FastInst: a simple query-based model for real-time instance segmentation.
Args:
backbone (dict): backbone config.
encoder (dict): encoder config.
decoder (dict): decoder config.
pretrained (bool): whether to use pretrained model
classes (list): class names
"""
super(FastInst, self).__init__(model_dir, **kwargs)
self.backbone = build_resnet_backbone(
**backbone, input_shape={'channels': 3})
in_features = encoder.pop('in_features')
input_shape = {
k: v
for k, v in self.backbone.output_shape().items()
if k in in_features
}
encoder = PyramidPoolingModuleFPN(input_shape=input_shape, **encoder)
decoder = FastInstDecoder(in_channels=encoder.convs_dim, **decoder)
self.sem_seg_head = FastInstHead(
pixel_decoder=encoder, transformer_predictor=decoder)
self.num_classes = decoder.num_classes
self.num_queries = decoder.num_queries
self.size_divisibility = 32
self.register_buffer(
'pixel_mean',
torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False)
self.register_buffer(
'pixel_std',
torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False)
self.classes = classes
self.test_topk_per_image = 100
if pretrained:
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
logger.info(f'loading model from {model_path}')
weight = torch.load(model_path, map_location='cpu')['model']
tgt_weight = self.state_dict()
for name in list(weight.keys()):
if name in tgt_weight:
load_size = weight[name].size()
tgt_size = tgt_weight[name].size()
mis_match = False
if len(load_size) != len(tgt_size):
mis_match = True
else:
for n1, n2 in zip(load_size, tgt_size):
if n1 != n2:
mis_match = True
break
if mis_match:
logger.info(
f'size mismatch for {name} '
f'({load_size} -> {tgt_size}), skip loading.')
del weight[name]
else:
logger.info(
f'{name} doesn\'t exist in current model, skip loading.'
)
self.load_state_dict(weight, strict=False)
logger.info('load model done')
def forward(self, batched_inputs: List[dict]) -> Dict[str, Any]:
images = [x['image'].to(self.device) for x in batched_inputs]
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
images = ImageList.from_tensors(images, self.size_divisibility)
features = self.backbone(images.tensor)
outputs = self.sem_seg_head(features)
return dict(
outputs=outputs, batched_inputs=batched_inputs, images=images)
def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
outputs = input['outputs']
batched_inputs = input['batched_inputs']
images = input['images']
if self.training:
raise NotImplementedError
else:
mask_cls_results = outputs['pred_logits'] # (B, Q, C+1)
mask_pred_results = outputs['pred_masks'] # (B, Q, H, W)
# upsample masks
mask_pred_results = F.interpolate(
mask_pred_results,
size=(images.tensor.shape[-2], images.tensor.shape[-1]),
mode='bilinear',
align_corners=False,
)
del outputs
processed_results = []
for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
mask_cls_results, mask_pred_results, batched_inputs,
images.image_sizes):
height = input_per_image.get('height', image_size[0])
width = input_per_image.get('width', image_size[1])
processed_results.append({}) # for each image
mask_pred_result = self.sem_seg_postprocess(
mask_pred_result, image_size, height, width)
mask_cls_result = mask_cls_result.to(mask_pred_result)
instance_r = self.instance_inference(mask_cls_result,
mask_pred_result)
processed_results[-1]['instances'] = instance_r
return dict(eval_result=processed_results)
@property
def device(self):
return self.pixel_mean.device
def sem_seg_postprocess(self, result, img_size, output_height,
output_width):
result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
result = F.interpolate(
result,
size=(output_height, output_width),
mode='bilinear',
align_corners=False)[0]
return result
def instance_inference(self, mask_cls, mask_pred):
# mask_pred is already processed to have the same shape as original input
image_size = mask_pred.shape[-2:]
# [Q, K]
scores = F.softmax(mask_cls, dim=-1)[:, :-1]
labels = torch.arange(
self.num_classes,
device=self.device).unsqueeze(0).repeat(self.num_queries,
1).flatten(0, 1)
scores_per_image, topk_indices = scores.flatten(0, 1).topk(
self.test_topk_per_image, sorted=False)
labels_per_image = labels[topk_indices]
topk_indices = topk_indices // self.num_classes
mask_pred = mask_pred[topk_indices]
result = {'image_size': image_size}
# mask (before sigmoid)
mask_pred_sigmoid = mask_pred.sigmoid()
result['pred_masks'] = (mask_pred_sigmoid > 0.5).float()
# calculate average mask prob
mask_scores_per_image = (mask_pred_sigmoid.flatten(1)
* result['pred_masks'].flatten(1)).sum(1) / (
result['pred_masks'].flatten(1).sum(1)
+ 1e-6)
result['scores'] = scores_per_image * mask_scores_per_image
result['pred_classes'] = labels_per_image
return result
class FastInstHead(nn.Module):
def __init__(
self,
*,
pixel_decoder: nn.Module,
# extra parameters
transformer_predictor: nn.Module):
"""
NOTE: this interface is experimental.
Args:
pixel_decoder: the pixel decoder module
transformer_predictor: the transformer decoder that makes prediction
"""
super().__init__()
self.pixel_decoder = pixel_decoder
self.predictor = transformer_predictor
def forward(self, features, targets=None):
return self.layers(features, targets)
def layers(self, features, targets=None):
mask_features, multi_scale_features = self.pixel_decoder.forward_features(
features)
predictions = self.predictor(multi_scale_features, mask_features,
targets)
return predictions
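# Toy illustration of the scoring rule in instance_inference above: the final
# score multiplies the class probability by the mean mask probability inside
# the binarized mask.
import torch

mask_logits = torch.randn(3, 8, 8)            # 3 kept queries, 8x8 masks
cls_scores = torch.tensor([0.9, 0.7, 0.5])    # per-query class probabilities
probs = mask_logits.sigmoid()
binary = (probs > 0.5).float()
mask_scores = (probs.flatten(1) * binary.flatten(1)).sum(1) / (binary.flatten(1).sum(1) + 1e-6)
final_scores = cls_scores * mask_scores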

View File

@@ -108,16 +108,16 @@ def get_img_ins_seg_result(img_seg_result=None,
for seg_result in img_seg_result: for seg_result in img_seg_result:
box = [ box = [
np.int(seg_result[0]), int(seg_result[0]),
np.int(seg_result[1]), int(seg_result[1]),
np.int(seg_result[2]), int(seg_result[2]),
np.int(seg_result[3]) int(seg_result[3])
] ]
score = np.float(seg_result[4]) score = float(seg_result[4])
category = seg_result[5] category = seg_result[5]
mask = np.array(seg_result[6], order='F', dtype='uint8') mask = np.array(seg_result[6], order='F', dtype='uint8')
mask = mask.astype(np.float) mask = mask.astype(float)
results_dict[OutputKeys.BOXES].append(box) results_dict[OutputKeys.BOXES].append(box)
results_dict[OutputKeys.MASKS].append(mask) results_dict[OutputKeys.MASKS].append(mask)

View File

@@ -382,7 +382,7 @@ def processing_single_scene(args):
points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1], points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1],
points3d[p3d_id].xyz[2], 1 points3d[p3d_id].xyz[2], 1
]) ])
zs.append(np.asscalar(transformed[2])) zs.append(transformed[2].item())
zs_sorted = sorted(zs) zs_sorted = sorted(zs)
# relaxed depth range # relaxed depth range
max_ratio = 0.1 max_ratio = 0.1

View File

@@ -40,7 +40,7 @@ def read_mask(filename):
# save a binary mask # save a binary mask
def save_mask(filename, mask): def save_mask(filename, mask):
assert mask.dtype == np.bool assert mask.dtype == bool
mask = mask.astype(np.uint8) * 255 mask = mask.astype(np.uint8) * 255
Image.fromarray(mask).save(filename) Image.fromarray(mask).save(filename)

View File

@@ -5,7 +5,6 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING: if TYPE_CHECKING:
from .panseg_model import SwinLPanopticSegmentation from .panseg_model import SwinLPanopticSegmentation
from .r50_panseg_model import R50PanopticSegmentation
else: else:
_import_structure = { _import_structure = {

View File

@@ -1,18 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.segmentation import Mask2Former
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.image_segmentation,
module_name=Models.r50_panoptic_segmentation)
class R50PanopticSegmentation(EasyCVBaseModel, Mask2Former):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
Mask2Former.__init__(self, *args, **kwargs)

View File

@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.segmentation import EncoderDecoder
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.image_segmentation, module_name=Models.segformer)
class Segformer(EasyCVBaseModel, EncoderDecoder):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
EncoderDecoder.__init__(self, *args, **kwargs)

View File

@@ -60,7 +60,7 @@ class SemanticSegmentation(TorchModel):
ids = ids[legal_indices] ids = ids[legal_indices]
segms = (semantic_result[None] == ids[:, None, None]) segms = (semantic_result[None] == ids[:, None, None])
masks = [it.astype(np.int) for it in segms] masks = [it.astype(int) for it in segms]
labels_txt = np.array(self.CLASSES)[ids].tolist() labels_txt = np.array(self.CLASSES)[ids].tolist()
results = { results = {

View File

@@ -458,7 +458,7 @@ class HrnetBackBone(nn.Module):
self.stage4, pre_stage_channels = self._make_stage( self.stage4, pre_stage_channels = self._make_stage(
self.stage4_cfg, num_channels, multi_scale_output=True) self.stage4_cfg, num_channels, multi_scale_output=True)
self.backbone_last_inp_channels = np.int(np.sum(pre_stage_channels)) self.backbone_last_inp_channels = int(np.sum(pre_stage_channels))
def _make_transition_layer(self, num_channels_pre_layer, def _make_transition_layer(self, num_channels_pre_layer,
num_channels_cur_layer): num_channels_cur_layer):

View File

@@ -259,7 +259,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
num_channels = [64, last_inp_channels] num_channels = [64, last_inp_channels]
self.stage_super, super_stage_channels = self._make_stage( self.stage_super, super_stage_channels = self._make_stage(
self.super_dict, num_channels) self.super_dict, num_channels)
last_inp_channels = np.int(np.sum(super_stage_channels)) last_inp_channels = int(np.sum(super_stage_channels))
if self.is_contain_aspp: if self.is_contain_aspp:
aspp_param = kwargs['aspp'] aspp_param = kwargs['aspp']
@@ -372,7 +372,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
num_channels = [64, ocr_mid_channels] num_channels = [64, ocr_mid_channels]
self.stage_super, super_stage_channels = self._make_stage( self.stage_super, super_stage_channels = self._make_stage(
self.super_dict, num_channels) self.super_dict, num_channels)
last_inp_channels = np.int(np.sum(super_stage_channels)) last_inp_channels = int(np.sum(super_stage_channels))
self.cls_head = nn.Sequential( self.cls_head = nn.Sequential(
nn.Conv2d( nn.Conv2d(

View File

@@ -13,7 +13,8 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
import torchvision.transforms as TF import torchvision.transforms as TF
from PIL import Image from PIL import Image
from shotdetect_scenedetect_lgss import shot_detect from shotdetect_scenedetect_lgss import shot_detector
from tqdm import tqdm
from modelscope.metainfo import Models from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel from modelscope.models.base.base_torch_model import TorchModel
@@ -60,6 +61,9 @@ class MovieSceneSegmentationModel(TorchModel):
self.head_sbd = nn.Linear(hdim, 2) self.head_sbd = nn.Linear(hdim, 2)
load_param_with_prefix('head_sbd', self.head_sbd, params) load_param_with_prefix('head_sbd', self.head_sbd, params)
self.shot_detector = shot_detector()
self.shot_detector.init(**self.cfg.preprocessor.shot_detect)
self.test_transform = TF.Compose([ self.test_transform = TF.Compose([
TF.Resize(size=256, interpolation=Image.BICUBIC), TF.Resize(size=256, interpolation=Image.BICUBIC),
TF.CenterCrop(224), TF.CenterCrop(224),
@@ -98,29 +102,45 @@ class MovieSceneSegmentationModel(TorchModel):
def inference(self, batch): def inference(self, batch):
logger.info('Begin scene detect ......') logger.info('Begin scene detect ......')
bs = self.cfg.pipeline.batch_size_per_gpu bs = self.cfg.pipeline.batch_size_per_gpu
sids = batch['sid'] device = self.crn.attention_mask.device
inputs = batch['shot_feat']
shot_num = len(sids) shot_timecode_lst = batch['shot_timecode_lst']
shot_idx_lst = batch['shot_idx_lst']
shot_num = len(shot_timecode_lst)
cnt = math.ceil(shot_num / bs) cnt = math.ceil(shot_num / bs)
infer_sid, infer_pred = [], [] infer_pred = []
infer_result = {} infer_result = {}
for i in range(cnt): self.shot_detector.start()
for i in tqdm(range(cnt)):
start = i * bs start = i * bs
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
input_ = inputs[start:end]
sid_ = sids[start:end] batch_shot_idx_lst = shot_idx_lst[start:end]
input_ = torch.stack(input_)
shot_start_idx = batch_shot_idx_lst[0][0]
shot_end_idx = batch_shot_idx_lst[-1][-1]
batch_timecode_lst = {
i: shot_timecode_lst[i]
for i in range(shot_start_idx, shot_end_idx + 1)
}
batch_shot_keyf_lst = self.shot_detector.get_frame_img(
batch_timecode_lst, shot_start_idx, shot_num)
inputs = self.get_batch_input(batch_shot_keyf_lst, shot_start_idx,
batch_shot_idx_lst)
input_ = torch.stack(inputs).to(device)
outputs = self.shared_step(input_) # shape [b,2] outputs = self.shared_step(input_) # shape [b,2]
prob = F.softmax(outputs, dim=1) prob = F.softmax(outputs, dim=1)
infer_sid.extend(sid_.cpu().detach().numpy())
infer_pred.extend(prob[:, 1].cpu().detach().numpy()) infer_pred.extend(prob[:, 1].cpu().detach().numpy())
infer_result.update({'pred': np.stack(infer_pred)})
infer_result.update({'sid': infer_sid})
assert len(infer_result['sid']) == len(sids) infer_result.update({'pred': np.stack(infer_pred)})
assert len(infer_result['pred']) == len(inputs) infer_result.update({'sid': np.arange(shot_num)})
assert len(infer_result['pred']) == shot_num
self.shot_detector.release()
return infer_result return infer_result
def shared_step(self, inputs): def shared_step(self, inputs):
@@ -162,38 +182,48 @@ class MovieSceneSegmentationModel(TorchModel):
logger.info('Generate scene .......') logger.info('Generate scene .......')
pred_dict = inputs['feat'] pred_dict = inputs['feat']
shot2keyf = inputs['shot2keyf']
thres = self.cfg.pipeline.save_threshold thres = self.cfg.pipeline.save_threshold
anno_dict = get_pred_boundary(pred_dict, thres) anno_dict = get_pred_boundary(pred_dict, thres)
scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene( scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
self.shot2keyf, anno_dict) shot2keyf, anno_dict)
if self.cfg.pipeline.save_split_scene: if self.cfg.pipeline.save_split_scene:
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres) re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
print(f'Split scene video saved to {re_dir}') print(f'Split scene video saved to {re_dir}')
return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
def preprocess(self, inputs): def get_batch_input(self, shot_keyf_lst, shot_start_idx, shot_idx_lst):
logger.info('Begin shot detect......')
shot_keyf_lst, anno, shot2keyf = shot_detect(
inputs, **self.cfg.preprocessor.shot_detect)
logger.info('Shot detect done!')
single_shot_feat, sid = [], [] single_shot_feat = []
for idx, one_shot in enumerate(shot_keyf_lst): for idx, one_shot in enumerate(shot_keyf_lst):
one_shot = [ one_shot = [
self.test_transform(one_frame) for one_frame in one_shot self.test_transform(one_frame) for one_frame in one_shot
] ]
one_shot = torch.stack(one_shot, dim=0) one_shot = torch.stack(one_shot, dim=0)
single_shot_feat.append(one_shot) single_shot_feat.append(one_shot)
sid.append(idx)
single_shot_feat = torch.stack(single_shot_feat, dim=0) single_shot_feat = torch.stack(single_shot_feat, dim=0)
shot_feat = [] shot_feat = []
for idx, shot_idx in enumerate(shot_idx_lst):
shot_idx_ = shot_idx - shot_start_idx
_one_shot = single_shot_feat[shot_idx_]
shot_feat.append(_one_shot)
return shot_feat
def preprocess(self, inputs):
logger.info('Begin shot detect......')
shot_timecode_lst, anno, shot2keyf = self.shot_detector.shot_detect(
inputs, **self.cfg.preprocessor.shot_detect)
logger.info('Shot detect done!')
shot_idx_lst = []
for idx, one_shot in enumerate(anno): for idx, one_shot in enumerate(anno):
shot_idx = int(one_shot['shot_id']) + np.arange( shot_idx = int(one_shot['shot_id']) + np.arange(
-self.neighbor_size, self.neighbor_size + 1) -self.neighbor_size, self.neighbor_size + 1)
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot']) shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'] - 1)
_one_shot = single_shot_feat[shot_idx] shot_idx_lst.append(shot_idx)
shot_feat.append(_one_shot)
self.shot2keyf = shot2keyf return shot2keyf, anno, shot_timecode_lst, shot_idx_lst
self.anno = anno
return shot_feat, sid

View File

@@ -10,11 +10,12 @@ from tqdm import tqdm
def get_pred_boundary(pred_dict, threshold=0.5): def get_pred_boundary(pred_dict, threshold=0.5):
pred = pred_dict['pred'] pred = pred_dict['pred'].cpu().numpy()
sid = pred_dict['sid'].cpu().numpy().astype(np.int32)
tmp = (pred > threshold).astype(np.int32) tmp = (pred > threshold).astype(np.int32)
anno_dict = {} anno_dict = {}
for idx in range(len(tmp)): for idx in range(len(tmp)):
anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])}) anno_dict.update({str(sid[idx]).zfill(4): int(tmp[idx])})
return anno_dict return anno_dict
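For reference, the reworked `get_pred_boundary` keys the boundary dict by zero-padded shot ids; a minimal sketch with illustrative scores and the default threshold:

```python
import numpy as np

pred = np.array([0.1, 0.7, 0.4])            # per-shot boundary scores (illustrative)
sid = np.arange(len(pred), dtype=np.int32)  # shot ids
anno_dict = {str(sid[i]).zfill(4): int(pred[i] > 0.5) for i in range(len(pred))}
# -> {'0000': 0, '0001': 1, '0002': 0}
```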

View File

@@ -31,7 +31,7 @@ class ObjectSegmenter(object):
elif img.shape[2] == 4: elif img.shape[2] == 4:
img = img[:, :, :3] img = img[:, :, :3]
img = img[:, :, ::-1] img = img[:, :, ::-1]
img = img.astype(np.float) img = img.astype(float)
return img return img
def run_mask(self, img): def run_mask(self, img):
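Most of the one-line changes in this commit swap the NumPy scalar aliases (`np.float`, `np.int`, `np.bool`, `np.long`) for the Python builtins; the aliases were deprecated around NumPy 1.20 and later removed. A minimal before/after sketch:

```python
import numpy as np

# Deprecated alias  ->  builtin replacement used throughout this commit
mask = np.zeros((4, 4), dtype=bool)      # was: dtype=np.bool
ious = np.zeros((3, 5), dtype=float)     # was: dtype=np.float
ids = np.arange(10, dtype=int)           # was: dtype=np.long
rank = np.floor(0.7 * 9).astype(int)     # was: .astype(np.int)
```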

View File

@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.detection.detectors import Detection as _Detection
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.image_object_detection, module_name=Models.dino)
class DINO(EasyCVBaseModel, _Detection):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
_Detection.__init__(self, *args, **kwargs)

View File

@@ -1,21 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.detection.detectors import YOLOX as _YOLOX
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.image_object_detection, module_name=Models.yolox)
@MODELS.register_module(
group_key=Tasks.image_object_detection,
module_name=Models.image_object_detection_auto)
@MODELS.register_module(
group_key=Tasks.domain_specific_object_detection, module_name=Models.yolox)
class YOLOX(EasyCVBaseModel, _YOLOX):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
_YOLOX.__init__(self, *args, **kwargs)

View File

@@ -30,7 +30,7 @@ def depth2color(depth):
if gray == 1: if gray == 1:
return tuple(colors[-1].tolist()) return tuple(colors[-1].tolist())
num_rank = len(colors) - 1 num_rank = len(colors) - 1
rank = np.floor(gray * num_rank).astype(np.int) rank = np.floor(gray * num_rank).astype(int)
diff = (gray - rank / num_rank) * num_rank diff = (gray - rank / num_rank) * num_rank
tmp = colors[rank + 1] - colors[rank] tmp = colors[rank + 1] - colors[rank]
return tuple((colors[rank] + tmp * diff).tolist()) return tuple((colors[rank] + tmp * diff).tolist())
@@ -136,7 +136,7 @@ def plot_result(res_path,
l2g = get_lidar2global(infos) l2g = get_lidar2global(infos)
corners_lidar = corners_global @ np.linalg.inv(l2g).T corners_lidar = corners_global @ np.linalg.inv(l2g).T
corners_lidar = corners_lidar[:, :3] corners_lidar = corners_lidar[:, :3]
pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=np.bool) pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=bool)
scores = [ scores = [
pred_res[rid]['detection_score'] for rid in range(len(pred_res)) pred_res[rid]['detection_score'] for rid in range(len(pred_res))
] ]
@@ -151,7 +151,7 @@ def plot_result(res_path,
origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3) origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt], corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
axis=0) axis=0)
gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=np.bool) gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=bool)
pred_flag = np.concatenate( pred_flag = np.concatenate(
[pred_flag, np.logical_not(gt_flag)], axis=0) [pred_flag, np.logical_not(gt_flag)], axis=0)
scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])] scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
@@ -169,7 +169,7 @@ def plot_result(res_path,
check_point_in_img(corners_img, img.shape[0], img.shape[1])) check_point_in_img(corners_img, img.shape[0], img.shape[1]))
valid = valid.reshape( valid = valid.reshape(
-1, 8) # valid means: d>0 and visible in current view -1, 8) # valid means: d>0 and visible in current view
corners_img = corners_img.reshape(-1, 8, 2).astype(np.int) corners_img = corners_img.reshape(-1, 8, 2).astype(int)
for aid in range(valid.shape[0]): for aid in range(valid.shape[0]):
if scores[aid] < vis_thred and pred_flag[aid]: if scores[aid] < vis_thred and pred_flag[aid]:
continue continue

View File

@@ -90,8 +90,15 @@ class OCRRecognition(TorchModel):
f'recognizer should be either ConvNextViT, CRNN, but got {cfgs.model.recognizer}' f'recognizer should be either ConvNextViT, CRNN, but got {cfgs.model.recognizer}'
) )
if model_path != '': if model_path != '':
self.recognizer.load_state_dict( params_pretrained = torch.load(model_path, map_location='cpu')
torch.load(model_path, map_location='cpu')) model_dict = self.recognizer.state_dict()
# remove prefix for finetuned models
check_point = {
k.replace('recognizer.', ''): v
for k, v in params_pretrained.items()
}
model_dict.update(check_point)
self.recognizer.load_state_dict(model_dict)
dict_path = os.path.join(model_dir, ModelFile.VOCAB_FILE) dict_path = os.path.join(model_dir, ModelFile.VOCAB_FILE)
self.labelMapping = dict() self.labelMapping = dict()

View File

@@ -176,8 +176,7 @@ class OpenVocabularyDetectionViLD(Model):
# Filter out invalid rois (nmsed rois) # Filter out invalid rois (nmsed rois)
valid_indices = np.where( valid_indices = np.where(
np.logical_and( np.logical_and(
np.isin( np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
np.arange(len(roi_scores), dtype=np.int), nmsed_indices),
np.logical_and( np.logical_and(
np.logical_not(np.all(roi_boxes == 0., axis=-1)), np.logical_not(np.all(roi_boxes == 0., axis=-1)),
np.logical_and(roi_scores >= min_rpn_score_thresh, np.logical_and(roi_scores >= min_rpn_score_thresh,

View File

@@ -72,7 +72,7 @@ class Cube2Equirec(nn.Module):
self.equ_h, 0), 3 * self.equ_w // 8, 1) self.equ_h, 0), 3 * self.equ_w // 8, 1)
# Prepare ceil mask # Prepare ceil mask
mask = np.zeros((self.equ_h, self.equ_w // 4), np.bool) mask = np.zeros((self.equ_h, self.equ_w // 4), bool)
idx = np.linspace(-np.pi, np.pi, self.equ_w // 4) / 4 idx = np.linspace(-np.pi, np.pi, self.equ_w // 4) / 4
idx = self.equ_h // 2 - np.round( idx = self.equ_h // 2 - np.round(
np.arctan(np.cos(idx)) * self.equ_h / np.pi).astype(int) np.arctan(np.cos(idx)) * self.equ_h / np.pi).astype(int)

View File

@@ -29,7 +29,7 @@ def load_depth(file):
elif file.endswith('png'): elif file.endswith('png'):
depth_png = np.array(load_image(file), dtype=int) depth_png = np.array(load_image(file), dtype=int)
assert (np.max(depth_png) > 255), 'Wrong .png depth file' assert (np.max(depth_png) > 255), 'Wrong .png depth file'
return depth_png.astype(np.float) / 256. return depth_png.astype(float) / 256.
else: else:
raise NotImplementedError('Depth extension not supported.') raise NotImplementedError('Depth extension not supported.')

View File

@@ -85,7 +85,7 @@ def do_scene_detect(F01_tensor, F10_tensor, img0_tensor, img1_tensor):
img_diff = ori_img.float() - ref_img.float() img_diff = ori_img.float() - ref_img.float()
img_diff = torch.abs(img_diff) img_diff = torch.abs(img_diff)
kernel = np.ones([8, 8], np.float) / 64 kernel = np.ones([8, 8], float) / 64
kernel = torch.FloatTensor(kernel).to(device).unsqueeze(0).unsqueeze(0) kernel = torch.FloatTensor(kernel).to(device).unsqueeze(0).unsqueeze(0)
diff = F.conv2d(img_diff, kernel, padding=4) diff = F.conv2d(img_diff, kernel, padding=4)

View File

@@ -27,7 +27,7 @@ def linear_assignment(cost_matrix, thresh):
def ious(atlbrs, btlbrs): def ious(atlbrs, btlbrs):
ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=float)
if ious.size == 0: if ious.size == 0:
return ious return ious
@@ -60,13 +60,13 @@ def embedding_distance(tracks, detections, metric='cosine'):
cost_matrix: np.ndarray cost_matrix: np.ndarray
""" """
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
if cost_matrix.size == 0: if cost_matrix.size == 0:
return cost_matrix return cost_matrix
det_features = np.asarray([track.curr_feat for track in detections], det_features = np.asarray([track.curr_feat for track in detections],
dtype=np.float) dtype=float)
track_features = np.asarray([track.smooth_feat for track in tracks], track_features = np.asarray([track.smooth_feat for track in tracks],
dtype=np.float) dtype=float)
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric)) cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
return cost_matrix return cost_matrix

View File

@@ -28,7 +28,7 @@ class STrack(BaseTrack):
def __init__(self, tlwh, score, temp_feat, buffer_size=30): def __init__(self, tlwh, score, temp_feat, buffer_size=30):
# wait activate # wait activate
self._tlwh = np.asarray(tlwh, dtype=np.float) self._tlwh = np.asarray(tlwh, dtype=float)
self.kalman_filter = None self.kalman_filter = None
self.mean, self.covariance = None, None self.mean, self.covariance = None, None
self.is_activated = False self.is_activated = False

View File

@@ -20,6 +20,8 @@ if TYPE_CHECKING:
from .vldoc import VLDocForDocVLEmbedding from .vldoc import VLDocForDocVLEmbedding
from .video_synthesis import TextToVideoSynthesis from .video_synthesis import TextToVideoSynthesis
from .efficient_diffusion_tuning import EfficientStableDiffusion from .efficient_diffusion_tuning import EfficientStableDiffusion
from .mplug_owl import MplugOwlForConditionalGeneration
from .clip_interrogator import CLIP_Interrogator
else: else:
_import_structure = { _import_structure = {
@@ -37,7 +39,9 @@ else:
['MultiStageDiffusionForTextToImageSynthesis'], ['MultiStageDiffusionForTextToImageSynthesis'],
'vldoc': ['VLDocForDocVLEmbedding'], 'vldoc': ['VLDocForDocVLEmbedding'],
'video_synthesis': ['TextToVideoSynthesis'], 'video_synthesis': ['TextToVideoSynthesis'],
'efficient_diffusion_tuning': ['EfficientStableDiffusion'] 'efficient_diffusion_tuning': ['EfficientStableDiffusion'],
'mplug_owl': ['MplugOwlForConditionalGeneration'],
'clip_interrogator': ['CLIP_Interrogator'],
} }
import sys import sys

View File

@@ -0,0 +1 @@
from .model import CLIP_Interrogator

View File

@@ -0,0 +1,599 @@
# This implementation is adopted from CLIP-Interrogator, made publicly available under the MIT License at
# https://github.com/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator/clip_interrogator.py
import hashlib
import math
import os
import time
from dataclasses import dataclass
from typing import List, Optional
import numpy as np
import open_clip
import requests
import torch
import torchvision.transforms as transforms
from PIL import Image
from safetensors.numpy import load_file, save_file
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoProcessor,
Blip2ForConditionalGeneration,
BlipForConditionalGeneration)
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['CLIP_Interrogator']
CAPTION_MODELS = {
'blip-base': 'blip-image-captioning-base',
'blip-large': 'blip-image-captioning-large',
'blip2-2.7b': 'blip2-opt-2.7b',
'blip2-flan-t5-xl': 'blip2-flan-t5-xl',
'git-large-coco': 'git-large-coco',
}
@dataclass
class Config:
# models can optionally be passed in directly
caption_model = None
caption_processor = None
clip_model = None
clip_preprocess = None
# blip settings
caption_max_length: int = 32
caption_model_name: Optional[
str] = 'blip-large' # use a key from CAPTION_MODELS or None
caption_offload: bool = False
# clip settings
clip_model_name: str = 'ViT-L-14/openai'
clip_model_path: Optional[str] = None
clip_offload: bool = False
# interrogator settings
cache_path: str = 'cache' # path to store cached text embeddings
download_cache: bool = False # when true, cached embeds are downloaded from huggingface
chunk_size: int = 2048 # batch size for CLIP, use smaller for lower VRAM
data_path: str = os.path.join(os.path.dirname(__file__), 'data')
device: str = ('cuda' if torch.cuda.is_available() else 'cpu')
flavor_intermediate_count: int = 2048
quiet: bool = False # when True, progress bars are not shown
def apply_low_vram_defaults(self):
self.caption_model_name = 'blip-base'
self.caption_offload = True
self.clip_offload = True
self.chunk_size = 1024
self.flavor_intermediate_count = 1024
# CLIP-Interrogator utilizes CLIP and BLIP to generate rich captions for images.
# CLIP is a zero-shot image classifier which can be used to generate image and text embeddings.
# BLIP is a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks.
# BLIP effectively utilizes the noisy web data by bootstrapping the captions, where
# a captioner generates synthetic captions and a filter removes the noisy ones.
# Please refer to the papers CLIP: Learning Transferable Visual Models From Natural Language Supervision
# https://arxiv.org/abs/2103.00020
# BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
# https://arxiv.org/abs/2201.12086
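A minimal construction sketch for the classes below, assuming the CLIP/BLIP weights and the `data/` term lists are available locally; the paths are placeholders:

```python
from PIL import Image

cfg = Config(clip_model_name='ViT-L-14/openai')
cfg.cache_path = '/path/to/model_dir'       # placeholder: local caption/CLIP checkpoints
cfg.data_path = '/path/to/model_dir/data'   # placeholder: artists.txt, flavors.txt, ...
cfg.apply_low_vram_defaults()               # optional: smaller BLIP + CPU offloading

ci = Interrogator(cfg)
prompt = ci.interrogate(Image.open('example.jpg'))  # BLIP caption + CLIP-ranked modifiers
```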
class Interrogator():
def __init__(self, config: Config):
self.config = config
self.device = config.device
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
self.caption_offloaded = True
self.clip_offloaded = True
self.load_caption_model()
self.load_clip_model()
def load_caption_model(self):
if self.config.caption_model is None and self.config.caption_model_name:
if not self.config.quiet:
print(
f'Loading caption model {self.config.caption_model_name}...'
)
model_path = CAPTION_MODELS[self.config.caption_model_name]
if self.config.caption_model_name.startswith('git-'):
caption_model = AutoModelForCausalLM.from_pretrained(
os.path.join(self.config.cache_path, model_path),
torch_dtype=torch.float32)
elif self.config.caption_model_name.startswith('blip2-'):
caption_model = Blip2ForConditionalGeneration.from_pretrained(
os.path.join(self.config.cache_path, model_path),
torch_dtype=self.dtype)
else:
caption_model = BlipForConditionalGeneration.from_pretrained(
os.path.join(self.config.cache_path, model_path),
torch_dtype=self.dtype)
self.caption_processor = AutoProcessor.from_pretrained(
os.path.join(self.config.cache_path, model_path))
caption_model.eval()
if not self.config.caption_offload:
caption_model = caption_model.to(self.config.device)
self.caption_model = caption_model
else:
self.caption_model = self.config.caption_model
self.caption_processor = self.config.caption_processor
def load_clip_model(self):
start_time = time.time()
config = self.config
clip_model_name, clip_model_pretrained_name = config.clip_model_name.split(
'/', 2)
if config.clip_model is None:
if not config.quiet:
print(f'Loading CLIP model {config.clip_model_name}...')
self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
clip_model_name,
pretrained=clip_model_pretrained_name,
precision='fp16' if config.device == 'cuda' else 'fp32',
device=config.device,
jit=False,
cache_dir=config.clip_model_path)
self.clip_model.eval()
else:
self.clip_model = config.clip_model
self.clip_preprocess = config.clip_preprocess
self.tokenize = open_clip.get_tokenizer(clip_model_name)
sites = [
'Artstation', 'behance', 'cg society', 'cgsociety', 'deviantart',
'dribbble', 'flickr', 'instagram', 'pexels', 'pinterest',
'pixabay', 'pixiv', 'polycount', 'reddit', 'shutterstock',
'tumblr', 'unsplash', 'zbrush central'
]
trending_list = [site for site in sites]
trending_list.extend(['trending on ' + site for site in sites])
trending_list.extend(['featured on ' + site for site in sites])
trending_list.extend([site + ' contest winner' for site in sites])
raw_artists = load_list(config.data_path, 'artists.txt')
artists = [f'by {a}' for a in raw_artists]
artists.extend([f'inspired by {a}' for a in raw_artists])
self._prepare_clip()
self.artists = LabelTable(artists, 'artists', self)
self.flavors = LabelTable(
load_list(config.data_path, 'flavors.txt'), 'flavors', self)
self.mediums = LabelTable(
load_list(config.data_path, 'mediums.txt'), 'mediums', self)
self.movements = LabelTable(
load_list(config.data_path, 'movements.txt'), 'movements', self)
self.trendings = LabelTable(trending_list, 'trendings', self)
self.negative = LabelTable(
load_list(config.data_path, 'negative.txt'), 'negative', self)
end_time = time.time()
if not config.quiet:
print(
f'Loaded CLIP model and data in {end_time-start_time:.2f} seconds.'
)
def chain(self,
image_features: torch.Tensor,
phrases: List[str],
best_prompt: str = '',
best_sim: float = 0,
min_count: int = 8,
max_count: int = 32,
desc='Chaining',
reverse: bool = False) -> str:
self._prepare_clip()
phrases = set(phrases)
if not best_prompt:
best_prompt = self.rank_top(
image_features, [f for f in phrases], reverse=reverse)
best_sim = self.similarity(image_features, best_prompt)
phrases.remove(best_prompt)
curr_prompt, curr_sim = best_prompt, best_sim
def check(addition: str, idx: int) -> bool:
nonlocal best_prompt, best_sim, curr_prompt, curr_sim
prompt = curr_prompt + ', ' + addition
sim = self.similarity(image_features, prompt)
if reverse:
sim = -sim
if sim > best_sim:
best_prompt, best_sim = prompt, sim
if sim > curr_sim or idx < min_count:
curr_prompt, curr_sim = prompt, sim
return True
return False
for idx in tqdm(
range(max_count), desc=desc, disable=self.config.quiet):
best = self.rank_top(
image_features, [f'{curr_prompt}, {f}' for f in phrases],
reverse=reverse)
flave = best[len(curr_prompt) + 2:]
if not check(flave, idx):
break
if _prompt_at_max_len(curr_prompt, self.tokenize):
break
phrases.remove(flave)
return best_prompt
def generate_caption(self, pil_image: Image) -> str:
assert self.caption_model is not None, 'No caption model loaded.'
self._prepare_caption()
inputs = self.caption_processor(
images=pil_image, return_tensors='pt').to(self.device)
if not self.config.caption_model_name.startswith('git-'):
inputs = inputs.to(self.dtype)
tokens = self.caption_model.generate(
**inputs, max_new_tokens=self.config.caption_max_length)
return self.caption_processor.batch_decode(
tokens, skip_special_tokens=True)[0].strip()
def image_to_features(self, image: Image) -> torch.Tensor:
self._prepare_clip()
images = self.clip_preprocess(image).unsqueeze(0).to(self.device)
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = self.clip_model.encode_image(images)
image_features /= image_features.norm(dim=-1, keepdim=True)
return image_features
def interrogate_classic(self,
image: Image,
max_flavors: int = 3,
caption: Optional[str] = None) -> str:
"""Classic mode creates a prompt in a standard format first describing the image,
then listing the artist, trending, movement, and flavor text modifiers."""
caption = caption or self.generate_caption(image)
image_features = self.image_to_features(image)
medium = self.mediums.rank(image_features, 1)[0]
artist = self.artists.rank(image_features, 1)[0]
trending = self.trendings.rank(image_features, 1)[0]
movement = self.movements.rank(image_features, 1)[0]
flaves = ', '.join(self.flavors.rank(image_features, max_flavors))
if caption.startswith(medium):
prompt = f'{caption} {artist}, {trending}, {movement}, {flaves}'
else:
prompt = f'{caption}, {medium} {artist}, {trending}, {movement}, {flaves}'
return _truncate_to_fit(prompt, self.tokenize)
def interrogate_fast(self,
image: Image,
max_flavors: int = 32,
caption: Optional[str] = None) -> str:
"""Fast mode simply adds the top ranked terms after a caption. It generally results in
better similarity between generated prompt and image than classic mode, but the prompts
are less readable."""
caption = caption or self.generate_caption(image)
image_features = self.image_to_features(image)
merged = _merge_tables([
self.artists, self.flavors, self.mediums, self.movements,
self.trendings
], self)
tops = merged.rank(image_features, max_flavors)
return _truncate_to_fit(caption + ', ' + ', '.join(tops),
self.tokenize)
def interrogate_negative(self, image: Image, max_flavors: int = 32) -> str:
"""Negative mode chains together the most dissimilar terms to the image. It can be used
to help build a negative prompt to pair with the regular positive prompt and often
improve the results of generated images particularly with Stable Diffusion 2."""
image_features = self.image_to_features(image)
flaves = self.flavors.rank(
image_features,
self.config.flavor_intermediate_count,
reverse=True)
flaves = flaves + self.negative.labels
return self.chain(
image_features,
flaves,
max_count=max_flavors,
reverse=True,
desc='Negative chain')
def interrogate(self,
image: Image,
min_flavors: int = 8,
max_flavors: int = 32,
caption: Optional[str] = None) -> str:
caption = caption or self.generate_caption(image)
image_features = self.image_to_features(image)
merged = _merge_tables([
self.artists, self.flavors, self.mediums, self.movements,
self.trendings
], self)
flaves = merged.rank(image_features,
self.config.flavor_intermediate_count)
best_prompt, best_sim = caption, self.similarity(
image_features, caption)
best_prompt = self.chain(
image_features,
flaves,
best_prompt,
best_sim,
min_count=min_flavors,
max_count=max_flavors,
desc='Flavor chain')
fast_prompt = self.interrogate_fast(
image, max_flavors, caption=caption)
classic_prompt = self.interrogate_classic(
image, max_flavors, caption=caption)
candidates = [caption, classic_prompt, fast_prompt, best_prompt]
return candidates[np.argmax(
self.similarities(image_features, candidates))]
def rank_top(self,
image_features: torch.Tensor,
text_array: List[str],
reverse: bool = False) -> str:
self._prepare_clip()
text_tokens = self.tokenize([text
for text in text_array]).to(self.device)
with torch.no_grad(), torch.cuda.amp.autocast():
text_features = self.clip_model.encode_text(text_tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = text_features @ image_features.T
if reverse:
similarity = -similarity
return text_array[similarity.argmax().item()]
def similarity(self, image_features: torch.Tensor, text: str) -> float:
self._prepare_clip()
text_tokens = self.tokenize([text]).to(self.device)
with torch.no_grad(), torch.cuda.amp.autocast():
text_features = self.clip_model.encode_text(text_tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = text_features @ image_features.T
return similarity[0][0].item()
def similarities(self, image_features: torch.Tensor,
text_array: List[str]) -> List[float]:
self._prepare_clip()
text_tokens = self.tokenize([text
for text in text_array]).to(self.device)
with torch.no_grad(), torch.cuda.amp.autocast():
text_features = self.clip_model.encode_text(text_tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)
similarity = text_features @ image_features.T
return similarity.T[0].tolist()
def _prepare_caption(self):
if self.config.clip_offload and not self.clip_offloaded:
self.clip_model = self.clip_model.to('cpu')
self.clip_offloaded = True
if self.caption_offloaded:
self.caption_model = self.caption_model.to(self.device)
self.caption_offloaded = False
def _prepare_clip(self):
if self.config.caption_offload and not self.caption_offloaded:
self.caption_model = self.caption_model.to('cpu')
self.caption_offloaded = True
if self.clip_offloaded:
self.clip_model = self.clip_model.to(self.device)
self.clip_offloaded = False
class LabelTable():
def __init__(self, labels: List[str], desc: str, ci: Interrogator):
clip_model, config = ci.clip_model, ci.config
self.chunk_size = config.chunk_size
self.config = config
self.device = config.device
self.embeds = []
self.labels = labels
self.tokenize = ci.tokenize
hash = hashlib.sha256(','.join(labels).encode()).hexdigest()
sanitized_name = self.config.clip_model_name.replace('/', '_').replace(
'@', '_')
self._load_cached(desc, hash, sanitized_name)
if len(self.labels) != len(self.embeds):
self.embeds = []
chunks = np.array_split(
self.labels, max(1,
len(self.labels) / config.chunk_size))
for chunk in tqdm(
chunks,
desc=f'Preprocessing {desc}' if desc else None,
disable=self.config.quiet):
text_tokens = self.tokenize(chunk).to(self.device)
with torch.no_grad(), torch.cuda.amp.autocast():
text_features = clip_model.encode_text(text_tokens)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_features = text_features.half().cpu().numpy()
for i in range(text_features.shape[0]):
self.embeds.append(text_features[i])
if desc and self.config.cache_path:
os.makedirs(self.config.cache_path, exist_ok=True)
cache_filepath = os.path.join(
self.config.cache_path,
f'{sanitized_name}_{desc}.safetensors')
tensors = {
'embeds': np.stack(self.embeds),
'hash': np.array([ord(c) for c in hash], dtype=np.int8)
}
save_file(tensors, cache_filepath)
if self.device == 'cpu' or self.device == torch.device('cpu'):
self.embeds = [e.astype(np.float32) for e in self.embeds]
def _load_cached(self, desc: str, hash: str, sanitized_name: str) -> bool:
if self.config.cache_path is None or desc is None:
return False
cached_safetensors = os.path.join(
self.config.cache_path, f'{sanitized_name}_{desc}.safetensors')
if os.path.exists(cached_safetensors):
try:
tensors = load_file(cached_safetensors)
except Exception as e:
print(f'Failed to load {cached_safetensors}')
print(e)
return False
if 'hash' in tensors and 'embeds' in tensors:
if np.array_equal(
tensors['hash'],
np.array([ord(c) for c in hash], dtype=np.int8)):
self.embeds = tensors['embeds']
if len(self.embeds.shape) == 2:
self.embeds = [
self.embeds[i] for i in range(self.embeds.shape[0])
]
return True
return False
def _rank(self,
image_features: torch.Tensor,
text_embeds: torch.Tensor,
top_count: int = 1,
reverse: bool = False) -> str:
top_count = min(top_count, len(text_embeds))
text_embeds = torch.stack([torch.from_numpy(t)
for t in text_embeds]).to(self.device)
with torch.cuda.amp.autocast():
similarity = image_features @ text_embeds.T
if reverse:
similarity = -similarity
_, top_labels = similarity.float().cpu().topk(top_count, dim=-1)
return [top_labels[0][i].numpy() for i in range(top_count)]
def rank(self,
image_features: torch.Tensor,
top_count: int = 1,
reverse: bool = False) -> List[str]:
if len(self.labels) <= self.chunk_size:
tops = self._rank(
image_features,
self.embeds,
top_count=top_count,
reverse=reverse)
return [self.labels[i] for i in tops]
num_chunks = int(math.ceil(len(self.labels) / self.chunk_size))
keep_per_chunk = int(self.chunk_size / num_chunks)
top_labels, top_embeds = [], []
for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet):
start = chunk_idx * self.chunk_size
stop = min(start + self.chunk_size, len(self.embeds))
tops = self._rank(
image_features,
self.embeds[start:stop],
top_count=keep_per_chunk,
reverse=reverse)
top_labels.extend([self.labels[start + i] for i in tops])
top_embeds.extend([self.embeds[start + i] for i in tops])
tops = self._rank(image_features, top_embeds, top_count=top_count)
return [top_labels[i] for i in tops]
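The `rank` method above bounds CLIP batch sizes by ranking per chunk and then re-ranking the survivors. The same two-stage top-k idea in plain NumPy, as a standalone sketch:

```python
import numpy as np

def chunked_topk(scores: np.ndarray, chunk_size: int, top_count: int) -> np.ndarray:
    """Keep a few best indices per chunk, then re-rank the survivors globally."""
    num_chunks = int(np.ceil(len(scores) / chunk_size))
    keep_per_chunk = max(1, chunk_size // num_chunks)
    survivors = []
    for start in range(0, len(scores), chunk_size):
        chunk = scores[start:start + chunk_size]
        survivors.extend(start + np.argsort(-chunk)[:keep_per_chunk])
    survivors = np.asarray(survivors)
    best = np.argsort(-scores[survivors])[:top_count]
    return survivors[best]  # indices of the overall top_count scores

# chunked_topk(np.random.rand(10000), chunk_size=2048, top_count=5)
```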
def _download_file(url: str,
filepath: str,
chunk_size: int = 4 * 1024 * 1024,
quiet: bool = False):
r = requests.get(url, stream=True)
if r.status_code != 200:
return
file_size = int(r.headers.get('Content-Length', 0))
filename = url.split('/')[-1]
progress = tqdm(
total=file_size,
unit='B',
unit_scale=True,
desc=filename,
disable=quiet)
with open(filepath, 'wb') as f:
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
f.write(chunk)
progress.update(len(chunk))
progress.close()
def _merge_tables(tables: List[LabelTable], ci: Interrogator) -> LabelTable:
m = LabelTable([], None, ci)
for table in tables:
m.labels.extend(table.labels)
m.embeds.extend(table.embeds)
return m
def _prompt_at_max_len(text: str, tokenize) -> bool:
tokens = tokenize([text])
return tokens[0][-1] != 0
def _truncate_to_fit(text: str, tokenize) -> str:
parts = text.split(', ')
new_text = parts[0]
for part in parts[1:]:
if _prompt_at_max_len(new_text + part, tokenize):
break
new_text += ', ' + part
return new_text
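The two helpers above detect and enforce the CLIP context limit: a non-zero final token means the 77-token window is full, so trailing `, part` segments are dropped. A small sketch, assuming `open_clip` is installed:

```python
tok = open_clip.get_tokenizer('ViT-L-14')
long_prompt = ', '.join(['a very detailed oil painting of a castle'] * 40)

print(_prompt_at_max_len(long_prompt, tok))   # True: last token slot is occupied
short_prompt = _truncate_to_fit(long_prompt, tok)
print(_prompt_at_max_len(short_prompt, tok))  # False: now fits within the context window
```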
def list_caption_models() -> List[str]:
return list(CAPTION_MODELS.keys())
def list_clip_models() -> List[str]:
return ['/'.join(x) for x in open_clip.list_pretrained()]
def load_list(data_path: str, filename: Optional[str] = None) -> List[str]:
"""Load a list of strings from a file."""
if filename is not None:
data_path = os.path.join(data_path, filename)
with open(data_path, 'r', encoding='utf-8', errors='replace') as f:
items = [line.strip() for line in f.readlines()]
return items
@MODELS.register_module(
Tasks.image_captioning, module_name=Models.clip_interrogator)
class CLIP_Interrogator(TorchModel):
def __init__(self, model_dir, device='cuda', device_id=0, *args, **kwargs):
super().__init__(
model_dir=model_dir, device_id=device_id, *args, **kwargs)
self.device = device
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
cf = Config(clip_model_name='ViT-L-14/openai')
cf.data_path = os.path.join(model_dir, 'data')
cf.clip_model_path = model_dir
cf.cache_path = model_dir
self.ci = Interrogator(cf)
def forward(self, inputs):
image = transforms.ToPILImage()(inputs)
return {'caption': self.ci.interrogate(image)}
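A short end-to-end sketch of invoking the wrapper directly; the model directory and image path are placeholders, and the directory is assumed to contain the CLIP/BLIP checkpoints plus the `data/` term lists expected by `Config`:

```python
import torchvision.transforms as transforms
from PIL import Image

model = CLIP_Interrogator(model_dir='/path/to/clip_interrogator_model_dir')  # placeholder
image_tensor = transforms.ToTensor()(Image.open('example.jpg').convert('RGB'))
result = model(image_tensor)   # forward() converts the tensor back to PIL and interrogates
print(result['caption'])
```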

View File

@@ -128,13 +128,13 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
local_transform, local_transform,
s=None, s=None,
e=None): e=None):
video_mask = np.zeros(self.max_frames, dtype=np.long) video_mask = np.zeros(self.max_frames, dtype=int)
max_video_length = 0 max_video_length = 0
# T x 3 x H x W # T x 3 x H x W
video = np.zeros((self.max_frames, 3, rawVideoExtractor.size, video = np.zeros((self.max_frames, 3, rawVideoExtractor.size,
rawVideoExtractor.size), rawVideoExtractor.size),
dtype=np.float) dtype=float)
if s is None: if s is None:
start_time, end_time = None, None start_time, end_time = None, None

View File

@@ -0,0 +1,18 @@
# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig,
MplugOwlVisualAbstractorConfig)
from .modeling_mplug_owl import MplugOwlForConditionalGeneration

View File

@@ -0,0 +1,257 @@
# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" MPLUG OWL model configuration """
import copy
import os
from typing import Union
from transformers import PretrainedConfig
from transformers.models.auto import CONFIG_MAPPING
from transformers.utils import logging
from modelscope.utils.constant import Tasks
logger = logging.get_logger()
class MplugOwlVisionConfig(PretrainedConfig):
r"""
Args:
hidden_size (`int`, *optional*, defaults to 1024):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 24):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 14):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
```"""
model_type = 'mplug_owl_vision_model'
def __init__(
self,
hidden_size=1024,
intermediate_size=4096,
projection_dim=768,
num_hidden_layers=24,
num_attention_heads=16,
num_channels=3,
image_size=224,
patch_size=14,
hidden_act='quick_gelu',
layer_norm_eps=1e-6,
attention_dropout=0.0,
initializer_range=0.02,
initializer_factor=1.0,
use_flash_attn=False,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.projection_dim = projection_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.initializer_range = initializer_range
self.initializer_factor = initializer_factor
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
self.use_flash_attn = use_flash_attn
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
os.PathLike],
**kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs)
# get the vision config dict if we are loading from MplugOwlConfig
if config_dict.get('model_type') == 'mplug_owl':
config_dict = config_dict['vision_config']
if 'model_type' in config_dict and hasattr(
cls,
'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
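For illustration, the vision config can be instantiated with the defaults documented above or with selected fields overridden; the values shown are only examples:

```python
vision_cfg = MplugOwlVisionConfig()
print(vision_cfg.hidden_size, vision_cfg.num_hidden_layers, vision_cfg.patch_size)  # 1024 24 14

# Overriding selected fields, e.g. a larger input resolution
vision_cfg_hi_res = MplugOwlVisionConfig(image_size=448)
```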
class MplugOwlVisualAbstractorConfig(PretrainedConfig):
model_type = 'MPlugOwlVisualAbstractor'
def __init__(
self,
hidden_size=1024,
num_hidden_layers=6,
num_attention_heads=16,
intermediate_size=4096,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
layer_norm_eps=1e-6,
encoder_hidden_size=1024,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.encoder_hidden_size = encoder_hidden_size
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
os.PathLike],
**kwargs) -> 'PretrainedConfig':
config_dict, kwargs = cls.get_config_dict(
pretrained_model_name_or_path, **kwargs)
# get the qformer config dict if we are loading from MplugOwlConfig
if config_dict.get('model_type') == 'mplug_owl':
config_dict = config_dict['abstractor_config']
if 'model_type' in config_dict and hasattr(
cls,
'model_type') and config_dict['model_type'] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
)
return cls.from_dict(config_dict, **kwargs)
class MplugOwlConfig(PretrainedConfig):
r"""
Args:
vision_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`MplugOwlVisionConfig`].
visual_abstractor_config (`dict`, *optional*):
Dictionary of configuration options used to initialize [`MplugOwlVisualAbstractorConfig`].
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
num_query_tokens (`int`, *optional*, defaults to 64):
The number of query tokens passed through the Transformer.
kwargs (*optional*):
Dictionary of keyword arguments.
"""
model_type = 'mplug_owl'
is_composition = True
def __init__(self,
task=Tasks.multimodal_dialogue,
vision_config=None,
visual_abstractor_config=None,
text_config=None,
num_query_tokens=64,
**kwargs):
super().__init__(**kwargs)
self.task = task
if vision_config is None:
vision_config = MplugOwlVisionConfig().to_dict()
logger.info('vision_config is None.')
if visual_abstractor_config is None:
visual_abstractor_config = {}
logger.info('abstractor_config is None. ')
if text_config is None:
# we use LLAMA 7b by default
from transformers.models.llama.configuration_llama import \
LlamaConfig
text_config = LlamaConfig(pad_token_id=2).to_dict()
logger.info('text_config is None.')
self.vision_config = MplugOwlVisionConfig(**vision_config)
self.visual_abstractor_config = MplugOwlVisualAbstractorConfig(
**visual_abstractor_config)
text_model_type = text_config[
'model_type'] if 'model_type' in text_config else 'llama'
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
self.tie_word_embeddings = self.text_config.tie_word_embeddings
self.num_query_tokens = num_query_tokens
self.initializer_factor = 1.0
self.initializer_range = 0.02
@classmethod
def from_vision_abstractor_text_configs(
cls,
vision_config: MplugOwlVisionConfig,
visual_abstractor_config: MplugOwlVisualAbstractorConfig,
text_config: PretrainedConfig,
**kwargs,
):
r"""
Returns:
[`MplugOwlConfig`]: An instance of a configuration object
"""
return cls(
vision_config=vision_config.to_dict(),
visual_abstractor_config=visual_abstractor_config.to_dict(),
text_config=text_config.to_dict(),
**kwargs,
)
def to_dict(self):
"""
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
Returns:
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
output['vision_config'] = self.vision_config.to_dict()
tmp = self.visual_abstractor_config.to_dict()
output['visual_abstractor_config'] = tmp
output['text_config'] = self.text_config.to_dict()
output['model_type'] = self.__class__.model_type
return output
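A brief composition sketch using the helper above; the LLaMA text config mirrors the default fallback in `__init__`, and the values are illustrative:

```python
from transformers.models.llama.configuration_llama import LlamaConfig

cfg = MplugOwlConfig.from_vision_abstractor_text_configs(
    vision_config=MplugOwlVisionConfig(),
    visual_abstractor_config=MplugOwlVisualAbstractorConfig(),
    text_config=LlamaConfig(pad_token_id=2),
    num_query_tokens=64,
)
serialized = cfg.to_dict()  # nested dicts for vision/abstractor/text plus model_type
```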

File diff suppressed because it is too large.

View File

@@ -212,10 +212,10 @@ class ConstructBlockStrategy:
block_spans, block_spans,
rng, rng,
task='bert'): task='bert'):
position_ids = np.arange(len(tokens), dtype=np.long) position_ids = np.arange(len(tokens), dtype=int)
targets = copy.deepcopy(tokens) targets = copy.deepcopy(tokens)
mask_id = self.tokenizer.get_command('MASK').Id mask_id = self.tokenizer.get_command('MASK').Id
mlm_masks = np.zeros(len(tokens), dtype=np.long) mlm_masks = np.zeros(len(tokens), dtype=int)
for start, end in block_spans: for start, end in block_spans:
for idx in range(start, end): for idx in range(start, end):
tokens[idx] = mask_id tokens[idx] = mask_id
@@ -231,7 +231,7 @@ class ConstructBlockStrategy:
rng, rng,
task='bert'): task='bert'):
text_length = len(tokens) text_length = len(tokens)
position_ids = np.ones(len(tokens), dtype=np.long) position_ids = np.ones(len(tokens), dtype=int)
for start, end in block_spans: for start, end in block_spans:
position_ids[start + 1:end] = 0 position_ids[start + 1:end] = 0
position_ids = np.cumsum(position_ids) - 1 position_ids = np.cumsum(position_ids) - 1
@@ -270,7 +270,7 @@ class ConstructBlockStrategy:
(end - start + 1)) (end - start + 1))
if self.block_position_encoding: if self.block_position_encoding:
target_block_position_ids.append( target_block_position_ids.append(
np.arange(1, end - start + 2, dtype=np.long)) np.arange(1, end - start + 2, dtype=int))
else: else:
target_block_position_ids.append([1] * (end - start + 1)) target_block_position_ids.append([1] * (end - start + 1))
block_spans.sort(key=lambda x: x[0]) block_spans.sort(key=lambda x: x[0])
@@ -307,7 +307,7 @@ class ConstructBlockStrategy:
target_tokens = target_tokens + [ target_tokens = target_tokens + [
self.tokenizer.get_command('eop').Id self.tokenizer.get_command('eop').Id
] ]
loss_masks = np.ones(len(target_tokens), dtype=np.long) loss_masks = np.ones(len(target_tokens), dtype=int)
return source_tokens, target_tokens, loss_masks return source_tokens, target_tokens, loss_masks
else: else:
tokens = np.concatenate(source_tokens + target_tokens) tokens = np.concatenate(source_tokens + target_tokens)
@@ -326,12 +326,12 @@ class ConstructBlockStrategy:
for pos in mask_pos: for pos in mask_pos:
tokens[pos] = self.tokenizer.get_command('dBLOCK').Id tokens[pos] = self.tokenizer.get_command('dBLOCK').Id
targets = np.concatenate(source_tokens + targets) targets = np.concatenate(source_tokens + targets)
loss_masks = np.ones(len(tokens), dtype=np.long) loss_masks = np.ones(len(tokens), dtype=int)
loss_masks[:source_length] = 0 loss_masks[:source_length] = 0
position_ids = np.concatenate(source_position_ids position_ids = np.concatenate(source_position_ids
+ target_position_ids) + target_position_ids)
block_position_ids = np.concatenate( block_position_ids = np.concatenate(
[np.zeros(source_length, dtype=np.long)] [np.zeros(source_length, dtype=int)]
+ target_block_position_ids) + target_block_position_ids)
position_ids = np.stack([position_ids, block_position_ids], axis=0) position_ids = np.stack([position_ids, block_position_ids], axis=0)
if attention_mask is not None: if attention_mask is not None:
@@ -539,22 +539,21 @@ class ConstructBlockStrategy:
(source_tokens, [self.generation_mask], target_tokens)) (source_tokens, [self.generation_mask], target_tokens))
loss_masks = np.concatenate( loss_masks = np.concatenate(
(np.zeros(len(source_tokens) + 1, (np.zeros(len(source_tokens) + 1,
dtype=np.long), target_masks)) dtype=int), target_masks))
token_batch.append(tokens) token_batch.append(tokens)
target_batch.append(targets) target_batch.append(targets)
loss_mask_batch.append(loss_masks) loss_mask_batch.append(loss_masks)
position_ids = np.arange( position_ids = np.arange(
len(source_tokens) + len(target_tokens) + 1, len(source_tokens) + len(target_tokens) + 1, dtype=int)
dtype=np.long)
position_ids[len(source_tokens) + 1:] = len(source_tokens) position_ids[len(source_tokens) + 1:] = len(source_tokens)
if self.block_position_encoding: if self.block_position_encoding:
block_position_ids = np.concatenate( block_position_ids = np.concatenate(
(np.zeros(len(source_tokens), dtype=np.long), (np.zeros(len(source_tokens), dtype=int),
np.arange(len(target_tokens) + 1, dtype=np.long))) np.arange(len(target_tokens) + 1, dtype=int)))
else: else:
block_position_ids = np.concatenate( block_position_ids = np.concatenate(
(np.zeros(len(source_tokens) + 1, dtype=np.long), (np.zeros(len(source_tokens) + 1, dtype=int),
np.ones(len(target_tokens) + 1, dtype=np.long))) np.ones(len(target_tokens) + 1, dtype=int)))
position_id_batch.append( position_id_batch.append(
np.stack([position_ids, block_position_ids], axis=0)) np.stack([position_ids, block_position_ids], axis=0))
else: else:
@@ -597,27 +596,25 @@ class ConstructBlockStrategy:
max_length = max(seq_lengths) max_length = max(seq_lengths)
token_batch = [ token_batch = [
np.concatenate( np.concatenate(
(tokens, np.zeros(max_length - len(tokens), (tokens, np.zeros(max_length - len(tokens), dtype=int)))
dtype=np.long)))
for tokens in token_batch for tokens in token_batch
] ]
target_batch = [ target_batch = [
np.concatenate( np.concatenate(
(targets, (targets, np.zeros(max_length - len(targets), dtype=int)))
np.zeros(max_length - len(targets), dtype=np.long)))
for targets in target_batch for targets in target_batch
] ]
loss_mask_batch = [ loss_mask_batch = [
np.concatenate( np.concatenate(
(loss_masks, (loss_masks,
np.zeros(max_length - len(loss_masks), dtype=np.long))) np.zeros(max_length - len(loss_masks), dtype=int)))
for loss_masks in loss_mask_batch for loss_masks in loss_mask_batch
] ]
position_id_batch = [ position_id_batch = [
np.concatenate((position_ids, np.concatenate(
np.zeros( (position_ids,
(2, max_length - position_ids.shape[1]), np.zeros(
dtype=np.long)), (2, max_length - position_ids.shape[1]), dtype=int)),
axis=1) for position_ids in position_id_batch axis=1) for position_ids in position_id_batch
] ]
return token_batch, target_batch, loss_mask_batch, position_id_batch return token_batch, target_batch, loss_mask_batch, position_id_batch

View File

@@ -583,8 +583,8 @@ class XLDataset(data.Dataset):
def getidx(self, idx): def getidx(self, idx):
tokens, targets, loss_masks = [], [], [] tokens, targets, loss_masks = [], [], []
attention_mask = np.concatenate( attention_mask = np.concatenate(
(np.zeros((self.max_seq_len, self.mem_len), dtype=np.long), (np.zeros((self.max_seq_len, self.mem_len), dtype=int),
np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)), np.ones((self.max_seq_len, self.max_seq_len), dtype=int)),
axis=1) axis=1)
sample_idx = bisect_right(self.indices, idx * self.max_seq_len) sample_idx = bisect_right(self.indices, idx * self.max_seq_len)
last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1] last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1]

View File

@@ -28,7 +28,7 @@ def main():
counts = np.array([0] * 10) counts = np.array([0] * 10)
for _ in range(10000): for _ in range(10000):
spans = strategy.sample_span_in_document( spans = strategy.sample_span_in_document(
np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1], np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=int), [1, 1],
random.Random()) random.Random())
for start, end in spans: for start, end in spans:
counts[start:end] += 1 counts[start:end] += 1

View File

@@ -17,7 +17,7 @@ def main():
num_iters=300000, num_iters=300000,
decay_style='cosine', decay_style='cosine',
decay_ratio=0.1) decay_ratio=0.1)
steps = np.arange(0, 400000, 10, dtype=np.long) steps = np.arange(0, 400000, 10, dtype=int)
rates = [] rates = []
for step in steps: for step in steps:
lr_scheduler.num_iters = step lr_scheduler.num_iters = step

View File

@@ -5,12 +5,12 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING: if TYPE_CHECKING:
from .configuration_unite import UniTEConfig from .configuration import UniTEConfig
from .modeling_unite import UniTEForTranslationEvaluation from .translation_evaluation import UniTEForTranslationEvaluation
else: else:
_import_structure = { _import_structure = {
'configuration_unite': ['UniTEConfig'], 'configuration': ['UniTEConfig'],
'modeling_unite': ['UniTEForTranslationEvaluation'], 'translation_evaluation': ['UniTEForTranslationEvaluation'],
} }
import sys import sys

View File

@@ -9,7 +9,7 @@ from modelscope.utils.config import Config
logger = logging.get_logger() logger = logging.get_logger()
class EvaluationMode(Enum): class InputFormat(Enum):
SRC = 'src' SRC = 'src'
REF = 'ref' REF = 'ref'
SRC_REF = 'src-ref' SRC_REF = 'src-ref'

Some files were not shown because too many files have changed in this diff.