Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-16 16:27:45 +01:00)

Commit: add 1.6
@@ -108,9 +108,9 @@ Audio:
 * [speech_charctc_kws_phone-xiaoyun](https://modelscope.cn/models/damo/speech_charctc_kws_phone-xiaoyun)
 * [u2pp_conformer-asr-cn-16k-online](https://modelscope.cn/models/wenet/u2pp_conformer-asr-cn-16k-online)
 * [speech_fsmn_vad_zh-cn-16k-common-pytorch](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
 * [punc_ct-transformer_zh-cn-common-vocab272727-pytorch](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)
 * [speech_frcrn_ans_cirm_16k](https://modelscope.cn/models/damo/speech_frcrn_ans_cirm_16k)
@@ -1,13 +1,12 @@
 import os
 from dataclasses import dataclass, field
 
+from modelscope import MsDataset, TrainingArgs
 from modelscope.metainfo import Trainers
-from modelscope.msdatasets.ms_dataset import MsDataset
 from modelscope.trainers.builder import build_trainer
-from modelscope.trainers.training_args import TrainingArgs
 
 
-@dataclass
+@dataclass(init=False)
 class ImageClassificationTrainingArgs(TrainingArgs):
     num_classes: int = field(
         default=None,
@@ -46,26 +45,35 @@ def create_dataset(name, split):
         dataset_name, namespace=namespace, subset_name='default', split=split)
 
 
-def train():
-    args = ImageClassificationTrainingArgs.from_cli(
-        model='damo/cv_vit-base_image-classification_ImageNet-labels',
-        max_epochs=1,
-        lr=1e-4,
-        optimizer='AdamW',
-        warmup_iters=1,
-        topk=(1, ))
-    if args.dataset_name is not None:
-        train_dataset = create_dataset(args.dataset_name, split='train')
-        val_dataset = create_dataset(args.dataset_name, split='validation')
-    else:
-        train_dataset = create_dataset(args.train_dataset_name, split='train')
-        val_dataset = create_dataset(args.val_dataset_name, split='validation')
+training_args = ImageClassificationTrainingArgs(
+    model='damo/cv_vit-base_image-classification_ImageNet-labels',
+    max_epochs=1,
+    lr=1e-4,
+    optimizer='AdamW',
+    warmup_iters=1,
+    topk=(1, )).parse_cli()
+config, args = training_args.to_config()
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    return cfg
+
+
+def train():
+    train_dataset = create_dataset(
+        training_args.train_dataset_name, split=training_args.train_split)
+    val_dataset = create_dataset(
+        training_args.val_dataset_name, split=training_args.val_split)
 
     kwargs = dict(
         model=args.model,  # model id
         train_dataset=train_dataset,  # training dataset
         eval_dataset=val_dataset,  # validation dataset
-        cfg_modify_fn=args  # callback to modify configuration
+        cfg_modify_fn=cfg_modify_fn  # callback to modify configuration
     )
 
     # in distributed training, specify pytorch launcher
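The same flow — instantiate a `TrainingArgs` subclass, call `parse_cli()`, split it into `config`/`args` with `to_config()`, and merge the CLI configuration back inside a `cfg_modify_fn` callback — recurs in every example touched by this commit. A minimal standalone sketch of that flow follows; the argument class, extra field and model id are illustrative placeholders, not values taken from this diff:

from dataclasses import dataclass, field

from modelscope import TrainingArgs
from modelscope.trainers import build_trainer


@dataclass(init=False)
class MyTrainingArgs(TrainingArgs):
    # hypothetical extra CLI field, for illustration only
    num_classes: int = field(
        default=None, metadata={'help': 'The number of classes'})


# parse command-line flags, then split into a Config plus plain args
training_args = MyTrainingArgs(model='namespace/some-model-id').parse_cli()
config, args = training_args.to_config()


def cfg_modify_fn(cfg):
    # --use_model_config decides whether CLI values are merged into the
    # model's own configuration or replace it outright
    if args.use_model_config:
        cfg.merge_from_dict(config)
    else:
        cfg = config
    return cfg


trainer = build_trainer(
    default_args=dict(model=args.model, cfg_modify_fn=cfg_modify_fn))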
@@ -2,4 +2,7 @@ PYTHONPATH=. python -m torch.distributed.launch --nproc_per_node=2 \
     examples/pytorch/image_classification/finetune_image_classification.py \
     --num_classes 2 \
     --train_dataset_name 'tany0699/cats_and_dogs' \
-    --val_dataset_name 'tany0699/cats_and_dogs'
+    --val_dataset_name 'tany0699/cats_and_dogs' \
+    --train_split train \
+    --val_split validation \
+    --use_model_config true \
@@ -1,15 +1,13 @@
 import os
 from dataclasses import dataclass, field
-from functools import partial
 
+from modelscope import MsDataset, TrainingArgs
 from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
 from modelscope.trainers import build_trainer
-from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
-                                                set_flatten_value)
+from modelscope.trainers.training_args import set_flatten_value
 
 
-@dataclass
+@dataclass(init=False)
 class MultiModalEmbeddingArguments(TrainingArgs):
 
     trainer: str = field(
@@ -17,6 +15,12 @@ class MultiModalEmbeddingArguments(TrainingArgs):
             'help': 'The trainer used',
         })
 
+    work_dir: str = field(
+        default='./tmp',
+        metadata={
+            'help': 'The working path for saving checkpoint',
+        })
+
     use_fp16: bool = field(
         default=None,
         metadata={
@@ -35,7 +39,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'train.optimizer_hparams',
-            'cfg_getter': partial(get_flatten_value, exclusions=['lr']),
             'cfg_setter': set_flatten_value,
             'help': 'The optimizer init params except `lr`',
         })
@@ -51,7 +54,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'dataset.column_map',
-            'cfg_getter': get_flatten_value,
             'cfg_setter': set_flatten_value,
             'help': 'The column map for dataset',
         })
@@ -67,7 +69,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'train.lr_scheduler_hook',
-            'cfg_getter': get_flatten_value,
             'cfg_setter': set_flatten_value,
             'help': 'The parameters for lr scheduler hook',
         })
@@ -76,7 +77,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
         default=None,
         metadata={
             'cfg_node': 'train.optimizer_hook',
-            'cfg_getter': get_flatten_value,
            'cfg_setter': set_flatten_value,
             'help': 'The parameters for optimizer hook',
         })
@@ -92,23 +92,28 @@ class MultiModalEmbeddingArguments(TrainingArgs):
             'help': 'The data parallel world size',
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        config.merge_from_dict({'pretrained_model.model_name': self.model})
-        if self.clip_clamp:
-            config.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
-        if self.world_size > 1:
-            config.train.launcher = 'pytorch'
-        return config
+
+config, args = MultiModalEmbeddingArguments().parse_cli().to_config()
+print(config, args)
 
 
-args = MultiModalEmbeddingArguments.from_cli(task='multi-modal-embedding')
-print(args)
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.merge_from_dict({'pretrained_model.model_name': args.model})
+    if args.clip_clamp:
+        cfg.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
+    if args.world_size > 1:
+        cfg.train.launcher = 'pytorch'
+    return cfg
+
 
 train_dataset = MsDataset.load(
-    args.dataset_name, namespace='modelscope', split='train')
+    args.train_dataset_name, namespace='modelscope', split='train')
 eval_dataset = MsDataset.load(
-    args.dataset_name, namespace='modelscope', split='validation')
+    args.train_dataset_name, namespace='modelscope', split='validation')
 
 os.makedirs(args.work_dir, exist_ok=True)
 kwargs = dict(
@@ -116,6 +121,6 @@ kwargs = dict(
     train_dataset=train_dataset,
     eval_dataset=eval_dataset,
     work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 trainer = build_trainer(name=args.trainer, default_args=kwargs)
 trainer.train()
@@ -6,14 +6,16 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
     --trainer 'clip-multi-modal-embedding' \
     --work_dir './workspace/ckpts/clip' \
     --model 'damo/multi-modal_clip-vit-base-patch16_zh' \
-    --dataset_name 'muge' \
+    --train_dataset_name 'muge' \
     --dataset_column_map 'img=image,text=query' \
     --max_epochs 1 \
     --use_fp16 true \
     --per_device_train_batch_size 180 \
+    --train_data_worker 0 \
     --train_shuffle true \
     --train_drop_last true \
     --per_device_eval_batch_size 128 \
+    --eval_data_worker 0 \
     --eval_shuffle true \
     --eval_drop_last true \
     --save_ckpt_best true \
@@ -33,3 +35,4 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
     --optimizer_hook 'type=TorchAMPOptimizerHook,cumulative_iters=1,loss_keys=loss' \
     --clip_clamp true \
     --world_size $DATA_PARALLEL_SIZE \
+    --use_model_config true \
@@ -4,30 +4,32 @@ from modelscope.msdatasets import MsDataset
 from modelscope.trainers import EpochBasedTrainer, build_trainer
 from modelscope.trainers.training_args import TrainingArgs
 
-
-@dataclass
-class StableDiffusionArguments(TrainingArgs):
-
-    def __call__(self, config):
-        config = super().__call__(config)
-        config.train.lr_scheduler.T_max = self.max_epochs
-        config.model.inference = False
-        return config
-
-
-args = StableDiffusionArguments.from_cli(task='efficient-diffusion-tuning')
+training_args = TrainingArgs(task='efficient-diffusion-tuning').parse_cli()
+config, args = training_args.to_config()
+
 print(args)
 
-dataset = MsDataset.load(args.dataset_name, namespace=args.namespace)
+dataset = MsDataset.load(
+    args.train_dataset_name, namespace=args.train_dataset_namespace)
 train_dataset = dataset['train']
 validation_dataset = dataset['validation']
 
 
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.train.lr_scheduler.T_max = training_args.max_epochs
+    cfg.model.inference = False
+    return cfg
+
+
 kwargs = dict(
-    model=args.model,
-    work_dir=args.work_dir,
+    model=training_args.model,
+    work_dir=training_args.work_dir,
     train_dataset=train_dataset,
     eval_dataset=validation_dataset,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
 trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
 trainer.train()
@@ -1,11 +1,12 @@
 PYTHONPATH=. torchrun examples/pytorch/stable_diffusion/finetune_stable_diffusion.py \
     --model 'damo/multi-modal_efficient-diffusion-tuning-lora' \
     --work_dir './tmp/stable_diffusion_tuning' \
-    --namespace 'damo' \
-    --dataset_name 'buptwq/lora-stable-diffusion-finetune-dog' \
-    --max_epochs 150 \
+    --train_dataset_namespace 'damo' \
+    --train_dataset_name 'controlnet_dataset_condition_fill50k' \
+    --max_epochs 1 \
     --save_ckpt_strategy 'by_epoch' \
     --logging_interval 100 \
     --train.dataloader.workers_per_gpu 0 \
     --evaluation.dataloader.workers_per_gpu 0 \
-    --train.optimizer.lr 1e-4
+    --train.optimizer.lr 1e-5 \
+    --use_model_config true
@@ -1,26 +1,18 @@
 import os
 from dataclasses import dataclass, field
 
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.training_args import TrainingArgs
+from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
+                        build_dataset_from_file)
+from modelscope.trainers import build_trainer
 
 
-def get_labels(cfg, metadata):
-    label2id = cfg.safe_get(metadata['cfg_node'])
-    if label2id is not None:
-        return ','.join(label2id.keys())
-
-
-def set_labels(cfg, labels, metadata):
+def set_labels(labels):
     if isinstance(labels, str):
         labels = labels.split(',')
-    cfg.merge_from_dict(
-        {metadata['cfg_node']: {label: id
-                                for id, label in enumerate(labels)}})
+    return {label: id for id, label in enumerate(labels)}
 
 
-@dataclass
+@dataclass(init=False)
 class TextClassificationArguments(TrainingArgs):
 
     first_sequence: str = field(
@@ -49,7 +41,6 @@ class TextClassificationArguments(TrainingArgs):
         metadata={
             'help': 'The labels of the dataset',
             'cfg_node': 'preprocessor.label2id',
-            'cfg_getter': get_labels,
             'cfg_setter': set_labels,
         })
 
@@ -60,30 +51,39 @@ class TextClassificationArguments(TrainingArgs):
             'cfg_node': 'preprocessor.type'
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        config.model['num_labels'] = len(self.labels)
-        if config.train.lr_scheduler.type == 'LinearLR':
-            config.train.lr_scheduler['total_iters'] = \
-                int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
-        return config
-
-
-args = TextClassificationArguments.from_cli(
-    task='text-classification', eval_metrics='seq-cls-metric')
-
-print(args)
+
+config, args = TextClassificationArguments().parse_cli().to_config()
+
+print(config, args)
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    cfg.model['num_labels'] = len(cfg.preprocessor.label2id)
+    if cfg.train.lr_scheduler.type == 'LinearLR':
+        cfg.train.lr_scheduler['total_iters'] = \
+            int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
+    return cfg
 
-dataset = MsDataset.load(args.dataset_name, subset_name=args.subset_name)
-train_dataset = dataset['train']
-validation_dataset = dataset['validation']
+
+if args.dataset_json_file is None:
+    dataset = MsDataset.load(
+        args.train_dataset_name, subset_name=args.train_subset_name)
+    train_dataset = dataset['train']
+    validation_dataset = dataset['validation']
+else:
+    train_dataset, validation_dataset = build_dataset_from_file(
+        args.dataset_json_file)
 
 kwargs = dict(
     model=args.model,
     train_dataset=train_dataset,
     eval_dataset=validation_dataset,
     seed=args.seed,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
 os.environ['LOCAL_RANK'] = str(args.local_rank)
 trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
@@ -1,12 +1,16 @@
 PYTHONPATH=. python examples/pytorch/text_classification/finetune_text_classification.py \
+    --task 'text-classification' \
     --model 'damo/nlp_structbert_backbone_base_std' \
-    --dataset_name 'clue' \
-    --subset_name 'tnews' \
+    --train_dataset_name 'clue' \
+    --train_subset_name 'tnews' \
     --first_sequence 'sentence' \
     --preprocessor.label label \
     --model.num_labels 15 \
     --labels '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14' \
     --preprocessor 'sen-cls-tokenizer' \
+    --use_model_config True \
+    --max_epochs 1 \
     --train.dataloader.workers_per_gpu 0 \
     --evaluation.dataloader.workers_per_gpu 0 \
     --train.optimizer.lr 1e-5 \
+    --eval_metrics 'seq-cls-metric' \
@@ -1,12 +1,11 @@
 from dataclasses import dataclass, field
 
+from modelscope import EpochBasedTrainer, MsDataset, TrainingArgs
 from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.training_args import TrainingArgs
+from modelscope.trainers import build_trainer
 
 
-@dataclass
+@dataclass(init=False)
 class TextGenerationArguments(TrainingArgs):
 
     trainer: str = field(
@@ -67,30 +66,35 @@ class TextGenerationArguments(TrainingArgs):
             'help': 'Whether to use MegatronHook',
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        if config.train.lr_scheduler.type == 'noam':
-            config.train.lr_scheduler = {
-                'type': 'LambdaLR',
-                'lr_lambda': noam_lambda,
-                'options': {
-                    'by_epoch': False
-                }
-            }
-        if self.use_megatron:
-            config.train.hooks.append({'type': 'MegatronHook'})
-        return config
-
 
 def noam_lambda(current_step: int):
     current_step += 1
     return min(current_step**(-0.5), current_step * 100**(-1.5))
 
 
-args = TextGenerationArguments.from_cli(task='text-generation')
-print(args)
+config, args = TextGenerationArguments().parse_cli().to_config()
+print(config, args)
 
-dataset = MsDataset.load(args.dataset_name)
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    if cfg.train.lr_scheduler.type == 'noam':
+        cfg.train.lr_scheduler = {
+            'type': 'LambdaLR',
+            'lr_lambda': noam_lambda,
+            'options': {
+                'by_epoch': False
+            }
+        }
+    if args.use_megatron:
+        cfg.train.hooks.append({'type': 'MegatronHook'})
+    return cfg
+
+
+dataset = MsDataset.load(args.train_dataset_name)
 train_dataset = dataset['train']
 eval_dataset = dataset['validation' if 'validation' in dataset else 'test']
 
@@ -100,7 +104,7 @@ kwargs = dict(
     eval_dataset=eval_dataset,
     seed=args.seed,
     work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
 trainer: EpochBasedTrainer = build_trainer(
     name=args.trainer, default_args=kwargs)
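As defined above, `noam_lambda` rescales the learning rate by `min(step**-0.5, step * 100**-1.5)`; the two terms are equal at step 100, so the schedule grows linearly for the first 100 steps and then decays as the inverse square root. A quick check using nothing beyond the function shown above:

def noam_lambda(current_step: int):
    current_step += 1
    return min(current_step**(-0.5), current_step * 100**(-1.5))

# warmup region: the linear term is the smaller one
assert noam_lambda(49) == 50 * 100**(-1.5)
# decay region: after step 100 the inverse-square-root term takes over
assert noam_lambda(399) == 400**(-0.5)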
@@ -8,7 +8,7 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
     --trainer 'nlp-gpt3-trainer' \
     --work_dir './tmp' \
     --model 'damo/nlp_gpt3_text-generation_1.3B' \
-    --dataset_name 'chinese-poetry-collection' \
+    --train_dataset_name 'chinese-poetry-collection' \
     --preprocessor 'text-gen-jieba-tokenizer' \
     --src_txt 'text1' \
     --tgt_txt 'text2' \
@@ -20,4 +20,5 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
     --world_size $WORLD_SIZE \
     --tensor_model_parallel_size $TENSOR_MODEL_PARALLEL_SIZE \
     --use_megatron true \
-    # --dataset_name 'DuReader_robust-QG' \ # input&output
+    --use_model_config true \
+    # --train_dataset_name 'DuReader_robust-QG' \ # input&output
examples/pytorch/text_generation/run_train_mt5.sh (new file, 13 lines)
@@ -0,0 +1,13 @@
+PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.py \
+    --trainer 'text-generation-trainer' \
+    --work_dir './tmp' \
+    --task 'text2text-generation' \
+    --model 'damo/nlp_mt5_zero-shot-augment_chinese-base' \
+    --train_dataset_name 'DuReader_robust-QG' \
+    --src_txt 'text1' \
+    --tgt_txt 'text2' \
+    --max_epochs 1 \
+    --use_model_config True \
+    --per_device_train_batch_size 8 \
+    --lr 1e-3 \
+    --lr_scheduler 'noam' \
@@ -2,10 +2,11 @@ PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.
     --trainer 'text-generation-trainer' \
     --work_dir './tmp' \
     --model 'damo/nlp_palm2.0_pretrained_chinese-base' \
-    --dataset_name 'DuReader_robust-QG' \
+    --train_dataset_name 'DuReader_robust-QG' \
     --src_txt 'text1' \
     --tgt_txt 'text2' \
-    --max_epochs 15 \
+    --max_epochs 1 \
+    --use_model_config True \
     --per_device_train_batch_size 8 \
     --lr 1e-3 \
     --lr_scheduler 'noam' \
@@ -1,20 +1,22 @@
 from dataclasses import dataclass, field
 
-from modelscope.metainfo import Trainers
-from modelscope.msdatasets import MsDataset
-from modelscope.trainers import build_trainer
-from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
-                                                set_flatten_value)
+from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
+                        build_dataset_from_file)
 
 
-@dataclass
+@dataclass(init=False)
 class TokenClassificationArguments(TrainingArgs):
 
     trainer: str = field(
-        default=Trainers.default, metadata={
+        default=None, metadata={
             'help': 'The trainer used',
         })
 
+    work_dir: str = field(
+        default='./tmp',
+        metadata={
+            'help': 'The working path for saving checkpoint',
+        })
+
     preprocessor: str = field(
         default=None,
         metadata={
@@ -29,60 +31,99 @@ class TokenClassificationArguments(TrainingArgs):
             'cfg_node': 'preprocessor.padding'
         })
 
-    train_dataset_params: str = field(
+    mode: str = field(
+        default='inference',
+        metadata={
+            'help': 'The preprocessor padding',
+            'cfg_node': 'preprocessor.mode'
+        })
+
+    first_sequence: str = field(
         default=None,
         metadata={
-            'cfg_node': 'dataset.train',
-            'cfg_getter': get_flatten_value,
-            'cfg_setter': set_flatten_value,
+            'cfg_node': 'preprocessor.first_sequence',
             'help': 'The parameters for train dataset',
         })
 
-    def __call__(self, config):
-        config = super().__call__(config)
-        if config.safe_get('dataset.train.label') == 'ner_tags':
-            ner_tags_labels = train_dataset['ner_tags'] + eval_dataset[
-                'ner_tags']
-            label_enumerate_values = self._get_label_list(ner_tags_labels)
-            config.merge_from_dict(
-                {'dataset.train.labels': label_enumerate_values})
-        if config.train.lr_scheduler.type == 'LinearLR':
-            config.train.lr_scheduler['total_iters'] = \
-                int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
-        return config
+    label: str = field(
+        default=None,
+        metadata={
+            'cfg_node': 'preprocessor.label',
+            'help': 'The parameters for train dataset',
+        })
 
-    # TODO: Future performance optimization in MsDataset
-    @staticmethod
-    def _get_label_list(labels):
-        unique_labels = set()
-        for label in labels:
-            unique_labels = unique_labels | set(label)
-        label_list = list(unique_labels)
-        label_list.sort()
-        return label_list
+    sequence_length: int = field(
+        default=128,
+        metadata={
+            'cfg_node': 'preprocessor.sequence_length',
+            'help': 'The parameters for train dataset',
+        })
 
 
-args = TokenClassificationArguments.from_cli(task='token-classification')
+training_args = TokenClassificationArguments().parse_cli()
+config, args = training_args.to_config()
 print(args)
 
-# load dataset
-train_dataset = MsDataset.load(
-    args.dataset_name,
-    subset_name=args.subset_name,
-    split='train',
-    namespace='damo')['train']
-eval_dataset = MsDataset.load(
-    args.dataset_name,
-    subset_name=args.subset_name,
-    split='validation',
-    namespace='damo')['validation']
+
+def get_label_list(labels):
+    unique_labels = set()
+    for label in labels:
+        unique_labels = unique_labels | set(label)
+    label_list = list(unique_labels)
+    label_list.sort()
+    return label_list
+
+
+def cfg_modify_fn(cfg):
+    if args.use_model_config:
+        cfg.merge_from_dict(config)
+    else:
+        cfg = config
+    labels = train_dataset[training_args.label] + validation_dataset[
+        training_args.label]
+    label_enumerate_values = get_label_list(labels)
+    cfg.merge_from_dict({
+        'preprocessor.label2id':
+        {label: id
+         for id, label in enumerate(label_enumerate_values)}
+    })
+    cfg.merge_from_dict({'model.num_labels': len(label_enumerate_values)})
+    cfg.merge_from_dict({'preprocessor.use_fast': True})
+    cfg.merge_from_dict({
+        'evaluation.metrics': {
+            'type': 'token-cls-metric',
+            'label2id':
+            {label: id
+             for id, label in enumerate(label_enumerate_values)}
+        }
+    })
+    if cfg.train.lr_scheduler.type == 'LinearLR':
+        cfg.train.lr_scheduler['total_iters'] = \
+            int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
+    return cfg
+
+
+if args.dataset_json_file is None:
+    train_dataset = MsDataset.load(
+        args.train_dataset_name,
+        subset_name=args.train_subset_name,
+        split='train',
+        namespace=args.train_dataset_namespace)['train']
+    validation_dataset = MsDataset.load(
+        args.train_dataset_name,
+        subset_name=args.train_subset_name,
+        split='validation',
+        namespace=args.train_dataset_namespace)['validation']
+else:
+    train_dataset, validation_dataset = build_dataset_from_file(
+        args.dataset_json_file)
 
 kwargs = dict(
     model=args.model,
     train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
+    eval_dataset=validation_dataset,
     work_dir=args.work_dir,
-    cfg_modify_fn=args)
+    cfg_modify_fn=cfg_modify_fn)
 
-trainer = build_trainer(name=args.trainer, default_args=kwargs)
+trainer = EpochBasedTrainer(**kwargs)
 trainer.train()
@@ -1,15 +1,22 @@
-PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
+PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
+    --task 'token-classification' \
     --trainer 'nlp-base-trainer' \
     --work_dir './tmp' \
     --model 'damo/mgeo_backbone_chinese_base' \
-    --dataset_name 'GeoGLUE' \
-    --subset_name 'GeoETA' \
-    --train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
+    --train_dataset_name 'GeoGLUE' \
+    --train_subset_name 'GeoETA' \
+    --train_dataset_namespace 'damo' \
+    --first_sequence 'tokens' \
+    --eval_strategy by_step \
+    --eval_interval 10 \
+    --label 'ner_tags' \
+    --sequence_length 128 \
     --preprocessor 'token-cls-tokenizer' \
     --preprocessor_padding 'max_length' \
     --max_epochs 1 \
+    --mode 'inference' \
+    --use_model_config True \
     --per_device_train_batch_size 32 \
+    --train_data_worker 0 \
+    --eval_data_worker 0 \
     --lr 3e-5 \
-    --save_ckpt_strategy 'by_epoch' \
-    --logging_interval 100 \
-    --eval_strategy 'by_epoch' \
@@ -1,16 +1,22 @@
-PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
+PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
+    --task 'token-classification' \
     --trainer 'nlp-base-trainer' \
    --work_dir './tmp' \
     --model 'damo/nlp_structbert_backbone_base_std' \
-    --dataset_name 'GeoGLUE' \
-    --subset_name 'GeoETA' \
-    --train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
+    --train_dataset_name 'GeoGLUE' \
+    --train_subset_name 'GeoETA' \
+    --train_dataset_namespace 'damo' \
+    --first_sequence 'tokens' \
+    --eval_strategy by_step \
+    --eval_interval 20 \
+    --label 'ner_tags' \
+    --sequence_length 128 \
     --preprocessor 'token-cls-tokenizer' \
     --preprocessor_padding 'max_length' \
     --max_epochs 2 \
+    --mode 'inference' \
+    --use_model_config True \
     --per_device_train_batch_size 32 \
+    --train_data_worker 0 \
+    --eval_data_worker 0 \
     --lr 3e-5 \
-    --save_ckpt_strategy 'by_epoch' \
-    --logging_interval 1 \
-    --eval_strategy 'by_step' \
-    --eval_interval 20 \
@@ -1 +0,0 @@
-{"framework":"pytorch","train":{"work_dir":"/tmp","max_epochs":10,"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0},"optimizer":{"type":"SGD","lr":0.001},"lr_scheduler":{"type":"StepLR","step_size":2},"hooks":[{"type":"CheckpointHook","interval":1}]},"evaluation":{"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0,"shuffle":false}}}
@@ -5,11 +5,11 @@ from datasets import load_dataset
 from transformers import (BertForSequenceClassification, BertTokenizerFast,
                           default_data_collator)
 
+from modelscope import TrainingArgs
 from modelscope.trainers import EpochBasedTrainer, build_trainer
-from modelscope.trainers.default_config import DEFAULT_CONFIG, TrainingArgs
 
 
-@dataclass
+@dataclass(init=False)
 class TransformersArguments(TrainingArgs):
 
     num_labels: int = field(
@@ -17,13 +17,27 @@ class TransformersArguments(TrainingArgs):
             'help': 'The number of labels',
         })
 
+    sentence: str = field(
+        default=None, metadata={
+            'help': 'The sentence key',
+        })
 
-args = TransformersArguments.from_cli(
-    task='text-classification', eval_metrics='seq-cls-metric')
+    label: str = field(
+        default=None, metadata={
+            'help': 'The label key',
+        })
 
-print(args)
 
-dataset = load_dataset(args.dataset_name, args.subset_name)
+training_args = TransformersArguments(
+    task='text-classification', eval_metrics='seq-cls-metric').parse_cli()
+config, args = training_args.to_config()
+
+print(config, args)
+
+train_dataset = load_dataset(
+    args.train_dataset_name, args.train_subset_name, split=args.train_split)
+val_dataset = load_dataset(
+    args.val_dataset_name, args.val_subset_name, split=args.val_split)
 
 model = BertForSequenceClassification.from_pretrained(
     args.model, num_labels=args.num_labels)
@@ -31,26 +45,30 @@ tokenizer = BertTokenizerFast.from_pretrained(args.model)
 
 
 def tokenize_sentence(row):
-    return tokenizer(row['sentence'], padding='max_length', max_length=128)
+    return tokenizer(
+        row[training_args.sentence], padding='max_length', max_length=128)
 
 
 # Extra columns, Rename columns
-dataset = dataset.map(tokenize_sentence).remove_columns(['sentence',
-                                                         'idx']).rename_column(
-                                                             'label', 'labels')
+train_dataset = train_dataset.map(tokenize_sentence)
+val_dataset = val_dataset.map(tokenize_sentence)
+if training_args.label != 'labels':
+    train_dataset = train_dataset.rename_columns(
+        {training_args.label: 'labels'})
+    val_dataset = val_dataset.rename_columns({training_args.label: 'labels'})
 
 cfg_file = os.path.join(args.work_dir or './', 'configuration.json')
-DEFAULT_CONFIG.dump(cfg_file)
+config.dump(cfg_file)
 
 kwargs = dict(
     model=model,
     cfg_file=cfg_file,
     # data_collator
     data_collator=default_data_collator,
-    train_dataset=dataset['train'],
-    eval_dataset=dataset['validation'],
-    seed=args.seed,
-    cfg_modify_fn=args)
+    train_dataset=train_dataset,
+    eval_dataset=val_dataset,
+    remove_unused_data=True,
+    seed=args.seed)
 
 os.environ['LOCAL_RANK'] = str(args.local_rank)
 trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
@@ -1,5 +1,14 @@
 PYTHONPATH=. python examples/pytorch/transformers/finetune_transformers_model.py \
     --model bert-base-uncased \
     --num_labels 15 \
-    --dataset_name clue \
-    --subset_name tnews
+    --train_dataset_name clue \
+    --train_subset_name tnews \
+    --train_split train \
+    --val_dataset_name clue \
+    --val_subset_name tnews \
+    --train_split train \
+    --val_split validation \
+    --sentence sentence \
+    --label label \
+    --eval_strategy by_step \
+    --eval_interval 100
@@ -1,4 +1,79 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from .version import __release_datetime__, __version__
+from typing import TYPE_CHECKING
 
-__all__ = ['__version__', '__release_datetime__']
+from modelscope.utils.import_utils import LazyImportModule
+
+if TYPE_CHECKING:
+    from .version import __release_datetime__, __version__
+    from .trainers import EpochBasedTrainer, TrainingArgs, build_dataset_from_file
+    from .trainers import Hook, Priority
+    from .exporters import Exporter
+    from .exporters import TfModelExporter
+    from .exporters import TorchModelExporter
+    from .hub.api import HubApi
+    from .hub.snapshot_download import snapshot_download
+    from .hub.push_to_hub import push_to_hub, push_to_hub_async
+    from .hub.check_model import check_model_is_id, check_local_model_is_latest
+    from .metrics import AudioNoiseMetric, Metric, task_default_metrics, ImageColorEnhanceMetric, ImageDenoiseMetric, \
+        ImageInstanceSegmentationCOCOMetric, ImagePortraitEnhancementMetric, SequenceClassificationMetric, \
+        TextGenerationMetric, TokenClassificationMetric, VideoSummarizationMetric, MovieSceneSegmentationMetric, \
+        AccuracyMetric, BleuMetric, ImageInpaintingMetric, ReferringVideoObjectSegmentationMetric, \
+        VideoFrameInterpolationMetric, VideoStabilizationMetric, VideoSuperResolutionMetric, PplMetric, \
+        ImageQualityAssessmentDegradationMetric, ImageQualityAssessmentMosMetric, TextRankingMetric, \
+        LossMetric, ImageColorizationMetric, OCRRecognitionMetric
+    from .models import Model, TorchModel
+    from .preprocessors import Preprocessor
+    from .pipelines import Pipeline, pipeline
+    from .utils.hub import read_config, create_model_if_not_exist
+    from .utils.logger import get_logger
+    from .msdatasets import MsDataset
+
+else:
+    _import_structure = {
+        'version': ['__release_datetime__', '__version__'],
+        'trainers': [
+            'EpochBasedTrainer', 'TrainingArgs', 'Hook', 'Priority',
+            'build_dataset_from_file'
+        ],
+        'exporters': [
+            'Exporter',
+            'TfModelExporter',
+            'TorchModelExporter',
+        ],
+        'hub.api': ['HubApi'],
+        'hub.snapshot_download': ['snapshot_download'],
+        'hub.push_to_hub': ['push_to_hub', 'push_to_hub_async'],
+        'hub.check_model':
+        ['check_model_is_id', 'check_local_model_is_latest'],
+        'metrics': [
+            'AudioNoiseMetric', 'Metric', 'task_default_metrics',
+            'ImageColorEnhanceMetric', 'ImageDenoiseMetric',
+            'ImageInstanceSegmentationCOCOMetric',
+            'ImagePortraitEnhancementMetric', 'SequenceClassificationMetric',
+            'TextGenerationMetric', 'TokenClassificationMetric',
+            'VideoSummarizationMetric', 'MovieSceneSegmentationMetric',
+            'AccuracyMetric', 'BleuMetric', 'ImageInpaintingMetric',
+            'ReferringVideoObjectSegmentationMetric',
+            'VideoFrameInterpolationMetric', 'VideoStabilizationMetric',
+            'VideoSuperResolutionMetric', 'PplMetric',
+            'ImageQualityAssessmentDegradationMetric',
+            'ImageQualityAssessmentMosMetric', 'TextRankingMetric',
+            'LossMetric', 'ImageColorizationMetric', 'OCRRecognitionMetric'
+        ],
+        'models': ['Model', 'TorchModel'],
+        'preprocessors': ['Preprocessor'],
+        'pipelines': ['Pipeline', 'pipeline'],
+        'utils.hub': ['read_config', 'create_model_if_not_exist'],
+        'utils.logger': ['get_logger'],
+        'msdatasets': ['MsDataset']
+    }
+
+    import sys
+
+    sys.modules[__name__] = LazyImportModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
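Under the lazy scheme above, the top-level names listed in `_import_structure` are resolved on first attribute access rather than eagerly at `import modelscope` time. A small illustration, assuming nothing beyond the mapping shown in the diff:

import modelscope

# Nothing heavy has been imported yet; attribute access triggers the real import.
print(modelscope.__version__)

# These names are also resolved lazily through LazyImportModule.
from modelscope import MsDataset, pipeline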
@@ -122,10 +122,11 @@ class ${pipeline_name}(Pipeline):
 # Tips: usr_config_path is the temporary save configuration location, after upload modelscope hub, it is the model_id
 usr_config_path = '${configuration_path}'
 config = Config({
-    'framework': 'pytorch',
-    'task': '${task_name}',
-    'model': {'type': 'my-custom-model'},
-    "pipeline": {"type": "my-custom-pipeline"}
+    "framework": 'pytorch',
+    "task": '${task_name}',
+    "model": {'type': 'my-custom-model'},
+    "pipeline": {"type": "my-custom-pipeline"},
+    "allow_remote": True
 })
 config.dump('${configuration_path}' + 'configuration.json')
 
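For reference, a minimal check that the configuration dumped by the template round-trips; the path is a placeholder standing in for `'${configuration_path}' + 'configuration.json'`:

from modelscope.utils.config import Config

cfg = Config.from_file('./configuration.json')  # path written by config.dump above
assert cfg.pipeline.type == 'my-custom-pipeline'
assert cfg.allow_remote is True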
@@ -1,14 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 from typing import TYPE_CHECKING
 
 from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
-    from .human_wholebody_keypoint import HumanWholeBodyKeypoint
+    from .ans_dfsmn_exporter import ANSDFSMNExporter
 
 else:
     _import_structure = {
-        'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
+        'ans_dfsmn_exporter': ['ANSDFSMNExporter'],
     }
 
     import sys
modelscope/exporters/audio/ans_dfsmn_exporter.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+
+import torch
+
+from modelscope.exporters.builder import EXPORTERS
+from modelscope.exporters.torch_model_exporter import TorchModelExporter
+from modelscope.metainfo import Models
+from modelscope.utils.constant import ModelFile, Tasks
+
+INPUT_NAME = 'input'
+OUTPUT_NAME = 'output'
+
+
+@EXPORTERS.register_module(
+    Tasks.acoustic_noise_suppression, module_name=Models.speech_dfsmn_ans)
+class ANSDFSMNExporter(TorchModelExporter):
+
+    def export_onnx(self, output_dir: str, opset=9, **kwargs):
+        """Export the model as onnx format files.
+
+        Args:
+            output_dir: The output dir.
+            opset: The version of the ONNX operator set to use.
+            kwargs:
+                device: The device used to forward.
+        Returns:
+            A dict containing the model key - model file path pairs.
+        """
+        model = self.model if 'model' not in kwargs else kwargs.pop('model')
+        device_name = 'cpu' if 'device' not in kwargs else kwargs.pop('device')
+        model_bin_file = os.path.join(model.model_dir,
+                                      ModelFile.TORCH_MODEL_BIN_FILE)
+        if os.path.exists(model_bin_file):
+            checkpoint = torch.load(model_bin_file, map_location='cpu')
+            model.load_state_dict(checkpoint)
+        onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE)
+
+        with torch.no_grad():
+            model.eval()
+            device = torch.device(device_name)
+            model.to(device)
+            model_script = torch.jit.script(model)
+            fbank_input = torch.zeros((1, 3, 120), dtype=torch.float32)
+            torch.onnx.export(
+                model_script,
+                fbank_input,
+                onnx_file,
+                opset_version=opset,
+                input_names=[INPUT_NAME],
+                output_names=[OUTPUT_NAME],
+                dynamic_axes={
+                    INPUT_NAME: {
+                        0: 'batch_size',
+                        1: 'number_of_frame'
+                    },
+                    OUTPUT_NAME: {
+                        0: 'batch_size',
+                        1: 'number_of_frame'
+                    }
+                })
+        return {'model': onnx_file}
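A hedged usage sketch for the new exporter. The model id is illustrative, and the exporter is assumed to accept a loaded model at construction time the way the base `TorchModelExporter` does; only `export_onnx(output_dir, opset)` comes directly from the code above.

import os

from modelscope.models import Model
from modelscope.exporters.audio import ANSDFSMNExporter

# Illustrative DFSMN ANS checkpoint; any model directory containing a
# pytorch_model.bin should follow the same path.
model = Model.from_pretrained('damo/speech_dfsmn_ans_psm_48k_causal')

os.makedirs('./onnx_out', exist_ok=True)
exporter = ANSDFSMNExporter(model=model)  # construction signature assumed from the base exporter
output = exporter.export_onnx(output_dir='./onnx_out', opset=9)
print(output)  # a dict mapping 'model' to the written ONNX file path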
@@ -6,6 +6,7 @@ import functools
 import os
 import pickle
 import platform
+import re
 import shutil
 import tempfile
 import uuid
@@ -15,10 +16,10 @@ from http.cookiejar import CookieJar
 from os.path import expanduser
 from typing import Dict, List, Optional, Tuple, Union
 
+import requests
 from requests import Session
 from requests.adapters import HTTPAdapter, Retry
 
-from modelscope import __version__
 from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT,
                                       API_RESPONSE_FIELD_DATA,
                                       API_RESPONSE_FIELD_EMAIL,
@@ -45,7 +46,7 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                        MASTER_MODEL_BRANCH, DatasetFormations,
                                        DatasetMetaFormats,
                                        DatasetVisibilityMap, DownloadChannel,
-                                       ModelFile)
+                                       ModelFile, VirgoDatasetConfig)
 from modelscope.utils.logger import get_logger
 from .utils.utils import (get_endpoint, get_release_datetime,
                           model_id_to_group_owner_name)
@@ -160,6 +161,7 @@ class HubApi:
             'Visibility': visibility,  # server check
             'License': license,
             'OriginalModelId': original_model_id,
+            'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', ''),
         }
         r = self.session.post(
             path, json=body, cookies=cookies, headers=self.headers)
@@ -236,8 +238,10 @@ class HubApi:
                    license: Optional[str] = Licenses.APACHE_V2,
                    chinese_name: Optional[str] = None,
                    commit_message: Optional[str] = 'upload model',
+                   tag: Optional[str] = None,
                    revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
-                   original_model_id: Optional[str] = None):
+                   original_model_id: Optional[str] = None,
+                   ignore_file_pattern: Optional[Union[List[str], str]] = None):
         """Upload model from a given directory to given repository. A valid model directory
         must contain a configuration.json file.
 
@@ -268,10 +272,13 @@ class HubApi:
                 chinese name of the new created model.
             commit_message(`str`, *optional*, defaults to `None`):
                 commit message of the push request.
+            tag(`str`, *optional*, defaults to `None`):
+                The tag on this commit
             revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION):
                 which branch to push. If the branch is not exists, It will create a new
                 branch and push to it.
             original_model_id (str, optional): The base model id which this model is trained from
+            ignore_file_pattern (`Union[List[str], str]`, optional): The file pattern to ignore uploading
 
         Raises:
             InvalidParameter: Parameter invalid.
@@ -292,6 +299,10 @@ class HubApi:
         if cookies is None:
             raise NotLoginException('Must login before upload!')
         files_to_save = os.listdir(model_dir)
+        if ignore_file_pattern is None:
+            ignore_file_pattern = []
+        if isinstance(ignore_file_pattern, str):
+            ignore_file_pattern = [ignore_file_pattern]
         try:
             self.get_model(model_id=model_id)
         except Exception:
@@ -325,6 +336,8 @@ class HubApi:
                 shutil.rmtree(src, ignore_errors=True)
             for f in files_to_save:
                 if f[0] != '.':
+                    if any([re.search(pattern, f) is not None for pattern in ignore_file_pattern]):
+                        continue
                     src = os.path.join(model_dir, f)
                     if os.path.isdir(src):
                         shutil.copytree(src, os.path.join(tmp_dir, f))
@@ -338,6 +351,8 @@ class HubApi:
                 commit_message=commit_message,
                 local_branch=revision,
                 remote_branch=revision)
+            if tag is not None:
+                repo.tag_and_push(tag, tag)
         except Exception:
             raise
         finally:
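A hedged sketch of how the two new `push_model` parameters documented above might be used together; the access token, model id and directory are placeholders:

from modelscope.hub.api import HubApi

api = HubApi()
api.login('<access-token>')  # placeholder token

api.push_model(
    model_id='my-namespace/my-finetuned-model',  # placeholder id
    model_dir='./work_dir/output',               # must contain configuration.json
    commit_message='upload model',
    tag='v1.6.0',                                # pushed via repo.tag_and_push after the commit
    ignore_file_pattern=[r'.*\.log$', r'checkpoints/'])  # regex patterns skipped during upload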
@@ -581,6 +596,17 @@ class HubApi:
             file_list = file_list['Files']
         return file_list

+    @staticmethod
+    def dump_datatype_file(dataset_type: int, meta_cache_dir: str):
+        """
+        Dump the data_type as a local file, in order to get the dataset formation without calling the datahub.
+        More details, please refer to the class `modelscope.utils.constant.DatasetFormations`.
+        """
+        dataset_type_file_path = os.path.join(meta_cache_dir,
+            f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
+        with open(dataset_type_file_path, 'w') as fp:
+            fp.write('*** Automatically-generated file, do not modify ***')
+
     def get_dataset_meta_files_local_paths(self, dataset_name: str,
                                            namespace: str,
                                            revision: str,
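A small sketch of calling the new static helper on its own, assuming a writable cache directory; the concrete marker file name depends on DatasetFormations.formation_mark_ext.value, and the dataset_type value below is only illustrative.

    import os
    from modelscope.hub.api import HubApi

    meta_cache_dir = '/tmp/ms_meta_cache'   # placeholder cache path
    os.makedirs(meta_cache_dir, exist_ok=True)
    # writes a marker file named '<dataset_type><formation_mark_ext>' into the cache dir
    HubApi.dump_datatype_file(dataset_type=1, meta_cache_dir=meta_cache_dir)
    print(os.listdir(meta_cache_dir))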
@@ -591,10 +617,7 @@ class HubApi:
         cookies = ModelScopeConfig.get_cookies()

         # Dump the data_type as a local file
-        dataset_type_file_path = os.path.join(meta_cache_dir,
-            f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
-        with open(dataset_type_file_path, 'w') as fp:
-            fp.write('*** Automatically-generated file, do not modify ***')
+        HubApi.dump_datatype_file(dataset_type=dataset_type, meta_cache_dir=meta_cache_dir)

         for file_info in file_list:
             file_path = file_info['Path']
@@ -661,7 +684,6 @@ class HubApi:
             cookies = self._check_cookie(use_cookies=True)
         else:
             cookies = ModelScopeConfig.get_cookies()
-        r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers)

         r = self.session.get(
             url=datahub_url, cookies=cookies, headers=self.headers)
@@ -669,6 +691,31 @@ class HubApi:
         raise_on_error(resp)
         return resp['Data']

+    def get_virgo_meta(self, dataset_id: str, version: int = 1) -> dict:
+        """
+        Get virgo dataset meta info.
+        """
+        virgo_endpoint = os.environ.get(VirgoDatasetConfig.env_virgo_endpoint, '')
+        if not virgo_endpoint:
+            raise RuntimeError(f'Virgo endpoint is not set in env: {VirgoDatasetConfig.env_virgo_endpoint}')
+
+        virgo_dataset_url = f'{virgo_endpoint}/data/set/download'
+        cookies = requests.utils.dict_from_cookiejar(ModelScopeConfig.get_cookies())
+
+        dataset_info = dict(
+            dataSetId=dataset_id,
+            dataSetVersion=version
+        )
+        data = dict(
+            data=dataset_info,
+        )
+        r = self.session.post(url=virgo_dataset_url, json=data, cookies=cookies, headers=self.headers, timeout=900)
+        resp = r.json()
+        if resp['code'] != 0:
+            raise RuntimeError(f'Failed to get virgo dataset: {resp}')
+
+        return resp['data']
+
     def get_dataset_access_config_for_unzipped(self,
                                                dataset_name: str,
                                                namespace: str,
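A hedged usage sketch for the new Virgo accessor; the endpoint environment variable is whatever VirgoDatasetConfig.env_virgo_endpoint names (the literal string below is a placeholder), and the dataset id and endpoint URL are illustrative only.

    import os
    from modelscope.hub.api import HubApi

    # placeholder variable name: look up VirgoDatasetConfig.env_virgo_endpoint for the real one
    os.environ['VIRGO_DATASET_ENDPOINT'] = 'https://virgo.example.com'
    api = HubApi()
    meta = api.get_virgo_meta(dataset_id='12345', version=1)  # placeholder dataset id
    print(meta)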
@@ -895,6 +942,7 @@ class ModelScopeConfig:
         if MODELSCOPE_CLOUD_USERNAME in os.environ:
             user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]

+        from modelscope import __version__
         ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
             __version__,
             platform.python_version(),
@@ -2,6 +2,7 @@

 from http import HTTPStatus

+import requests
 from requests.exceptions import HTTPError

 from modelscope.utils.logger import get_logger
@@ -57,13 +58,22 @@ def is_ok(rsp):
     return rsp['Code'] == HTTPStatus.OK and rsp['Success']


+def _decode_response_error(response: requests.Response):
+    if 'application/json' in response.headers.get('content-type', ''):
+        message = response.json()
+    else:
+        message = response.content.decode('utf-8')
+    return message
+
+
 def handle_http_post_error(response, url, request_body):
     try:
         response.raise_for_status()
     except HTTPError as error:
         logger.error('Request %s with body: %s exception' %
                      (url, request_body))
-        logger.error('Response details: %s' % response.content)
+        message = _decode_response_error(response)
+        logger.error('Response details: %s' % message)
         raise error


@@ -75,7 +85,8 @@ def handle_http_response(response, logger, cookies, model_id):
             logger.error(
                 f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
                 private. Please login first.')
-            logger.error('Response details: %s' % response.content)
+            message = _decode_response_error(response)
+            logger.error('Response details: %s' % message)
             raise error

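A minimal sketch of what the new error-decoding helper does with the two content types, assuming it lives in modelscope.hub.errors as the hunk above suggests; the response object is hand-built purely for illustration.

    import json
    import requests
    from modelscope.hub.errors import _decode_response_error

    resp = requests.Response()
    resp.headers['content-type'] = 'application/json'
    resp._content = json.dumps({'Code': 10010101, 'Message': 'token invalid'}).encode('utf-8')

    # returns the parsed dict for JSON bodies, and a plain utf-8 string otherwise
    print(_decode_response_error(resp))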
@@ -12,7 +12,6 @@ import requests
 from requests.adapters import Retry
 from tqdm import tqdm

-from modelscope import __version__
 from modelscope.hub.api import HubApi, ModelScopeConfig
 from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE,
                                       API_FILE_DOWNLOAD_RETRY_TIMES,
@@ -55,16 +55,10 @@ class GitCommandWrapper(metaclass=Singleton):
             response.check_returncode()
             return response
         except subprocess.CalledProcessError as error:
-            if response.returncode == 1:
-                logger.info('Nothing to commit.')
-                return response
-            else:
-                logger.error(
-                    'There are error run git command, you may need to login first.'
-                )
-                raise GitError('stdout: %s, stderr: %s' %
-                               (response.stdout.decode('utf8'),
-                                error.stderr.decode('utf8')))
+            logger.error('There are error run git command.')
+            raise GitError(
+                'stdout: %s, stderr: %s' %
+                (response.stdout.decode('utf8'), error.stderr.decode('utf8')))

     def config_auth_token(self, repo_dir, auth_token):
         url = self.get_repo_remote_url(repo_dir)
@@ -199,8 +193,11 @@ class GitCommandWrapper(metaclass=Singleton):
         else:
             return ['/'.join(line.split('/')[1:]) for line in info[1:]]

-    def pull(self, repo_dir: str):
-        cmds = ['-C', repo_dir, 'pull']
+    def pull(self,
+             repo_dir: str,
+             remote: str = 'origin',
+             branch: str = 'master'):
+        cmds = ['-C', repo_dir, 'pull', remote, branch]
         return self._run_git_command(*cmds)

     def push(self,
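A small sketch of the updated pull call, assuming a local clone already exists at the given path; remote and branch now default to 'origin' and 'master' but can be overridden, and the path and branch below are placeholders.

    from modelscope.hub.git import GitCommandWrapper

    git = GitCommandWrapper()
    # equivalent to: git -C /path/to/local_repo pull origin v1.6
    git.pull('/path/to/local_repo', remote='origin', branch='v1.6')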
@@ -4,8 +4,8 @@ import concurrent.futures
 import os

 from modelscope.hub.api import HubApi
-from modelscope.hub.constants import Licenses, ModelVisibility
-from modelscope.hub.errors import NotExistError
+from modelscope.hub.constants import ModelVisibility
+from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.logger import get_logger

 logger = get_logger()
@@ -18,7 +18,10 @@ def _api_push_to_hub(repo_name,
                      token,
                      private=True,
                      commit_message='',
-                     source_repo=''):
+                     tag=None,
+                     source_repo='',
+                     ignore_file_pattern=None,
+                     revision=DEFAULT_REPOSITORY_REVISION):
     try:
         api = HubApi()
         api.login(token)
@@ -29,7 +32,10 @@ def _api_push_to_hub(repo_name,
             if not private else ModelVisibility.PRIVATE,
             chinese_name=repo_name,
             commit_message=commit_message,
-            original_model_id=source_repo)
+            tag=tag,
+            original_model_id=source_repo,
+            ignore_file_pattern=ignore_file_pattern,
+            revision=revision)
         commit_message = commit_message or 'No commit message'
         logger.info(
             f'Successfully upload the model to {repo_name} with message: {commit_message}'
@@ -48,7 +54,10 @@ def push_to_hub(repo_name,
                 private=True,
                 retry=3,
                 commit_message='',
-                source_repo=''):
+                tag=None,
+                source_repo='',
+                ignore_file_pattern=None,
+                revision=DEFAULT_REPOSITORY_REVISION):
     """
     Args:
         repo_name: The repo name for the modelhub repo
@@ -57,13 +66,18 @@ def push_to_hub(repo_name,
         private: If is a private repo, default True
         retry: Retry times if something error in uploading, default 3
         commit_message: The commit message
+        tag: The tag of this commit
         source_repo: The source repo (model id) which this model comes from
+        ignore_file_pattern: The file pattern to be ignored in uploading.
+        revision: The branch to commit to
     Returns:
         The boolean value to represent whether the model is uploaded.
     """
     if token is None:
         token = os.environ.get('MODELSCOPE_API_TOKEN')
+    if ignore_file_pattern is None:
+        ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
+    assert repo_name is not None
     assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
     assert os.path.isdir(output_dir)
     assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
@@ -73,7 +87,8 @@ def push_to_hub(repo_name,
         f'Uploading {output_dir} to {repo_name} with message {commit_message}')
     for i in range(retry):
         if _api_push_to_hub(repo_name, output_dir, token, private,
-                            commit_message, source_repo):
+                            commit_message, tag, source_repo,
+                            ignore_file_pattern, revision):
             return True
     return False
@@ -83,7 +98,10 @@ def push_to_hub_async(repo_name,
                       token=None,
                       private=True,
                       commit_message='',
-                      source_repo=''):
+                      tag=None,
+                      source_repo='',
+                      ignore_file_pattern=None,
+                      revision=DEFAULT_REPOSITORY_REVISION):
     """
     Args:
         repo_name: The repo name for the modelhub repo
@@ -91,13 +109,18 @@ def push_to_hub_async(repo_name,
         token: The user api token, function will check the `MODELSCOPE_API_TOKEN` variable if this argument is None
         private: If is a private repo, default True
         commit_message: The commit message
+        tag: The tag of this commit
         source_repo: The source repo (model id) which this model comes from
+        ignore_file_pattern: The file pattern to be ignored in uploading
+        revision: The branch to commit to
     Returns:
         A handler to check the result and the status
     """
     if token is None:
         token = os.environ.get('MODELSCOPE_API_TOKEN')
+    if ignore_file_pattern is None:
+        ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
+    assert repo_name is not None
     assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
     assert os.path.isdir(output_dir)
     assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
@@ -106,4 +129,5 @@ def push_to_hub_async(repo_name,
     logger.info(
         f'Uploading {output_dir} to {repo_name} with message {commit_message}')
     return _executor.submit(_api_push_to_hub, repo_name, output_dir, token,
-                            private, commit_message, source_repo)
+                            private, commit_message, tag, source_repo,
+                            ignore_file_pattern, revision)
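A hedged end-to-end sketch of the extended helper above; the repo name, token and output directory are placeholders, and the UPLOAD_IGNORE_FILE_PATTERN environment variable is only consulted when ignore_file_pattern is not passed explicitly.

    from modelscope.hub.push_to_hub import push_to_hub

    ok = push_to_hub(
        repo_name='my-namespace/my-model',    # placeholder repo
        output_dir='./work_dir/output',       # must contain configuration.json or configuration.yaml
        token='YOUR_SDK_TOKEN',               # or export MODELSCOPE_API_TOKEN
        commit_message='release 1.6 weights',
        tag='v1.6.0',
        ignore_file_pattern=r'.*\.log',
        revision='master')
    print('uploaded:', ok)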
@@ -88,6 +88,26 @@ class Repository:
             remote = None
         return remote

+    def pull(self, remote: str = 'origin', branch: str = 'master'):
+        """Pull remote branch
+
+        Args:
+            remote (str, optional): The remote name. Defaults to 'origin'.
+            branch (str, optional): The remote branch. Defaults to 'master'.
+        """
+        self.git_wrapper.pull(self.model_dir, remote=remote, branch=branch)
+
+    def add_lfs_type(self, file_name_suffix: str):
+        """Add file suffix to lfs list.
+
+        Args:
+            file_name_suffix (str): The file name suffix.
+                examples '*.safetensors'
+        """
+        os.system(
+            "printf '%s filter=lfs diff=lfs merge=lfs -text\n'>>%s" %
+            (file_name_suffix, os.path.join(self.model_dir, '.gitattributes')))
+
     def push(self,
              commit_message: str,
              local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION,
@@ -120,7 +140,6 @@ class Repository:
                              self.model_repo_name)

         url = self.git_wrapper.get_repo_remote_url(self.model_dir)
-        self.git_wrapper.pull(self.model_dir)

         self.git_wrapper.add(self.model_dir, all_files=True)
         self.git_wrapper.commit(self.model_dir, commit_message)
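A usage sketch for the two new Repository helpers, assuming the repository is cloned into the given local directory; note that push() no longer pulls implicitly, so a pull has to be issued explicitly when needed. Paths and repo ids are placeholders.

    from modelscope.hub.repository import Repository

    repo = Repository(model_dir='./my_model_local', clone_from='my-namespace/my-model')
    repo.pull(remote='origin', branch='master')  # explicit pull; push() no longer does this for you
    repo.add_lfs_type('*.safetensors')           # appends an LFS rule to .gitattributes
    repo.push('add safetensors weights')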
@@ -116,15 +116,9 @@ class Models(object):
     bad_image_detecting = 'bad-image-detecting'
     controllable_image_generation = 'controllable-image-generation'
     longshortnet = 'longshortnet'
+    fastinst = 'fastinst'
     pedestrian_attribute_recognition = 'pedestrian-attribute-recognition'

-    # EasyCV models
-    yolox = 'YOLOX'
-    segformer = 'Segformer'
-    hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
-    image_object_detection_auto = 'image-object-detection-auto'
-    dino = 'DINO'
-
     # nlp models
     bert = 'bert'
     palm = 'palm-v2'
@@ -177,6 +171,7 @@ class Models(object):
     speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
     speech_dfsmn_ans = 'speech_dfsmn_ans'
     speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
+    speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
     speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield'
     speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k'
     kws_kwsbp = 'kws-kwsbp'
@@ -187,6 +182,9 @@ class Models(object):
     generic_sv = 'generic-sv'
     ecapa_tdnn_sv = 'ecapa-tdnn-sv'
     campplus_sv = 'cam++-sv'
+    eres2net_sv = 'eres2net-sv'
+    scl_sd = 'scl-sd'
+    rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv'
     generic_lm = 'generic-lm'

     # multi-modal models
@@ -205,6 +203,8 @@ class Models(object):
     hitea = 'hitea'
     soonet = 'soonet'
     efficient_diffusion_tuning = 'efficient-diffusion-tuning'
+    mplug_owl = 'mplug-owl'
+    clip_interrogator = 'clip-interrogator'

     # science models
     unifold = 'unifold'
@@ -255,6 +255,7 @@ class Pipelines(object):
         should use task name for this pipeline.
         For pipeline which suuport only one model, we should use ${Model}-${Task} as its name.
     """
+    pipeline_template = 'pipeline-template'
     # vision tasks
     portrait_matting = 'unet-image-matting'
     universal_matting = 'unet-universal-matting'
@@ -277,8 +278,6 @@ class Pipelines(object):
     tbs_detection = 'tbs-detection'
     object_detection = 'vit-object-detection'
     abnormal_object_detection = 'abnormal-object-detection'
-    easycv_detection = 'easycv-detection'
-    easycv_segmentation = 'easycv-segmentation'
     face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
     salient_detection = 'u2net-salient-detection'
     salient_boudary_detection = 'res2net-salient-detection'
@@ -347,7 +346,6 @@ class Pipelines(object):
     video_single_object_tracking_procontext = 'procontext-vitb-video-single-object-tracking'
     video_multi_object_tracking = 'video-multi-object-tracking'
     image_panoptic_segmentation = 'image-panoptic-segmentation'
-    image_panoptic_segmentation_easycv = 'image-panoptic-segmentation-easycv'
     video_summarization = 'googlenet_pgl_video_summarization'
     language_guided_video_summarization = 'clip-it-video-summarization'
     image_semantic_segmentation = 'image-semantic-segmentation'
@@ -402,7 +400,7 @@ class Pipelines(object):
     nerf_recon_acc = 'nerf-recon-acc'
     bad_image_detecting = 'bad-image-detecting'
     controllable_image_generation = 'controllable-image-generation'
+    fast_instance_segmentation = 'fast-instance-segmentation'
     image_quality_assessment_mos = 'image-quality-assessment-mos'
     image_quality_assessment_man = 'image-quality-assessment-man'
     image_quality_assessment_degradation = 'image-quality-assessment-degradation'
@@ -485,6 +483,9 @@ class Pipelines(object):
     speaker_diarization_inference = 'speaker-diarization-inference'
     vad_inference = 'vad-inference'
     speaker_verification = 'speaker-verification'
+    speaker_verification_rdino = 'speaker-verification-rdino'
+    speaker_verification_eres2net = 'speaker-verification-eres2net'
+    speaker_change_locating = 'speaker-change-locating'
     lm_inference = 'language-score-prediction'
     speech_timestamp_inference = 'speech-timestamp-inference'

@@ -514,6 +515,7 @@ class Pipelines(object):
     gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
     soonet_video_temporal_grounding = 'soonet-video-temporal-grounding'
     efficient_diffusion_tuning = 'efficient-diffusion-tuning'
+    multimodal_dialogue = 'multimodal-dialogue'

     # science tasks
     protein_structure = 'unifold-protein-structure'
@@ -881,6 +883,7 @@ class NLPTrainers(object):
     document_grounded_dialog_rerank_trainer = 'document-grounded-dialog-rerank-trainer'
     document_grounded_dialog_retrieval_trainer = 'document-grounded-dialog-retrieval-trainer'
     siamese_uie_trainer = 'siamese-uie-trainer'
+    translation_evaluation_trainer = 'translation-evaluation-trainer'


 class MultiModalTrainers(object):
@@ -911,7 +914,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
     """

     default = 'trainer'
-    easycv = 'easycv'
     tinynas_damoyolo = 'tinynas-damoyolo'

     @staticmethod
@@ -933,8 +935,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
             return Fields.multi_modal
         elif attribute_or_value == Trainers.default:
             return Trainers.default
-        elif attribute_or_value == Trainers.easycv:
-            return Trainers.easycv
         else:
             return 'unknown'

@@ -1034,6 +1034,8 @@ class Preprocessors(object):
     vldoc_preprocessor = 'vldoc-preprocessor'
     hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
     diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor'
+    mplug_owl_preprocessor = 'mplug-owl-preprocessor'
+    image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor'

     # science preprocessor
     unifold_preprocessor = 'unifold-preprocessor'
@@ -1098,6 +1100,8 @@ class Metrics(object):
     # metric for image-colorization task
     image_colorization_metric = 'image-colorization-metric'
     ocr_recognition_metric = 'ocr-recognition-metric'
+    # metric for translation evaluation
+    translation_evaluation_metric = 'translation-evaluation-metric'


 class Optimizers(object):
@@ -1165,14 +1169,6 @@ class LR_Schedulers(object):
 class CustomDatasets(object):
     """ Names for different datasets.
     """
-    ClsDataset = 'ClsDataset'
-    Face2dKeypointsDataset = 'FaceKeypointDataset'
-    HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
-    HumanWholeBodyKeypointDataset = 'WholeBodyCocoTopDownDataset'
-    SegDataset = 'SegDataset'
-    DetDataset = 'DetDataset'
-    DetImagesMixDataset = 'DetImagesMixDataset'
-    PanopticDataset = 'PanopticDataset'
     PairedDataset = 'PairedDataset'
     SiddDataset = 'SiddDataset'
     GoproDataset = 'GoproDataset'
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from .loss_metric import LossMetric
     from .image_colorization_metric import ImageColorizationMetric
     from .ocr_recognition_metric import OCRRecognitionMetric
+    from .translation_evaluation_metric import TranslationEvaluationMetric
 else:
     _import_structure = {
         'audio_noise_metric': ['AudioNoiseMetric'],
@@ -62,7 +63,8 @@ else:
         'text_ranking_metric': ['TextRankingMetric'],
         'loss_metric': ['LossMetric'],
         'image_colorization_metric': ['ImageColorizationMetric'],
-        'ocr_recognition_metric': ['OCRRecognitionMetric']
+        'ocr_recognition_metric': ['OCRRecognitionMetric'],
+        'translation_evaluation_metric': ['TranslationEvaluationMetric']
     }

     import sys
@@ -42,6 +42,7 @@ class MetricKeys(object):
     NDCG = 'ndcg'
     AR = 'AR'
     Colorfulness = 'colorfulness'
+    Kendall_Tau_Correlation = 'kendall_tau_correlation'


 task_default_metrics = {
@@ -76,6 +77,7 @@ task_default_metrics = {
     Tasks.bad_image_detecting: [Metrics.accuracy],
     Tasks.ocr_recognition: [Metrics.ocr_recognition_metric],
     Tasks.efficient_diffusion_tuning: [Metrics.loss_metric],
+    Tasks.translation_evaluation: [Metrics.translation_evaluation_metric]
 }
modelscope/metrics/translation_evaluation_metric.py (new file, 174 lines)
@@ -0,0 +1,174 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import importlib
from typing import Dict, List, Union

from pandas import DataFrame

from modelscope.metainfo import Metrics
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS, MetricKeys
from modelscope.models.nlp.unite.configuration import InputFormat
from modelscope.utils.logger import get_logger
from modelscope.utils.registry import default_group

logger = get_logger()


@METRICS.register_module(
    group_key=default_group, module_name=Metrics.translation_evaluation_metric)
class TranslationEvaluationMetric(Metric):
    r"""The metric class for translation evaluation.

    """

    def __init__(self, gap_threshold: float = 25.0):
        r"""Build a translation evaluation metric, following the designed
        Kendall's tau correlation from WMT Metrics Shared Task competitions.

        Args:
            gap_threshold: The score gap denoting the available hypothesis pair.

        Returns:
            A metric for translation evaluation.
        """
        self.gap_threshold = gap_threshold

        self.lp = list()
        self.segment_id = list()
        self.raw_score = list()
        self.score = list()
        self.input_format = list()

    def clear(self) -> None:
        r"""Clear all the stored variables.
        """
        self.lp.clear()
        self.segment_id.clear()
        self.raw_score.clear()
        self.input_format.clear()

        self.score.clear()

        return

    def add(self, outputs: Dict[str, List[float]],
            inputs: Dict[str, List[Union[float, int]]]) -> None:
        r"""Collect the related results for processing.

        Args:
            outputs: Dict containing 'scores'
            inputs: Dict containing 'labels' and 'segment_ids'

        """

        self.lp += inputs['lp']
        self.segment_id += inputs['segment_id']
        self.raw_score += inputs['raw_score']
        self.input_format += inputs['input_format']

        self.score += outputs['score']

        return

    def evaluate(self) -> Dict[str, Dict[str, float]]:
        r"""Compute the Kendall's tau correlation.

        Returns:
            A dict denoting Kendall's tau correlation.

        """

        data = {
            'lp': self.lp,
            'segment_id': self.segment_id,
            'raw_score': self.raw_score,
            'input_format': self.input_format,
            'score': self.score
        }
        data = DataFrame(data=data)
        correlation = dict()

        for input_format in data.input_format.unique():
            logger.info('Evaluation results for %s input format'
                        % input_format.value)
            input_format_data = data[data.input_format == input_format]

            temp_correlation = dict()

            for lp in sorted(input_format_data.lp.unique()):
                sub_data = input_format_data[input_format_data.lp == lp]
                temp_correlation[input_format.value + '_'
                                 + lp] = self.compute_kendall_tau(sub_data)
                logger.info(
                    '\t%s: %f' %
                    (lp,
                     temp_correlation[input_format.value + '_' + lp] * 100))

            avg_correlation = sum(
                temp_correlation.values()) / len(temp_correlation)
            correlation[input_format.value + '_avg'] = avg_correlation
            logger.info('Average evaluation result for %s input format: %f' %
                        (input_format.value, avg_correlation))
            logger.info('')
            correlation.update(temp_correlation)

        return correlation

    def merge(self, other: 'TranslationEvaluationMetric') -> None:
        r"""Merge the predictions from other TranslationEvaluationMetric objects.

        Args:
            other: Another TranslationEvaluationMetric object.

        """

        self.lp += other.lp
        self.segment_id += other.segment_ids
        self.raw_score += other.raw_score
        self.input_format += other.input_format

        self.score += other.score

        return

    def compute_kendall_tau(self, csv_data: DataFrame) -> float:
        r"""Compute kendall's tau correlation.

        Args:
            csv_data: The pandas dataframe.

        Returns:
            float: THe kendall's Tau correlation.

        """
        concor = discor = 0

        for segment_id in sorted(csv_data.segment_id.unique()):
            group_csv_data = csv_data[csv_data.segment_id == segment_id]

            examples = group_csv_data.to_dict('records')

            for i in range(0, len(examples)):
                for j in range(i + 1, len(examples)):
                    if self.raw_score[i] - self.raw_score[
                            j] >= self.gap_threshold:
                        if self.score[i] > self.score[j]:
                            concor += 1
                        elif self.score[i] < self.score[j]:
                            discor += 1
                    elif self.raw_score[i] - self.raw_score[
                            j] <= -self.gap_threshold:
                        if self.score[i] < self.score[j]:
                            concor += 1
                        elif self.score[i] > self.score[j]:
                            discor += 1

        if concor + discor == 0:
            logger.warning(
                'We don\'t have available pairs when evaluation. '
                'Marking the kendall tau correlation as the lowest value (-1.0).'
            )
            return -1.0
        else:
            return (concor - discor) / (concor + discor)
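A tiny worked example of the pairwise rule the new metric implements: within one segment, a hypothesis pair only counts when the human (raw) scores differ by at least gap_threshold, and the pair is concordant when the model score agrees with that ordering. The numbers below are a sketch, not taken from the file; they give one concordant and one discordant pair, so tau = (1 - 1) / (1 + 1) = 0.0.

    # raw (human) scores and model scores for three hypotheses of one segment
    raw = [80.0, 50.0, 78.0]
    score = [0.9, 0.2, 0.1]
    gap = 25.0

    concor = discor = 0
    for i in range(len(raw)):
        for j in range(i + 1, len(raw)):
            if raw[i] - raw[j] >= gap:        # i is clearly better than j
                concor += score[i] > score[j]
                discor += score[i] < score[j]
            elif raw[i] - raw[j] <= -gap:     # j is clearly better than i
                concor += score[i] < score[j]
                discor += score[i] > score[j]

    print((concor - discor) / (concor + discor))  # 0.0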
@@ -39,7 +39,7 @@ class ConvSTFT(nn.Module):
         super(ConvSTFT, self).__init__()

         if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
         else:
             self.fft_len = fft_len

@@ -78,7 +78,7 @@ class ConviSTFT(nn.Module):
                  fix=True):
         super(ConviSTFT, self).__init__()
         if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
         else:
             self.fft_len = fft_len
         kernel, window = init_kernels(
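The replacement drops the deprecated np.int alias (removed in NumPy 1.24) while keeping the same rounding rule: when fft_len is not given, it becomes the next power of two at or above win_len. A quick check of that expression:

    import numpy as np

    for win_len in (320, 400, 512):
        fft_len = int(2**np.ceil(np.log2(win_len)))
        print(win_len, '->', fft_len)   # 320 -> 512, 400 -> 512, 512 -> 512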
@@ -45,27 +45,5 @@ class GenericAutomaticSpeechRecognition(Model):
     def forward(self) -> Dict[str, Any]:
         """preload model and return the info of the model
         """
-        if self.model_cfg['model_config']['type'] == Frameworks.tf:
-            from easyasr import asr_inference_paraformer_tf
-            if hasattr(asr_inference_paraformer_tf, 'preload'):
-                model_workspace = self.model_cfg['model_workspace']
-                model_path = os.path.join(model_workspace,
-                                          self.model_cfg['am_model'])
-                vocab_path = os.path.join(
-                    model_workspace,
-                    self.model_cfg['model_config']['vocab_file'])
-                sampled_ids = 'seq2seq/sampled_ids'
-                sampled_lengths = 'seq2seq/sampled_lengths'
-                if 'sampled_ids' in self.model_cfg['model_config']:
-                    sampled_ids = self.model_cfg['model_config']['sampled_ids']
-                if 'sampled_lengths' in self.model_cfg['model_config']:
-                    sampled_lengths = self.model_cfg['model_config'][
-                        'sampled_lengths']
-                asr_inference_paraformer_tf.preload(
-                    ngpu=1,
-                    asr_model_file=model_path,
-                    vocab_file=vocab_path,
-                    sampled_ids=sampled_ids,
-                    sampled_lengths=sampled_lengths)
-
         return self.model_cfg
modelscope/models/audio/kws/farfield/fsmn_sele_v3.py (new file, 233 lines)
@@ -0,0 +1,233 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import torch
import torch.nn as nn
import torch.nn.functional as F

from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear
from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32


class DFSMNUnit(nn.Module):
    """ one multi-channel deep fsmn unit
    Args:
        dimin: input dimension
        dimexpand: feature expansion dimension
        dimout: output dimension
        lorder: left ofder
        rorder: right order
    """

    def __init__(self,
                 dimin=64,
                 dimexpand=128,
                 dimout=64,
                 lorder=10,
                 rorder=1):
        super(DFSMNUnit, self).__init__()

        self.expand = AffineTransform(dimin, dimexpand)
        self.shrink = LinearTransform(dimexpand, dimout)
        self.fsmn = Fsmn(dimout, dimout, lorder, rorder, 1, 1)

        self.debug = False
        self.dataout = None

    def forward(self, input):
        """
        Args:
            input: [batch, time, feature]
        """
        out1 = F.relu(self.expand(input))
        out2 = self.shrink(out1)
        out3 = self.fsmn(out2)

        # add skip connection for matched data
        if input.shape[-1] == out3.shape[-1]:
            out3 = input + out3
        if self.debug:
            self.dataout = out3
        return out3

    def print_model(self):
        self.expand.printModel()
        self.shrink.printModel()
        self.fsmn.printModel()

    def to_kaldi_nnet(self):
        re_str = self.expand.toKaldiNNet()
        relu = RectifiedLinear(self.expand.linear.out_features,
                               self.expand.linear.out_features)
        re_str += relu.toKaldiNNet()
        re_str = self.shrink.toKaldiNNet()
        re_str += self.fsmn.toKaldiNNet()
        return re_str


class FSMNSeleNetV3(nn.Module):
    """ Deep FSMN model with channel selection performs multi-channel kws.
    Zhang, Shiliang, et al. "Deep-FSMN for large vocabulary continuous speech
    recognition." 2018 IEEE International Conference on Acoustics, Speech and
    Signal Processing (ICASSP). IEEE, 2018.

    Args:
        input_dim: input dimension
        linear_dim: fsmn input dimension
        proj_dim: fsmn projection dimension
        lorder: fsmn left order
        rorder: fsmn right order
        num_syn: output dimension
        fsmn_layers: no. of fsmn units
    """

    def __init__(self,
                 input_dim=120,
                 linear_dim=128,
                 proj_dim=64,
                 lorder=10,
                 rorder=1,
                 num_syn=5,
                 fsmn_layers=5):
        super(FSMNSeleNetV3, self).__init__()

        self.mem = []
        # the first unit, mapping input dim to proj dim
        unit = DFSMNUnit(input_dim, linear_dim, proj_dim, lorder, rorder)
        self.mem.append(unit)
        self.add_module('mem_{:d}'.format(0), unit)

        # deep fsmn layers with skip connection
        for i in range(1, fsmn_layers):
            unit = DFSMNUnit(proj_dim, linear_dim, proj_dim, lorder, rorder)
            self.mem.append(unit)
            self.add_module('mem_{:d}'.format(i), unit)

        self.expand2 = AffineTransform(proj_dim, linear_dim)
        self.decision = AffineTransform(linear_dim, num_syn)

    def forward(self, input):
        # multi-channel temp space, [batch, time, channel, feature]
        if torch.cuda.is_available():
            x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
                            self.expand2.linear.out_features).cuda()
        else:
            x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
                            self.expand2.linear.out_features)

        for n in range(input.shape[2]):
            chin = input[:, :, n, :]

            for unit in self.mem:
                chout = unit(chin)
                chin = chout

            x[:, :, n, :] = F.relu(self.expand2(chout))

        # perform max pooling
        pool = nn.MaxPool2d((x.shape[2], 1), stride=(x.shape[2], 1))
        y = pool(x)

        # remove channel dimension
        y = torch.squeeze(y, -2)
        z = self.decision(y)

        return z

    def print_model(self):
        for unit in self.mem:
            unit.print_model()

        self.expand2.printModel()
        self.decision.printModel()

    def print_header(self):
        """ get DFSMN params
        """
        input_dim = self.mem[0].expand.linear.in_features
        linear_dim = self.mem[0].expand.linear.out_features
        proj_dim = self.mem[0].shrink.linear.out_features
        lorder = self.mem[0].fsmn.conv_left.kernel_size[0]
        rorder = 0
        if self.mem[0].fsmn.conv_right is not None:
            rorder = self.mem[0].fsmn.conv_right.kernel_size[0]

        num_syn = self.decision.linear.out_features
        fsmn_layers = len(self.mem)

        # no. of output channels, 0.0 means the same as numins
        numouts = 1.0

        #
        # write total header
        #
        header = [0.0] * HEADER_BLOCK_SIZE * 5
        # numins
        header[0] = 0.0
        # numouts
        header[1] = numouts
        # dimins
        header[2] = input_dim
        # dimouts
        header[3] = num_syn
        # numlayers
        header[4] = 4

        #
        # write each layer's header
        #
        hidx = 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_DFSMN.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
        header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim
        header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
        header[HEADER_BLOCK_SIZE * hidx + 4] = proj_dim
        header[HEADER_BLOCK_SIZE * hidx + 5] = lorder
        header[HEADER_BLOCK_SIZE * hidx + 6] = rorder
        header[HEADER_BLOCK_SIZE * hidx + 7] = fsmn_layers
        hidx += 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_DENSE.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
        header[HEADER_BLOCK_SIZE * hidx + 2] = proj_dim
        header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
        header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
        header[HEADER_BLOCK_SIZE * hidx + 5] = float(
            ActivationType.ACTIVATION_RELU.value)
        hidx += 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_MAX_POOLING.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
        header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
        hidx += 1

        header[HEADER_BLOCK_SIZE * hidx + 0] = float(
            LayerType.LAYER_DENSE.value)
        header[HEADER_BLOCK_SIZE * hidx + 1] = numouts
        header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
        header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn
        header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
        header[HEADER_BLOCK_SIZE * hidx + 5] = float(
            ActivationType.ACTIVATION_SOFTMAX.value)

        for h in header:
            print(f32ToI32(h))

    def to_kaldi_nnet(self):
        re_str = '<Nnet>\n'
        for unit in self.mem:
            re_str += unit.to_kaldi_nnet()
        re_str = self.expand2.toKaldiNNet()
        relu = RectifiedLinear(self.expand2.linear.out_features,
                               self.expand2.linear.out_features)
        re_str += relu.toKaldiNNet()
        re_str += self.decision.toKaldiNNet()
        re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features,
                                         self.decision.linear.out_features)
        re_str += '<!EndOfComponent>\n'
        re_str += '</Nnet>\n'

        return re_str
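A shape-level sketch of the new multi-channel network, assuming the sibling fsmn.py and model_def.py modules in the same package provide the imported layers; input is [batch, time, channel, feature] and the model emits one score vector per frame.

    import torch
    from modelscope.models.audio.kws.farfield.fsmn_sele_v3 import FSMNSeleNetV3

    model = FSMNSeleNetV3(input_dim=120, num_syn=5, fsmn_layers=5)
    feats = torch.randn(2, 100, 3, 120)   # batch=2, 100 frames, 3 channels, 120-dim features
    if torch.cuda.is_available():          # forward allocates its buffer on CUDA when available
        model, feats = model.cuda(), feats.cuda()
    with torch.no_grad():
        logits = model(feats)
    print(logits.shape)                    # expected: torch.Size([2, 100, 5])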
@@ -11,6 +11,7 @@ from modelscope.models.builder import MODELS
 from modelscope.utils.audio.audio_utils import update_conf
 from modelscope.utils.constant import Tasks
 from .fsmn_sele_v2 import FSMNSeleNetV2
+from .fsmn_sele_v3 import FSMNSeleNetV3


 @MODELS.register_module(
@@ -18,6 +19,7 @@ from .fsmn_sele_v2 import FSMNSeleNetV2
 class FSMNSeleNetV2Decorator(TorchModel):
     r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """

+    MODEL_CLASS = FSMNSeleNetV2
     MODEL_TXT = 'model.txt'
     SC_CONFIG = 'sound_connect.conf'

@@ -33,7 +35,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
         """
         super().__init__(model_dir, *args, **kwargs)
         if training:
-            self.model = FSMNSeleNetV2(*args, **kwargs)
+            self.model = self.MODEL_CLASS(*args, **kwargs)
         else:
             sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
             model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
@@ -42,7 +44,7 @@ class FSMNSeleNetV2Decorator(TorchModel):

             self._sc = None
             if os.path.exists(model_txt_file):
-                conf_dict = dict(mode=56542, kws_model=model_txt_file)
+                conf_dict = dict(kws_model=model_txt_file)
                 update_conf(sc_config_file, new_config_file, conf_dict)
                 import py_sound_connect
                 self._sc = py_sound_connect.SoundConnect(new_config_file)
@@ -50,8 +52,8 @@ class FSMNSeleNetV2Decorator(TorchModel):
                 self.size_out = self._sc.bytesPerBlockOut()
             else:
                 raise Exception(
-                    f'Invalid model directory! Failed to load model file: {model_txt_file}.'
-                )
+                    f'Invalid model directory! Failed to load model file:'
+                    f' {model_txt_file}.')

     def __del__(self):
         if hasattr(self, 'tmp_dir'):
@@ -73,3 +75,24 @@ class FSMNSeleNetV2Decorator(TorchModel):
             'confidence': self._sc.kwsConfidence()
         }
         return result
+
+
+@MODELS.register_module(
+    Tasks.keyword_spotting,
+    module_name=Models.speech_dfsmn_kws_char_farfield_iot)
+class FSMNSeleNetV3Decorator(FSMNSeleNetV2Decorator):
+    r""" A decorator of FSMNSeleNetV3 for integrating into modelscope framework """
+
+    MODEL_CLASS = FSMNSeleNetV3
+
+    def __init__(self,
+                 model_dir: str,
+                 training: Optional[bool] = False,
+                 *args,
+                 **kwargs):
+        """initialize the dfsmn model from the `model_dir` path.
+
+        Args:
+            model_dir (str): the model path.
+        """
+        super().__init__(model_dir, training, *args, **kwargs)
@@ -76,11 +76,13 @@ class CAMPPlus(nn.Module):
                  bn_size=4,
                  init_channels=128,
                  config_str='batchnorm-relu',
-                 memory_efficient=True):
+                 memory_efficient=True,
+                 output_level='segment'):
         super(CAMPPlus, self).__init__()

         self.head = FCM(feat_dim=feat_dim)
         channels = self.head.out_channels
+        self.output_level = output_level

         self.xvector = nn.Sequential(
             OrderedDict([
@@ -118,10 +120,14 @@ class CAMPPlus(nn.Module):
         self.xvector.add_module('out_nonlinear',
                                 get_nonlinear(config_str, channels))

-        self.xvector.add_module('stats', StatsPool())
-        self.xvector.add_module(
-            'dense',
-            DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
+        if self.output_level == 'segment':
+            self.xvector.add_module('stats', StatsPool())
+            self.xvector.add_module(
+                'dense',
+                DenseLayer(
+                    channels * 2, embedding_size, config_str='batchnorm_'))
+        else:
+            assert self.output_level == 'frame', '`output_level` should be set to \'segment\' or \'frame\'. '

         for m in self.modules():
             if isinstance(m, (nn.Conv1d, nn.Linear)):
@@ -133,6 +139,8 @@ class CAMPPlus(nn.Module):
         x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
         x = self.head(x)
         x = self.xvector(x)
+        if self.output_level == 'frame':
+            x = x.transpose(1, 2)
         return x

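A brief sketch of the new output_level switch, assuming CAMPPlus is importable from its speaker-verification module (the path below is an assumption) and that the remaining constructor arguments keep their defaults: 'segment' keeps the pooled utterance embedding, while 'frame' returns per-frame features.

    import torch
    from modelscope.models.audio.sv.DTDNN import CAMPPlus   # assumed module path

    feats = torch.randn(2, 200, 80)   # batch=2, 200 frames, 80-dim fbank
    segment_model = CAMPPlus(feat_dim=80, embedding_size=192, output_level='segment')
    frame_model = CAMPPlus(feat_dim=80, embedding_size=192, output_level='frame')
    print(segment_model(feats).shape)  # one embedding per utterance, e.g. [2, 192]
    print(frame_model(feats).shape)    # per-frame output, e.g. [2, T', channels]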
344  modelscope/models/audio/sv/ERes2Net.py  Normal file
@@ -0,0 +1,344 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
ERes2Net incorporates both local and global feature fusion techniques to improve performance. The local feature
fusion (LFF) fuses the features within one single residual block to extract the local signal.
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate the global signal.
"""
import math
import os
from typing import Any, Dict, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi

import modelscope.models.audio.sv.pooling_layers as pooling_layers
from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.models.audio.sv.fusion import AFF
from modelscope.utils.constant import Tasks


class ReLU(nn.Hardtanh):

    def __init__(self, inplace=False):
        super(ReLU, self).__init__(0, 20, inplace)

    def __repr__(self):
        inplace_str = 'inplace' if self.inplace else ''
        return self.__class__.__name__ + ' (' \
            + inplace_str + ')'


def conv1x1(in_planes, out_planes, stride=1):
    '1x1 convolution without padding'
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        padding=0,
        bias=False)


def conv3x3(in_planes, out_planes, stride=1):
    '3x3 convolution with padding'
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False)


class BasicBlockRes2Net(nn.Module):
    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockRes2Net, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        bns = []
        for i in range(self.nums):
            convs.append(conv3x3(width, width))
            bns.append(nn.BatchNorm2d(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    self.expansion * planes,
                    kernel_size=1,
                    stride=stride,
                    bias=False), nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class BasicBlockRes2Net_diff_AFF(nn.Module):
    expansion = 2

    def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
        super(BasicBlockRes2Net_diff_AFF, self).__init__()
        width = int(math.floor(planes * (baseWidth / 64.0)))
        self.conv1 = conv1x1(in_planes, width * scale, stride)
        self.bn1 = nn.BatchNorm2d(width * scale)
        self.nums = scale

        convs = []
        fuse_models = []
        bns = []
        for i in range(self.nums):
            convs.append(conv3x3(width, width))
            bns.append(nn.BatchNorm2d(width))
        for j in range(self.nums - 1):
            fuse_models.append(AFF(channels=width))

        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
        self.fuse_models = nn.ModuleList(fuse_models)
        self.relu = ReLU(inplace=True)

        self.conv3 = conv1x1(width * scale, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(
                    in_planes,
                    self.expansion * planes,
                    kernel_size=1,
                    stride=stride,
                    bias=False), nn.BatchNorm2d(self.expansion * planes))
        self.stride = stride
        self.width = width
        self.scale = scale

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
            if i == 0:
                sp = spx[i]
            else:
                sp = self.fuse_models[i - 1](sp, spx[i])

            sp = self.convs[i](sp)
            sp = self.relu(self.bns[i](sp))
            if i == 0:
                out = sp
            else:
                out = torch.cat((out, sp), 1)

        out = self.conv3(out)
        out = self.bn3(out)

        residual = self.shortcut(x)
        out += residual
        out = self.relu(out)

        return out


class ERes2Net(nn.Module):

    def __init__(self,
                 block=BasicBlockRes2Net,
                 block_fuse=BasicBlockRes2Net_diff_AFF,
                 num_blocks=[3, 4, 6, 3],
                 m_channels=32,
                 feat_dim=80,
                 embed_dim=192,
                 pooling_func='TSTP',
                 two_emb_layer=False):
        super(ERes2Net, self).__init__()
        self.in_planes = m_channels
        self.feat_dim = feat_dim
        self.embed_dim = embed_dim
        self.stats_dim = int(feat_dim / 8) * m_channels * 8
        self.two_emb_layer = two_emb_layer

        self.conv1 = nn.Conv2d(
            1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(m_channels)
        self.layer1 = self._make_layer(
            block, m_channels, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(
            block, m_channels * 2, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(
            block_fuse, m_channels * 4, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(
            block_fuse, m_channels * 8, num_blocks[3], stride=2)

        # downsampling
        self.layer1_downsample = nn.Conv2d(
            m_channels * 2,
            m_channels * 4,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=False)
        self.layer2_downsample = nn.Conv2d(
            m_channels * 4,
            m_channels * 8,
            kernel_size=3,
            padding=1,
            stride=2,
            bias=False)
        self.layer3_downsample = nn.Conv2d(
            m_channels * 8,
            m_channels * 16,
            kernel_size=3,
            padding=1,
            stride=2,
            bias=False)

        # bottom-up fusion
        self.fuse_mode12 = AFF(channels=m_channels * 4)
        self.fuse_mode123 = AFF(channels=m_channels * 8)
        self.fuse_mode1234 = AFF(channels=m_channels * 16)

        self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2
        self.pool = getattr(pooling_layers, pooling_func)(
            in_dim=self.stats_dim * block.expansion)
        self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
                               embed_dim)
        if self.two_emb_layer:
            self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False)
            self.seg_2 = nn.Linear(embed_dim, embed_dim)
        else:
            self.seg_bn_1 = nn.Identity()
            self.seg_2 = nn.Identity()

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        x = x.permute(0, 2, 1)

        x = x.unsqueeze_(1)
        out = F.relu(self.bn1(self.conv1(x)))
        out1 = self.layer1(out)

        # bottom-up fusion
        out2 = self.layer2(out1)
        out1_downsample = self.layer1_downsample(out1)
        fuse_out12 = self.fuse_mode12(out2, out1_downsample)

        out3 = self.layer3(out2)
        fuse_out12_downsample = self.layer2_downsample(fuse_out12)
        fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)

        out4 = self.layer4(out3)
        fuse_out123_downsample = self.layer3_downsample(fuse_out123)
        fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
        stats = self.pool(fuse_out1234)

        embed_a = self.seg_1(stats)
        if self.two_emb_layer:
            out = F.relu(embed_a)
            out = self.seg_bn_1(out)
            embed_b = self.seg_2(out)
            return embed_b
        else:
            return embed_a


@MODELS.register_module(
    Tasks.speaker_verification, module_name=Models.eres2net_sv)
class SpeakerVerificationERes2Net(TorchModel):
    r"""Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
    of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthens the local information
    interaction. GFF fuses multi-scale feature maps in a bottom-up pathway to obtain global information.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    """

    def __init__(self, model_dir, model_config: Dict[str, Any], *args,
                 **kwargs):
        super().__init__(model_dir, model_config, *args, **kwargs)
        self.model_config = model_config
        self.other_config = kwargs
        self.feature_dim = 80

        self.embedding_model = ERes2Net()

        pretrained_model_name = kwargs['pretrained_model']
        self.__load_check_point(pretrained_model_name)

        self.embedding_model.eval()

    def forward(self, audio):
        assert len(audio.shape) == 2 and audio.shape[
            0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
        # audio shape: [1, T]
        feature = self.__extract_feature(audio)
        embedding = self.embedding_model(feature)

        return embedding

    def __extract_feature(self, audio):
        feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
        feature = feature - feature.mean(dim=0, keepdim=True)
        feature = feature.unsqueeze(0)
        return feature

    def __load_check_point(self, pretrained_model_name, device=None):
        if not device:
            device = torch.device('cpu')
        self.embedding_model.load_state_dict(
            torch.load(
                os.path.join(self.model_dir, pretrained_model_name),
                map_location=device),
            strict=True)
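For orientation, the ERes2Net backbone above fuses each stage's output with a downsampled copy of the previous fused map (the AFF modules) before statistics pooling. A rough extraction sketch built only from the classes in this file, with a placeholder waveform; feat_dim=80, embed_dim=192 and TSTP pooling are the constructor defaults shown above:

# sketch only: raw ERes2Net forward pass on fbank features
import torch
import torchaudio.compliance.kaldi as Kaldi

from modelscope.models.audio.sv.ERes2Net import ERes2Net

wav = torch.randn(1, 16000)                      # placeholder 1-second mono waveform, shape [1, T]
feat = Kaldi.fbank(wav, num_mel_bins=80)         # (num_frames, 80), as in __extract_feature above
feat = (feat - feat.mean(dim=0, keepdim=True)).unsqueeze(0)
model = ERes2Net(feat_dim=80, embed_dim=192).eval()
with torch.no_grad():
    emb = model(feat)                            # speaker embedding
print(emb.shape)                                 # (1, 192)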
32  modelscope/models/audio/sv/fusion.py  Normal file
@@ -0,0 +1,32 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn


class AFF(nn.Module):

    def __init__(self, channels=64, r=4):
        super(AFF, self).__init__()
        inter_channels = int(channels // r)

        self.local_att = nn.Sequential(
            nn.Conv2d(
                channels * 2,
                inter_channels,
                kernel_size=1,
                stride=1,
                padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.SiLU(inplace=True),
            nn.Conv2d(
                inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

    def forward(self, x, ds_y):
        xa = torch.cat((x, ds_y), dim=1)
        x_att = self.local_att(xa)
        x_att = 1.0 + torch.tanh(x_att)
        xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)

        return xo
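AFF concatenates the two feature maps, squeezes them through a channels // r bottleneck, and turns the result into a gate x_att = 1 + tanh(...) lying in (0, 2); the fusion x * x_att + ds_y * (2 - x_att) is therefore a complementary soft weighting whose two coefficients always sum to 2. A small shape check using the module above:

# sketch only: attentional feature fusion of two same-shaped maps
import torch
from modelscope.models.audio.sv.fusion import AFF

aff = AFF(channels=64, r=4).eval()
x = torch.randn(2, 64, 20, 50)     # (B, C, F, T) map from the current stage
y = torch.randn(2, 64, 20, 50)     # downsampled map from the earlier stage
with torch.no_grad():
    out = aff(x, y)
print(out.shape)                   # same shape as the inputs: (2, 64, 20, 50)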
107  modelscope/models/audio/sv/pooling_layers.py  Normal file
@@ -0,0 +1,107 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker.
"""
import torch
import torch.nn as nn


class TAP(nn.Module):
    """
    Temporal average pooling, only first-order mean is considered
    """

    def __init__(self, **kwargs):
        super(TAP, self).__init__()

    def forward(self, x):
        pooling_mean = x.mean(dim=-1)
        # To be compatible with 2D input
        pooling_mean = pooling_mean.flatten(start_dim=1)
        return pooling_mean


class TSDP(nn.Module):
    """
    Temporal standard deviation pooling, only second-order std is considered
    """

    def __init__(self, **kwargs):
        super(TSDP, self).__init__()

    def forward(self, x):
        # The last dimension is the temporal axis
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
        pooling_std = pooling_std.flatten(start_dim=1)
        return pooling_std


class TSTP(nn.Module):
    """
    Temporal statistics pooling, concatenate mean and std, which is used in
    x-vector
    Comment: simple concatenation can not make full use of both statistics
    """

    def __init__(self, **kwargs):
        super(TSTP, self).__init__()

    def forward(self, x):
        # The last dimension is the temporal axis
        pooling_mean = x.mean(dim=-1)
        pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
        pooling_mean = pooling_mean.flatten(start_dim=1)
        pooling_std = pooling_std.flatten(start_dim=1)

        stats = torch.cat((pooling_mean, pooling_std), 1)
        return stats


class ASTP(nn.Module):
    """ Attentive statistics pooling: Channel- and context-dependent
        statistics pooling, first used in ECAPA_TDNN.
    """

    def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
        super(ASTP, self).__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear, then we don't
        # need to transpose inputs.
        if global_context_att:
            self.linear1 = nn.Conv1d(
                in_dim * 3, bottleneck_dim,
                kernel_size=1)  # equals W and b in the paper
        else:
            self.linear1 = nn.Conv1d(
                in_dim, bottleneck_dim,
                kernel_size=1)  # equals W and b in the paper
        self.linear2 = nn.Conv1d(
            bottleneck_dim, in_dim,
            kernel_size=1)  # equals V and k in the paper

    def forward(self, x):
        """
        x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
            or a 4-dimensional tensor in resnet architecture (B,C,F,T)
            0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
        """
        if len(x.shape) == 4:
            x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
        assert len(x.shape) == 3

        if self.global_context_att:
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_std = torch.sqrt(
                torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
            x_in = torch.cat((x, context_mean, context_std), dim=1)
        else:
            x_in = x

        # DON'T use ReLU here! ReLU may be hard to converge.
        alpha = torch.tanh(
            self.linear1(x_in))  # alpha = F.relu(self.linear1(x_in))
        alpha = torch.softmax(self.linear2(alpha), dim=2)
        mean = torch.sum(alpha * x, dim=2)
        var = torch.sum(alpha * (x**2), dim=2) - mean**2
        std = torch.sqrt(var.clamp(min=1e-10))
        return torch.cat([mean, std], dim=1)
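TSTP, the pooling ERes2Net selects by default, collapses the time axis into a concatenated mean and standard deviation, so a (B, C, F, T) map becomes a (B, 2*C*F) statistics vector; TAP keeps only the mean. A quick shape check against the classes above:

# sketch only: temporal statistics pooling doubles the flattened channel dimension
import torch
from modelscope.models.audio.sv.pooling_layers import TAP, TSTP

x = torch.randn(4, 128, 10, 25)    # (B, C, F, T)
print(TAP()(x).shape)              # (4, 1280)  -> mean only
print(TSTP()(x).shape)             # (4, 2560)  -> mean and std concatenated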
573  modelscope/models/audio/sv/rdino.py  Normal file
@@ -0,0 +1,573 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
RDINOHead implementation is adapted from the DINO framework.
"""
import math
import os
from typing import Any, Dict, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi

from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.utils.constant import Tasks


def length_to_mask(length, max_len=None, dtype=None, device=None):
    assert len(length.shape) == 1

    if max_len is None:
        max_len = length.max().long().item()
    mask = torch.arange(
        max_len, device=length.device, dtype=length.dtype).expand(
            len(length), max_len) < length.unsqueeze(1)

    if dtype is None:
        dtype = length.dtype

    if device is None:
        device = length.device

    mask = torch.as_tensor(mask, dtype=dtype, device=device)
    return mask


def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
    if stride > 1:
        n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
        L_out = stride * (n_steps - 1) + kernel_size * dilation
        padding = [kernel_size // 2, kernel_size // 2]

    else:
        L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1

        padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
    return padding


class Conv1d(nn.Module):

    def __init__(
        self,
        out_channels,
        kernel_size,
        in_channels,
        stride=1,
        dilation=1,
        padding='same',
        groups=1,
        bias=True,
        padding_mode='reflect',
    ):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.padding_mode = padding_mode

        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            self.kernel_size,
            stride=self.stride,
            dilation=self.dilation,
            padding=0,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        if self.padding == 'same':
            x = self._manage_padding(x, self.kernel_size, self.dilation,
                                     self.stride)

        elif self.padding == 'causal':
            num_pad = (self.kernel_size - 1) * self.dilation
            x = F.pad(x, (num_pad, 0))

        elif self.padding == 'valid':
            pass

        else:
            raise ValueError(
                "Padding must be 'same', 'valid' or 'causal'. Got "
                + self.padding)

        wx = self.conv(x)

        return wx

    def _manage_padding(
        self,
        x,
        kernel_size: int,
        dilation: int,
        stride: int,
    ):
        L_in = x.shape[-1]
        padding = get_padding_elem(L_in, stride, kernel_size, dilation)
        x = F.pad(x, padding, mode=self.padding_mode)

        return x


class BatchNorm1d(nn.Module):

    def __init__(
        self,
        input_size,
        eps=1e-05,
        momentum=0.1,
    ):
        super().__init__()
        self.norm = nn.BatchNorm1d(
            input_size,
            eps=eps,
            momentum=momentum,
        )

    def forward(self, x):
        return self.norm(x)


class TDNNBlock(nn.Module):

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        dilation,
        activation=nn.ReLU,
        groups=1,
    ):
        super(TDNNBlock, self).__init__()
        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation,
            groups=groups,
        )
        self.activation = activation()
        self.norm = BatchNorm1d(input_size=out_channels)

    def forward(self, x):
        return self.norm(self.activation(self.conv(x)))


class Res2NetBlock(torch.nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 scale=8,
                 kernel_size=3,
                 dilation=1):
        super(Res2NetBlock, self).__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0

        in_channel = in_channels // scale
        hidden_channel = out_channels // scale

        self.blocks = nn.ModuleList([
            TDNNBlock(
                in_channel,
                hidden_channel,
                kernel_size=kernel_size,
                dilation=dilation,
            ) for i in range(scale - 1)
        ])
        self.scale = scale

    def forward(self, x):
        y = []
        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
            if i == 0:
                y_i = x_i
            elif i == 1:
                y_i = self.blocks[i - 1](x_i)
            else:
                y_i = self.blocks[i - 1](x_i + y_i)
            y.append(y_i)
        y = torch.cat(y, dim=1)
        return y


class SEBlock(nn.Module):

    def __init__(self, in_channels, se_channels, out_channels):
        super(SEBlock, self).__init__()

        self.conv1 = Conv1d(
            in_channels=in_channels, out_channels=se_channels, kernel_size=1)
        self.relu = torch.nn.ReLU(inplace=True)
        self.conv2 = Conv1d(
            in_channels=se_channels, out_channels=out_channels, kernel_size=1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, lengths=None):
        L = x.shape[-1]
        if lengths is not None:
            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
            mask = mask.unsqueeze(1)
            total = mask.sum(dim=2, keepdim=True)
            s = (x * mask).sum(dim=2, keepdim=True) / total
        else:
            s = x.mean(dim=2, keepdim=True)

        s = self.relu(self.conv1(s))
        s = self.sigmoid(self.conv2(s))

        return s * x


class AttentiveStatisticsPooling(nn.Module):

    def __init__(self, channels, attention_channels=128, global_context=True):
        super().__init__()

        self.eps = 1e-12
        self.global_context = global_context
        if global_context:
            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
        else:
            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
        self.tanh = nn.Tanh()
        self.conv = Conv1d(
            in_channels=attention_channels,
            out_channels=channels,
            kernel_size=1)

    def forward(self, x, lengths=None):
        L = x.shape[-1]

        def _compute_statistics(x, m, dim=2, eps=self.eps):
            mean = (m * x).sum(dim)
            std = torch.sqrt(
                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
            return mean, std

        if lengths is None:
            lengths = torch.ones(x.shape[0], device=x.device)

        # Make binary mask of shape [N, 1, L]
        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
        mask = mask.unsqueeze(1)

        # Expand the temporal context of the pooling layer by allowing the
        # self-attention to look at global properties of the utterance.
        if self.global_context:
            # torch.std is unstable for backward computation
            # https://github.com/pytorch/pytorch/issues/4320
            total = mask.sum(dim=2, keepdim=True).float()
            mean, std = _compute_statistics(x, mask / total)
            mean = mean.unsqueeze(2).repeat(1, 1, L)
            std = std.unsqueeze(2).repeat(1, 1, L)
            attn = torch.cat([x, mean, std], dim=1)
        else:
            attn = x

        # Apply layers
        attn = self.conv(self.tanh(self.tdnn(attn)))

        # Filter out zero-paddings
        attn = attn.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(attn, dim=2)
        mean, std = _compute_statistics(x, attn)
        # Append mean and std of the batch
        pooled_stats = torch.cat((mean, std), dim=1)
        pooled_stats = pooled_stats.unsqueeze(2)

        return pooled_stats


class SERes2NetBlock(nn.Module):

    def __init__(
        self,
        in_channels,
        out_channels,
        res2net_scale=8,
        se_channels=128,
        kernel_size=1,
        dilation=1,
        activation=torch.nn.ReLU,
        groups=1,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.tdnn1 = TDNNBlock(
            in_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            groups=groups,
        )
        self.res2net_block = Res2NetBlock(out_channels, out_channels,
                                          res2net_scale, kernel_size, dilation)
        self.tdnn2 = TDNNBlock(
            out_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            groups=groups,
        )
        self.se_block = SEBlock(out_channels, se_channels, out_channels)

        self.shortcut = None
        if in_channels != out_channels:
            self.shortcut = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x, lengths=None):
        residual = x
        if self.shortcut:
            residual = self.shortcut(x)

        x = self.tdnn1(x)
        x = self.res2net_block(x)
        x = self.tdnn2(x)
        x = self.se_block(x, lengths)

        return x + residual


class ECAPA_TDNN(nn.Module):
    """An implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
    """

    def __init__(
        self,
        input_size,
        device='cpu',
        lin_neurons=512,
        activation=torch.nn.ReLU,
        channels=[512, 512, 512, 512, 1536],
        kernel_sizes=[5, 3, 3, 3, 1],
        dilations=[1, 2, 3, 4, 1],
        attention_channels=128,
        res2net_scale=8,
        se_channels=128,
        global_context=True,
        groups=[1, 1, 1, 1, 1],
    ):

        super().__init__()
        assert len(channels) == len(kernel_sizes)
        assert len(channels) == len(dilations)
        self.channels = channels
        self.blocks = nn.ModuleList()

        # The initial TDNN layer
        self.blocks.append(
            TDNNBlock(
                input_size,
                channels[0],
                kernel_sizes[0],
                dilations[0],
                activation,
                groups[0],
            ))

        # SE-Res2Net layers
        for i in range(1, len(channels) - 1):
            self.blocks.append(
                SERes2NetBlock(
                    channels[i - 1],
                    channels[i],
                    res2net_scale=res2net_scale,
                    se_channels=se_channels,
                    kernel_size=kernel_sizes[i],
                    dilation=dilations[i],
                    activation=activation,
                    groups=groups[i],
                ))

        # Multi-layer feature aggregation
        self.mfa = TDNNBlock(
            channels[-1],
            channels[-1],
            kernel_sizes[-1],
            dilations[-1],
            activation,
            groups=groups[-1],
        )

        # Attentive Statistical Pooling
        self.asp = AttentiveStatisticsPooling(
            channels[-1],
            attention_channels=attention_channels,
            global_context=global_context,
        )
        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)

        # Final linear transformation
        self.fc = Conv1d(
            in_channels=channels[-1] * 2,
            out_channels=lin_neurons,
            kernel_size=1,
        )

    def forward(self, x, lengths=None):
        """Returns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        """
        x = x.transpose(1, 2)

        xl = []
        for layer in self.blocks:
            try:
                x = layer(x, lengths=lengths)
            except TypeError:
                x = layer(x)
            xl.append(x)

        # Multi-layer feature aggregation
        x = torch.cat(xl[1:], dim=1)
        x = self.mfa(x)

        # Attentive Statistical Pooling
        x = self.asp(x, lengths=lengths)
        x = self.asp_bn(x)

        # Final linear transformation
        x = self.fc(x)

        x = x.transpose(1, 2).squeeze(1)
        return x


class RDINOHead(nn.Module):

    def __init__(self,
                 in_dim,
                 out_dim,
                 use_bn=False,
                 norm_last_layer=True,
                 nlayers=3,
                 hidden_dim=2048,
                 bottleneck_dim=256,
                 add_dim=8192):
        super().__init__()
        nlayers = max(nlayers, 1)
        if nlayers == 1:
            self.mlp = nn.Linear(in_dim, bottleneck_dim)
        else:
            layers = [nn.Linear(in_dim, hidden_dim)]
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.GELU())
            for _ in range(nlayers - 2):
                layers.append(nn.Linear(hidden_dim, hidden_dim))
                if use_bn:
                    layers.append(nn.BatchNorm1d(hidden_dim))
                layers.append(nn.GELU())

            layers.append(nn.Linear(hidden_dim, add_dim))
            self.mlp = nn.Sequential(*layers)
        self.add_layer = nn.Linear(add_dim, bottleneck_dim)
        self.apply(self._init_weights)
        self.last_layer = nn.utils.weight_norm(
            nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)
        if norm_last_layer:
            self.last_layer.weight_g.requires_grad = False

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            torch.nn.init.trunc_normal_(m.weight, std=.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        vicr_out = self.mlp(x)
        x = self.add_layer(vicr_out)
        x = nn.functional.normalize(x, dim=-1, p=2)
        x = self.last_layer(x)
        return vicr_out, x


class Combine(nn.Module):

    def __init__(self, backbone, head):
        super(Combine, self).__init__()
        self.backbone = backbone
        self.head = head

    def forward(self, x):
        x = self.backbone(x)
        output = self.head(x)
        return output


@MODELS.register_module(
    Tasks.speaker_verification, module_name=Models.rdino_tdnn_sv)
class SpeakerVerification_RDINO(TorchModel):

    def __init__(self, model_dir, model_config: Dict[str, Any], *args,
                 **kwargs):
        super().__init__(model_dir, model_config, *args, **kwargs)
        self.model_config = model_config
        self.other_config = kwargs
        if self.model_config['channel'] != 1024:
            raise ValueError(
                'modelscope error: Currently only 1024-channel ecapa tdnn is supported.'
            )

        self.feature_dim = 80
        channels_config = [1024, 1024, 1024, 1024, 3072]

        self.embedding_model = ECAPA_TDNN(
            self.feature_dim, channels=channels_config)
        self.embedding_model = Combine(self.embedding_model,
                                       RDINOHead(512, 65536, True))

        pretrained_model_name = kwargs['pretrained_model']
        self.__load_check_point(pretrained_model_name)

        self.embedding_model.eval()

    def forward(self, audio):
        assert len(audio.shape) == 2 and audio.shape[
            0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
        # audio shape: [1, T]
        feature = self.__extract_feature(audio)
        embedding = self.embedding_model.backbone(feature)

        return embedding

    def __extract_feature(self, audio):
        feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
        feature = feature - feature.mean(dim=0, keepdim=True)
        feature = feature.unsqueeze(0)
        return feature

    def __load_check_point(self, pretrained_model_name, device=None):
        if not device:
            device = torch.device('cpu')
        state_dict = torch.load(
            os.path.join(self.model_dir, pretrained_model_name),
            map_location=device)
        state_dict_tea = {
            k.replace('module.', ''): v
            for k, v in state_dict['teacher'].items()
        }
        self.embedding_model.load_state_dict(state_dict_tea, strict=True)
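Note that SpeakerVerification_RDINO only runs the ECAPA-TDNN backbone at inference time (self.embedding_model.backbone); the RDINOHead is kept so that the self-supervised teacher checkpoint loads with strict=True. A backbone-only sketch using the classes above, mirroring the 1024-channel channels_config; the input tensor is a placeholder:

# sketch only: ECAPA-TDNN backbone as used by the RDINO speaker-verification model
import torch
from modelscope.models.audio.sv.rdino import ECAPA_TDNN

backbone = ECAPA_TDNN(80, channels=[1024, 1024, 1024, 1024, 3072]).eval()
feat = torch.randn(1, 300, 80)     # (batch, time, mel) fbank features
with torch.no_grad():
    emb = backbone(feat)           # lin_neurons defaults to 512 above
print(emb.shape)                   # (1, 512)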
319  modelscope/models/audio/sv/speaker_change_locator.py  Normal file
@@ -0,0 +1,319 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from collections import OrderedDict
from typing import Any, Dict, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.compliance.kaldi as Kaldi

from modelscope.metainfo import Models
from modelscope.models import MODELS, TorchModel
from modelscope.models.audio.sv.DTDNN import CAMPPlus
from modelscope.utils.constant import Tasks


class MultiHeadSelfAttention(nn.Module):

    def __init__(self, n_units, h=8, dropout=0.1):
        super(MultiHeadSelfAttention, self).__init__()
        self.linearQ = nn.Linear(n_units, n_units)
        self.linearK = nn.Linear(n_units, n_units)
        self.linearV = nn.Linear(n_units, n_units)
        self.linearO = nn.Linear(n_units, n_units)
        self.d_k = n_units // h
        self.h = h
        self.dropout = nn.Dropout(p=dropout)
        self.att = None

    def forward(self, x, batch_size):
        # x: (BT, F)
        q = self.linearQ(x).reshape(batch_size, -1, self.h, self.d_k)
        k = self.linearK(x).reshape(batch_size, -1, self.h, self.d_k)
        v = self.linearV(x).reshape(batch_size, -1, self.h, self.d_k)
        scores = torch.matmul(q.transpose(1, 2), k.permute(
            0, 2, 3, 1)) / np.sqrt(self.d_k)
        # scores: (B, h, T, T)
        self.att = F.softmax(scores, dim=3)
        p_att = self.dropout(self.att)
        # v : (B, T, h, d_k)
        # p_att : (B, h, T, T)
        x = torch.matmul(p_att, v.transpose(1, 2))
        # x : (B, h, T, d_k)
        x = x.transpose(1, 2).reshape(-1, self.h * self.d_k)
        return self.linearO(x)


class PositionwiseFeedForward(nn.Module):

    def __init__(self, n_units, d_units, dropout):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(n_units, d_units)
        self.linear2 = nn.Linear(d_units, n_units)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        return self.linear2(self.dropout(F.relu(self.linear1(x))))


class PosEncoding(nn.Module):

    def __init__(self, max_seq_len, d_word_vec):
        super(PosEncoding, self).__init__()
        pos_enc = np.array([[
            pos / np.power(10000, 2.0 * (j // 2) / d_word_vec)
            for j in range(d_word_vec)
        ] for pos in range(max_seq_len)])
        pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2])
        pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2])
        pad_row = np.zeros([1, d_word_vec])
        pos_enc = np.concatenate([pad_row, pos_enc]).astype(np.float32)

        self.pos_enc = torch.nn.Embedding(max_seq_len + 1, d_word_vec)
        self.pos_enc.weight = torch.nn.Parameter(
            torch.from_numpy(pos_enc), requires_grad=False)

    def forward(self, input_len):
        max_len = torch.max(input_len)
        input_pos = torch.LongTensor([
            list(range(1, len + 1)) + [0] * (max_len - len)
            for len in input_len
        ])

        return self.pos_enc(input_pos)


class TransformerEncoder(nn.Module):

    def __init__(self,
                 idim,
                 n_units=256,
                 n_layers=2,
                 e_units=512,
                 h=4,
                 dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.linear_in = nn.Linear(idim, n_units)
        self.lnorm_in = nn.LayerNorm(n_units)

        self.n_layers = n_layers
        self.dropout = nn.Dropout(p=dropout)
        for i in range(n_layers):
            setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('self_att_', i),
                    MultiHeadSelfAttention(n_units, h))
            setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('ff_', i),
                    PositionwiseFeedForward(n_units, e_units, dropout))
        self.lnorm_out = nn.LayerNorm(n_units)

    def forward(self, x):
        # x: [B, num_anchors, T, n_in]
        bs, num, tframe, dim = x.size()
        x = x.reshape(bs * num, tframe, -1)  # [B*num_anchors, T, dim]
        # x: (B, T, F) ... batch, time, (mel)freq
        B_size, T_size, _ = x.shape
        # e: (BT, F)
        e = self.linear_in(x.reshape(B_size * T_size, -1))
        # Encoder stack
        for i in range(self.n_layers):
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
            # self-attention
            s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
            # residual
            e = e + self.dropout(s)
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
            # positionwise feed-forward
            s = getattr(self, '{}{:d}'.format('ff_', i))(e)
            # residual
            e = e + self.dropout(s)
        # final layer normalization
        # output: (BT, F)
        # output: (B, F, T)
        output = self.lnorm_out(e).reshape(B_size, T_size, -1)
        output = output.reshape(bs, num, tframe,
                                -1)  # [B, num_anchors, T, dim]
        return output


class TransformerEncoder_out(nn.Module):

    def __init__(self,
                 idim,
                 n_units=256,
                 n_layers=2,
                 e_units=512,
                 h=4,
                 dropout=0.1):
        super(TransformerEncoder_out, self).__init__()
        self.linear_in = nn.Linear(idim, n_units)
        self.lnorm_in = nn.LayerNorm(n_units)

        self.n_layers = n_layers
        self.dropout = nn.Dropout(p=dropout)
        for i in range(n_layers):
            setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('self_att_', i),
                    MultiHeadSelfAttention(n_units, h))
            setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
            setattr(self, '{}{:d}'.format('ff_', i),
                    PositionwiseFeedForward(n_units, e_units, dropout))
        self.lnorm_out = nn.LayerNorm(n_units)

    def forward(self, x):
        # x: (B, T, F)
        B_size, T_size, _ = x.shape
        # e: (BT, F)
        e = self.linear_in(x.reshape(B_size * T_size, -1))
        # Encoder stack
        for i in range(self.n_layers):
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
            # self-attention
            s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
            # residual
            e = e + self.dropout(s)
            # layer normalization
            e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
            # positionwise feed-forward
            s = getattr(self, '{}{:d}'.format('ff_', i))(e)
            # residual
            e = e + self.dropout(s)
        # final layer normalization
        # output: (BT, F)
        # output: (B, T, F)
        output = self.lnorm_out(e).reshape(B_size, T_size, -1)
        return output


class OutLayer(nn.Module):

    def __init__(self, n_units=256, num_anchors=2):
        super(OutLayer, self).__init__()
        self.combine = TransformerEncoder_out(num_anchors * n_units, n_units)
        self.out_linear = nn.Linear(n_units // num_anchors, 1)

    def forward(self, input):
        # input: [B, num_anchors, T, dim]
        bs, num, tframe, dim = input.size()
        output = input.permute(0, 2, 1,
                               3).reshape(bs, tframe,
                                          -1)  # [Bs, t, num_anchors*dim]
        output = self.combine(output)  # [Bs, t, n_units]
        output = output.reshape(
            bs, tframe, num, -1)  # [Bs, t, num_anchors, n_units//num_anchors]
        output = self.out_linear(output).squeeze(-1)  # [Bs, t, num_anchors]

        return output


class TransformerDetector(nn.Module):

    def __init__(self,
                 frame_dim=512,
                 anchor_dim=192,
                 hidden_dim=256,
                 max_seq_len=1000):
        super(TransformerDetector, self).__init__()
        self.detection = TransformerEncoder(
            idim=frame_dim + anchor_dim, n_units=hidden_dim)
        self.output = OutLayer(n_units=hidden_dim)
        self.pos_enc = PosEncoding(max_seq_len, hidden_dim)

    def forward(self, feats, anchors):
        # feats: [1, t, fdim]
        num_frames = feats.shape[1]
        num_anchors = anchors.shape[1]
        bs = feats.shape[0]
        feats = feats.unsqueeze(1).repeat(
            1, num_anchors, 1, 1)  # shape: [Bs, num_anchors, t, fdim]
        anchors = anchors.unsqueeze(2).repeat(
            1, 1, num_frames, 1)  # shape: [Bs, num_anchors, t, xdim]
        sd_in = torch.cat((feats, anchors),
                          dim=-1)  # shape: [Bs, num_anchors, t, fdim+xdim]
        sd_out = self.detection(sd_in)  # shape: [Bs, num_anchors, t, sd_dim]

        # pos
        pos_emb = self.pos_enc(torch.tensor([num_frames] * (bs * num_anchors)))
        pos_emb = pos_emb.reshape(bs, num_anchors, num_frames, -1)
        sd_out += pos_emb

        # output
        output = self.output(sd_out)  # shape: [Bs, t, num_anchors]

        return output


@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.scl_sd)
class SpeakerChangeLocatorTransformer(TorchModel):
    r"""A speaker change locator using the transformer architecture as the backbone.
    Args:
        model_dir: A model dir.
        model_config: The model config.
    """

    def __init__(self, model_dir, model_config: Dict[str, Any], *args,
                 **kwargs):
        super().__init__(model_dir, model_config, *args, **kwargs)
        self.model_config = model_config

        self.feature_dim = self.model_config['fbank_dim']
        frame_size = self.model_config['frame_size']
        anchor_size = self.model_config['anchor_size']

        self.encoder = CAMPPlus(self.feature_dim, output_level='frame')
        self.backend = TransformerDetector(
            frame_dim=frame_size, anchor_dim=anchor_size)

        pretrained_encoder = kwargs['pretrained_encoder']
        pretrained_backend = kwargs['pretrained_backend']

        self.__load_check_point(pretrained_encoder, pretrained_backend)

        self.encoder.eval()
        self.backend.eval()

    def forward(self, audio, anchors):
        assert len(audio.shape) == 2 and audio.shape[
            0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
        assert len(
            anchors.shape
        ) == 3 and anchors.shape[0] == 1 and anchors.shape[
            1] == 2, 'modelscope error: the shape of input anchors to model needs to be [1, 2, D]'
        # audio shape: [1, T]
        feature = self.__extract_feature(audio)
        frame_state = self.encoder(feature)
        output = self.backend(frame_state, anchors)
        output = output.squeeze(0).detach().cpu().sigmoid()

        time_scale_factor = int(np.ceil(feature.shape[1] / output.shape[0]))
        output = output.unsqueeze(1).expand(-1, time_scale_factor,
                                            -1).reshape(-1, output.shape[-1])
        return output

    def __extract_feature(self, audio):
        feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
        feature = feature - feature.mean(dim=0, keepdim=True)
        feature = feature.unsqueeze(0)
        return feature

    def __load_check_point(self,
                           pretrained_encoder,
                           pretrained_backend,
                           device=None):
        if not device:
            device = torch.device('cpu')
        self.encoder.load_state_dict(
            torch.load(
                os.path.join(self.model_dir, pretrained_encoder),
                map_location=device))

        self.backend.load_state_dict(
            torch.load(
                os.path.join(self.model_dir, pretrained_backend),
                map_location=device))
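The locator pairs a frame-level CAMPPlus encoder (output_level='frame', as added earlier in this change) with a transformer backend that scores, for every frame, each of the two anchor speaker embeddings; forward then applies a sigmoid and upsamples the scores back to the feature frame rate. A shape-level sketch of the backend alone, since the full model expects pretrained checkpoints in model_dir; all tensors are placeholders:

# sketch only: transformer change-detection backend on dummy frame states and anchors
import torch
from modelscope.models.audio.sv.speaker_change_locator import TransformerDetector

backend = TransformerDetector(frame_dim=512, anchor_dim=192).eval()
frame_state = torch.randn(1, 120, 512)   # [1, t, frame_dim] frame-level encoder output
anchors = torch.randn(1, 2, 192)         # [1, 2, anchor_dim] embeddings of the two candidate speakers
with torch.no_grad():
    scores = backend(frame_state, anchors)
print(scores.shape)                      # (1, 120, 2): one score per frame and anchor speaker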
@@ -17,11 +17,9 @@ from kantts.train.trainer import GAN_Trainer, Sambert_Trainer, distributed_init
from kantts.utils.ling_unit.ling_unit import KanTtsLinguisticUnit
from torch.utils.data import DataLoader

from modelscope import __version__
from modelscope.utils.audio.audio_utils import TtsCustomParams
from modelscope.utils.audio.tts_exceptions import (
    TtsModelConfigurationException, TtsModelNotExistsException)
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()
@@ -394,6 +392,7 @@ class Voice:
        logger.info(f'TRAINING steps: {train_max_steps}')
        config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
        from modelscope import __version__
        config['modelscope_version'] = __version__

        with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
@@ -558,6 +557,7 @@ class Voice:
        logger.info(f'resume from: {resume_from}')
        config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
        from modelscope import __version__
        config['modelscope_version'] = __version__

        with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
@@ -4,9 +4,8 @@
from . import (action_recognition, animal_recognition, bad_image_detecting,
               body_2d_keypoints, body_3d_keypoints, cartoon,
               cmdssl_video_embedding, controllable_image_generation,
               crowd_counting, face_2d_keypoints, face_detection,
               crowd_counting, face_detection, face_generation,
               face_generation, face_reconstruction, human_reconstruction,
               face_reconstruction, human_reconstruction, image_classification,
               human_wholebody_keypoint, image_classification,
               image_color_enhance, image_colorization, image_defrcn_fewshot,
               image_denoise, image_inpainting, image_instance_segmentation,
               image_matching, image_mvs_depth_estimation,
@@ -72,7 +72,7 @@ class PoseHighResolutionNetV2(TorchModel):
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=True)
        """final four layers"""
        last_inp_channels = np.int(np.sum(pre_stage_channels))
        last_inp_channels = int(np.sum(pre_stage_channels))
        self.final_layer = nn.Sequential(
            nn.Conv2d(
                in_channels=last_inp_channels,
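This hunk and the similar CV hunks below are mechanical fixes for NumPy deprecations: the scalar aliases np.int, np.float, np.bool and np.str were removed in NumPy 1.24, so they are replaced with the builtin types (or np.str_ where a NumPy string dtype is still wanted). Illustrative only:

# sketch only: deprecated NumPy scalar aliases and their replacements
import numpy as np

channels = int(np.sum([32, 64, 128]))          # was: np.int(np.sum(...))
mask = np.ones((4, 4), dtype=bool)             # was: dtype=np.bool
ids = np.array(['a01', 'a02'], dtype=np.str_)  # was: dtype=np.str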
@@ -81,7 +81,7 @@ class FaceLandmark:
        bbox[2] = center[0] + one_edge // 2
        bbox[3] = center[1] + one_edge // 2

        bbox = bbox.astype(np.int)
        bbox = bbox.astype(int)
        crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
        h, w, _ = crop_image.shape
        crop_image = cv2.resize(
@@ -356,7 +356,7 @@ class HighResolutionNet(nn.Module):
                                           num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)
        last_inp_channels = np.int(np.sum(pre_stage_channels)) + 256
        last_inp_channels = int(np.sum(pre_stage_channels)) + 256
        self.redc_layer = nn.Sequential(
            nn.Conv2d(
                in_channels=last_inp_channels,
@@ -1,25 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.base import BaseModel
from easycv.utils.ms_utils import EasyCVMeta

from modelscope.models.base import TorchModel


class EasyCVBaseModel(BaseModel, TorchModel):
    """Base model for EasyCV."""

    def __init__(self, model_dir=None, args=(), kwargs={}):
        kwargs.pop(EasyCVMeta.ARCH, None)  # pop useless keys
        BaseModel.__init__(self)
        TorchModel.__init__(self, model_dir=model_dir)

    def forward(self, img, mode='train', **kwargs):
        if self.training:
            losses = self.forward_train(img, **kwargs)
            loss, log_vars = self._parse_losses(losses)
            return dict(loss=loss, log_vars=log_vars)
        else:
            return self.forward_test(img, **kwargs)

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)
@@ -1,20 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .face_2d_keypoints_align import Face2DKeypoints

else:
    _import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.face.face_keypoint import FaceKeypoint

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
    group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):

    def __init__(self, model_dir=None, *args, **kwargs):
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        FaceKeypoint.__init__(self, *args, **kwargs)
@@ -82,7 +82,7 @@ class FaceLandmark:
        bbox[2] = center[0] + one_edge // 2
        bbox[3] = center[1] + one_edge // 2

        bbox = bbox.astype(np.int)
        bbox = bbox.astype(int)
        crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
        h, w, _ = crop_image.shape
        crop_image = cv2.resize(crop_image,
@@ -1,20 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .hand_2d_keypoints import Hand2dKeyPoints

else:
    _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
@@ -1,16 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose import TopDown

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
    group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):

    def __init__(self, model_dir=None, *args, **kwargs):
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        TopDown.__init__(self, *args, **kwargs)
@@ -31,7 +31,7 @@ class human_segmenter(object):
             img = np.dstack((img, img, img))
         elif img.shape[2] == 4:
             img = img[:, :, :3]
-        img = img.astype(np.float)
+        img = img.astype(float)
         return img
 
     def run(self, img):
@@ -69,8 +69,8 @@ def eval_grid(coords,
               num_samples=512 * 512 * 512):
    resolution = coords.shape[1:4]
    sdf = np.zeros(resolution)
-   dirty = np.ones(resolution, dtype=np.bool)
-   grid_mask = np.zeros(resolution, dtype=np.bool)
+   dirty = np.ones(resolution, dtype=bool)
+   grid_mask = np.zeros(resolution, dtype=bool)
    reso = resolution[0] // init_resolution
 
    while reso > 0:
@@ -1,17 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.pose.top_down import TopDown
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.human_wholebody_keypoint,
-    module_name=Models.human_wholebody_keypoint)
-class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        TopDown.__init__(self, *args, **kwargs)
@@ -163,7 +163,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
                     os.path.join(split_dir,
                                  'box_{}shot_{}_train.txt'.format(shot,
                                                                   cls))) as f:
-                fileids_ = np.loadtxt(f, dtype=np.str).tolist()
+                fileids_ = np.loadtxt(f, dtype=np.str_).tolist()
                 if isinstance(fileids_, str):
                     fileids_ = [fileids_]
                 fileids_ = [
@@ -219,7 +219,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
         with PathManager.open(
                 os.path.join(root, dirname, 'ImageSets', 'Main',
                              split + '.txt')) as f:
-            fileids = np.loadtxt(f, dtype=np.str)
+            fileids = np.loadtxt(f, dtype=np.str_)
 
         for fileid in fileids:
             anno_file = os.path.join(root, dirname, 'Annotations',
@@ -8,10 +8,12 @@ if TYPE_CHECKING:
     from .maskdino_swin import MaskDINOSwin
     from .model import CascadeMaskRCNNSwinModel
     from .maskdino_model import MaskDINOSwinModel
+    from .fastinst_model import FastInst
     from .postprocess_utils import get_img_ins_seg_result, get_maskdino_ins_seg_result
 else:
     _import_structure = {
         'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
+        'fastinst_model': ['FastInst'],
         'maskdino_swin': ['MaskDINOSwin'],
         'model': ['CascadeMaskRCNNSwinModel'],
         'maskdino_model': ['MaskDINOSwinModel'],
@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .swin_transformer import SwinTransformer
     from .swin_transformer import D2SwinTransformer
+    from .resnet import build_resnet_backbone
 
 else:
     _import_structure = {
         'swin_transformer': ['SwinTransformer', 'D2SwinTransformer'],
+        'resnet': ['build_resnet_backbone']
     }
 
     import sys
@@ -0,0 +1,114 @@
+# Part of the implementation is borrowed and modified from Detectron2, publicly available at
+# https://github.com/facebookresearch/detectron2/blob/main/projects/DeepLab/deeplab/resnet.py
+
+import torch.nn.functional as F
+from torch import nn
+
+from modelscope.models.cv.image_human_parsing.backbone.deeplab_resnet import (
+    BottleneckBlock, DeeplabResNet, get_norm)
+from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
+    Conv2d
+
+
+class BasicStem(nn.Module):
+    """
+    The standard ResNet stem (layers before the first residual block),
+    with a conv, relu and max_pool.
+    """
+
+    def __init__(self, in_channels=3, out_channels=64, norm='BN'):
+        """
+        Args:
+            norm (str or callable): norm after the first conv layer.
+                See :func:`layers.get_norm` for supported format.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.stride = 4
+        self.conv1 = Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=7,
+            stride=2,
+            padding=3,
+            bias=False,
+            norm=get_norm(norm, out_channels),
+        )
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = F.relu_(x)
+        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+        return x
+
+
+def build_resnet_backbone(out_features, depth, num_groups, width_per_group,
+                          norm, stem_out_channels, res2_out_channels,
+                          stride_in_1x1, res4_dilation, res5_dilation,
+                          res5_multi_grid, input_shape):
+    stem = BasicStem(
+        in_channels=input_shape['channels'],
+        out_channels=stem_out_channels,
+        norm=norm)
+    bottleneck_channels = num_groups * width_per_group
+    in_channels = stem_out_channels
+    out_channels = res2_out_channels
+
+    assert res4_dilation in {
+        1, 2
+    }, 'res4_dilation cannot be {}.'.format(res4_dilation)
+    assert res5_dilation in {
+        1, 2, 4
+    }, 'res5_dilation cannot be {}.'.format(res5_dilation)
+    if res4_dilation == 2:
+        # Always dilate res5 if res4 is dilated.
+        assert res5_dilation == 4
+
+    num_blocks_per_stage = {
+        50: [3, 4, 6, 3],
+        101: [3, 4, 23, 3],
+        152: [3, 8, 36, 3]
+    }[depth]
+
+    stages = []
+    out_stage_idx = [{
+        'res2': 2,
+        'res3': 3,
+        'res4': 4,
+        'res5': 5
+    }[f] for f in out_features]
+    max_stage_idx = max(out_stage_idx)
+    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
+        if stage_idx == 4:
+            dilation = res4_dilation
+        elif stage_idx == 5:
+            dilation = res5_dilation
+        else:
+            dilation = 1
+        first_stride = 1 if idx == 0 or dilation > 1 else 2
+        stride_per_block = [first_stride]
+        stride_per_block += [1] * (num_blocks_per_stage[idx] - 1)
+        stage_kargs = {
+            'num_blocks': num_blocks_per_stage[idx],
+            'stride_per_block': stride_per_block,
+            'in_channels': in_channels,
+            'out_channels': out_channels,
+            'norm': norm,
+            'bottleneck_channels': bottleneck_channels,
+            'stride_in_1x1': stride_in_1x1,
+            'dilation': dilation,
+            'num_groups': num_groups,
+            'block_class': BottleneckBlock
+        }
+        if stage_idx == 5:
+            stage_kargs.pop('dilation')
+            stage_kargs['dilation_per_block'] = [
+                dilation * mg for mg in res5_multi_grid
+            ]
+        blocks = DeeplabResNet.make_stage(**stage_kargs)
+        in_channels = out_channels
+        out_channels *= 2
+        bottleneck_channels *= 2
+        stages.append(blocks)
+    return DeeplabResNet(stem, stages, out_features=out_features)
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
@@ -0,0 +1,351 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import math
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from modelscope.models.cv.image_colorization.ddcolor.utils.transformer_utils import (
+    MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer)
+
+
+class QueryProposal(nn.Module):
+
+    def __init__(self, num_features, num_queries, num_classes):
+        super().__init__()
+        self.topk = num_queries
+        self.num_classes = num_classes
+
+        self.conv_proposal_cls_logits = nn.Sequential(
+            nn.Conv2d(
+                num_features, num_features, kernel_size=3, stride=1,
+                padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(
+                num_features,
+                num_classes + 1,
+                kernel_size=1,
+                stride=1,
+                padding=0),
+        )
+
+    @torch.no_grad()
+    def compute_coordinates(self, x):
+        h, w = x.size(2), x.size(3)
+        y_loc = torch.linspace(0, 1, h, device=x.device)
+        x_loc = torch.linspace(0, 1, w, device=x.device)
+        y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
+        locations = torch.stack([x_loc, y_loc], 0).unsqueeze(0)
+        return locations
+
+    def seek_local_maximum(self, x, epsilon=1e-6):
+        """
+        inputs:
+            x: torch.tensor, shape [b, c, h, w]
+        return:
+            torch.tensor, shape [b, c, h, w]
+        """
+        x_pad = F.pad(x, (1, 1, 1, 1), 'constant', 0)
+        # top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
+        maximum = (x >= x_pad[:, :, :-2, 1:-1]) & \
+                  (x >= x_pad[:, :, 2:, 1:-1]) & \
+                  (x >= x_pad[:, :, 1:-1, :-2]) & \
+                  (x >= x_pad[:, :, 1:-1, 2:]) & \
+                  (x >= x_pad[:, :, :-2, :-2]) & \
+                  (x >= x_pad[:, :, :-2, 2:]) & \
+                  (x >= x_pad[:, :, 2:, :-2]) & \
+                  (x >= x_pad[:, :, 2:, 2:]) & \
+                  (x >= epsilon)
+        return maximum.to(x)
+
+    def forward(self, x, pos_embeddings):
+
+        proposal_cls_logits = self.conv_proposal_cls_logits(x)  # b, c, h, w
+        proposal_cls_probs = proposal_cls_logits.softmax(dim=1)  # b, c, h, w
+        proposal_cls_one_hot = F.one_hot(
+            proposal_cls_probs[:, :-1, :, :].max(1)[1],
+            num_classes=self.num_classes + 1).permute(0, 3, 1, 2)  # b, c, h, w
+        proposal_cls_probs = proposal_cls_probs.mul(proposal_cls_one_hot)
+        proposal_local_maximum_map = self.seek_local_maximum(
+            proposal_cls_probs)  # b, c, h, w
+        proposal_cls_probs = proposal_cls_probs + proposal_local_maximum_map  # b, c, h, w
+
+        # top-k indices
+        topk_indices = torch.topk(
+            proposal_cls_probs[:, :-1, :, :].flatten(2).max(1)[0],
+            self.topk,
+            dim=1)[1]  # b, q
+        topk_indices = topk_indices.unsqueeze(1)  # b, 1, q
+
+        # topk queries
+        topk_proposals = torch.gather(
+            x.flatten(2), dim=2, index=topk_indices.repeat(1, x.shape[1],
+                                                           1))  # b, c, q
+        pos_embeddings = pos_embeddings.repeat(x.shape[0], 1, 1, 1).flatten(2)
+        topk_pos_embeddings = torch.gather(
+            pos_embeddings,
+            dim=2,
+            index=topk_indices.repeat(1, pos_embeddings.shape[1],
+                                      1))  # b, c, q
+        if self.training:
+            locations = self.compute_coordinates(x).repeat(x.shape[0], 1, 1, 1)
+            topk_locations = torch.gather(
+                locations.flatten(2),
+                dim=2,
+                index=topk_indices.repeat(1, locations.shape[1], 1))
+            topk_locations = topk_locations.transpose(-1, -2)  # b, q, 2
+        else:
+            topk_locations = None
+        return topk_proposals, topk_pos_embeddings, topk_locations, proposal_cls_logits
+
+
+class FastInstDecoder(nn.Module):
+
+    def __init__(self, in_channels, *, num_classes: int, hidden_dim: int,
+                 num_queries: int, num_aux_queries: int, nheads: int,
+                 dim_feedforward: int, dec_layers: int, pre_norm: bool,
+                 mask_dim: int):
+        """
+        Args:
+            in_channels: channels of the input features
+            num_classes: number of classes
+            hidden_dim: Transformer feature dimension
+            num_queries: number of queries
+            num_aux_queries: number of auxiliary queries
+            nheads: number of heads
+            dim_feedforward: feature dimension in feedforward network
+            dec_layers: number of Transformer decoder layers
+            pre_norm: whether to use pre-LayerNorm or not
+            mask_dim: mask feature dimension
+        """
+        super().__init__()
+        self.num_heads = nheads
+        self.num_layers = dec_layers
+        self.num_queries = num_queries
+        self.num_aux_queries = num_aux_queries
+        self.num_classes = num_classes
+
+        meta_pos_size = int(round(math.sqrt(self.num_queries)))
+        self.meta_pos_embed = nn.Parameter(
+            torch.empty(1, hidden_dim, meta_pos_size, meta_pos_size))
+        if num_aux_queries > 0:
+            self.empty_query_features = nn.Embedding(num_aux_queries,
+                                                     hidden_dim)
+            self.empty_query_pos_embed = nn.Embedding(num_aux_queries,
+                                                      hidden_dim)
+
+        self.query_proposal = QueryProposal(hidden_dim, num_queries,
+                                            num_classes)
+
+        self.transformer_query_cross_attention_layers = nn.ModuleList()
+        self.transformer_query_self_attention_layers = nn.ModuleList()
+        self.transformer_query_ffn_layers = nn.ModuleList()
+        self.transformer_mask_cross_attention_layers = nn.ModuleList()
+        self.transformer_mask_ffn_layers = nn.ModuleList()
+        for idx in range(self.num_layers):
+            self.transformer_query_cross_attention_layers.append(
+                CrossAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_query_self_attention_layers.append(
+                SelfAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_query_ffn_layers.append(
+                FFNLayer(
+                    d_model=hidden_dim,
+                    dim_feedforward=dim_feedforward,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_mask_cross_attention_layers.append(
+                CrossAttentionLayer(
+                    d_model=hidden_dim,
+                    nhead=nheads,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+            self.transformer_mask_ffn_layers.append(
+                FFNLayer(
+                    d_model=hidden_dim,
+                    dim_feedforward=dim_feedforward,
+                    dropout=0.0,
+                    normalize_before=pre_norm))
+
+        self.decoder_query_norm_layers = nn.ModuleList()
+        self.class_embed_layers = nn.ModuleList()
+        self.mask_embed_layers = nn.ModuleList()
+        self.mask_features_layers = nn.ModuleList()
+        for idx in range(self.num_layers + 1):
+            self.decoder_query_norm_layers.append(nn.LayerNorm(hidden_dim))
+            self.class_embed_layers.append(
+                MLP(hidden_dim, hidden_dim, num_classes + 1, 3))
+            self.mask_embed_layers.append(
+                MLP(hidden_dim, hidden_dim, mask_dim, 3))
+            self.mask_features_layers.append(nn.Linear(hidden_dim, mask_dim))
+
+    def forward(self, x, mask_features, targets=None):
+        bs = x[0].shape[0]
+        proposal_size = x[1].shape[-2:]
+        pixel_feature_size = x[2].shape[-2:]
+
+        pixel_pos_embeds = F.interpolate(
+            self.meta_pos_embed,
+            size=pixel_feature_size,
+            mode='bilinear',
+            align_corners=False)
+        proposal_pos_embeds = F.interpolate(
+            self.meta_pos_embed,
+            size=proposal_size,
+            mode='bilinear',
+            align_corners=False)
+
+        pixel_features = x[2].flatten(2).permute(2, 0, 1)
+        pixel_pos_embeds = pixel_pos_embeds.flatten(2).permute(2, 0, 1)
+
+        query_features, query_pos_embeds, query_locations, proposal_cls_logits = self.query_proposal(
+            x[1], proposal_pos_embeds)
+        query_features = query_features.permute(2, 0, 1)
+        query_pos_embeds = query_pos_embeds.permute(2, 0, 1)
+        if self.num_aux_queries > 0:
+            aux_query_features = self.empty_query_features.weight.unsqueeze(
+                1).repeat(1, bs, 1)
+            aux_query_pos_embed = self.empty_query_pos_embed.weight.unsqueeze(
+                1).repeat(1, bs, 1)
+            query_features = torch.cat([query_features, aux_query_features],
+                                       dim=0)
+            query_pos_embeds = torch.cat(
+                [query_pos_embeds, aux_query_pos_embed], dim=0)
+
+        outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
+            query_features,
+            pixel_features,
+            pixel_feature_size,
+            -1,
+            return_attn_mask=True)
+        predictions_class = [outputs_class]
+        predictions_mask = [outputs_mask]
+        predictions_matching_index = [None]
+        query_feature_memory = [query_features]
+        pixel_feature_memory = [pixel_features]
+
+        for i in range(self.num_layers):
+            query_features, pixel_features = self.forward_one_layer(
+                query_features, pixel_features, query_pos_embeds,
+                pixel_pos_embeds, attn_mask, i)
+            if i < self.num_layers - 1:
+                outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
+                    query_features,
+                    pixel_features,
+                    pixel_feature_size,
+                    i,
+                    return_attn_mask=True,
+                )
+            else:
+                outputs_class, outputs_mask, _, matching_indices, gt_attn_mask = self.forward_prediction_heads(
+                    query_features,
+                    pixel_features,
+                    pixel_feature_size,
+                    i,
+                )
+            predictions_class.append(outputs_class)
+            predictions_mask.append(outputs_mask)
+            predictions_matching_index.append(None)
+            query_feature_memory.append(query_features)
+            pixel_feature_memory.append(pixel_features)
+
+        out = {
+            'proposal_cls_logits':
+            proposal_cls_logits,
+            'query_locations':
+            query_locations,
+            'pred_logits':
+            predictions_class[-1],
+            'pred_masks':
+            predictions_mask[-1],
+            'pred_indices':
+            predictions_matching_index[-1],
+            'aux_outputs':
+            self._set_aux_loss(predictions_class, predictions_mask,
+                               predictions_matching_index, query_locations)
+        }
+        return out
+
+    def forward_one_layer(self, query_features, pixel_features,
+                          query_pos_embeds, pixel_pos_embeds, attn_mask, i):
+        pixel_features = self.transformer_mask_cross_attention_layers[i](
+            pixel_features,
+            query_features,
+            query_pos=pixel_pos_embeds,
+            pos=query_pos_embeds)
+        pixel_features = self.transformer_mask_ffn_layers[i](pixel_features)
+
+        query_features = self.transformer_query_cross_attention_layers[i](
+            query_features,
+            pixel_features,
+            memory_mask=attn_mask,
+            query_pos=query_pos_embeds,
+            pos=pixel_pos_embeds)
+        query_features = self.transformer_query_self_attention_layers[i](
+            query_features, query_pos=query_pos_embeds)
+        query_features = self.transformer_query_ffn_layers[i](query_features)
+        return query_features, pixel_features
+
+    def forward_prediction_heads(self,
+                                 query_features,
+                                 pixel_features,
+                                 pixel_feature_size,
+                                 idx_layer,
+                                 return_attn_mask=False,
+                                 return_gt_attn_mask=False,
+                                 targets=None,
+                                 query_locations=None):
+        decoder_query_features = self.decoder_query_norm_layers[idx_layer + 1](
+            query_features[:self.num_queries])
+        decoder_query_features = decoder_query_features.transpose(0, 1)
+        if idx_layer + 1 == self.num_layers:
+            outputs_class = self.class_embed_layers[idx_layer + 1](
+                decoder_query_features)
+        else:
+            outputs_class = None
+        outputs_mask_embed = self.mask_embed_layers[idx_layer + 1](
+            decoder_query_features)
+        outputs_mask_features = self.mask_features_layers[idx_layer + 1](
+            pixel_features.transpose(0, 1))
+
+        outputs_mask = torch.einsum('bqc,blc->bql', outputs_mask_embed,
+                                    outputs_mask_features)
+        outputs_mask = outputs_mask.reshape(-1, self.num_queries,
+                                            *pixel_feature_size)
+
+        if return_attn_mask:
+            # outputs_mask.shape: b, q, h, w
+            attn_mask = F.pad(outputs_mask,
+                              (0, 0, 0, 0, 0, self.num_aux_queries),
+                              'constant', 1)
+            attn_mask = (attn_mask < 0.).flatten(2)  # b, q, hw
+            invalid_query = attn_mask.all(-1, keepdim=True)  # b, q, 1
+            attn_mask = (~invalid_query) & attn_mask  # b, q, hw
+            attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1,
+                                                      1).flatten(0, 1)
+            attn_mask = attn_mask.detach()
+        else:
+            attn_mask = None
+
+        matching_indices = None
+        gt_attn_mask = None
+
+        return outputs_class, outputs_mask, attn_mask, matching_indices, gt_attn_mask
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_seg_masks, output_indices,
+                      output_query_locations):
+        return [{
+            'query_locations': output_query_locations,
+            'pred_logits': a,
+            'pred_masks': b,
+            'pred_matching_indices': c
+        } for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1],
+                             output_indices[:-1])]
@@ -0,0 +1,180 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import logging
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
+    Conv2d
+
+
+# This is a modified FPN decoder.
+class BaseFPN(nn.Module):
+
+    def __init__(
+        self,
+        input_shape,
+        *,
+        convs_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            convs_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__()
+
+        input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride'])
+        self.in_features = [k for k, v in input_shape
+                            ]  # starting from "res3" to "res5"
+        feature_channels = [v['channels'] for k, v in input_shape]
+
+        lateral_convs = []
+        output_convs = []
+
+        use_bias = norm == ''
+        for idx, in_channels in enumerate(feature_channels):
+            lateral_norm = nn.GroupNorm(32, convs_dim)
+            output_norm = nn.GroupNorm(32, convs_dim)
+
+            lateral_conv = Conv2d(
+                in_channels,
+                convs_dim,
+                kernel_size=1,
+                bias=use_bias,
+                norm=lateral_norm)
+            output_conv = Conv2d(
+                convs_dim,
+                convs_dim,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=use_bias,
+                norm=output_norm,
+                activation=F.relu,
+            )
+            self.add_module('adapter_{}'.format(idx + 1), lateral_conv)
+            self.add_module('layer_{}'.format(idx + 1), output_conv)
+
+            lateral_convs.append(lateral_conv)
+            output_convs.append(output_conv)
+        # Place convs into top-down order (from low to high resolution)
+        # to make the top-down computation in forward clearer.
+        self.lateral_convs = lateral_convs[::-1]
+        self.output_convs = output_convs[::-1]
+
+        self.convs_dim = convs_dim
+        self.num_feature_levels = 3  # always use 3 scales
+
+    def forward_features(self, features):
+        multi_scale_features = []
+        num_cur_levels = 0
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if idx == 0:
+                y = lateral_conv(x)
+            else:
+                cur_fpn = lateral_conv(x)
+                y = cur_fpn + F.interpolate(
+                    y,
+                    size=cur_fpn.shape[-2:],
+                    mode='bilinear',
+                    align_corners=False)
+                y = output_conv(y)
+
+            if num_cur_levels < self.num_feature_levels:
+                multi_scale_features.append(y)
+                num_cur_levels += 1
+        return None, multi_scale_features
+
+    def forward(self, features, targets=None):
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            'Calling forward() may cause unpredicted behavior of PixelDecoder module.'
+        )
+        return self.forward_features(features)
+
+
+class PyramidPoolingModule(nn.Module):
+
+    def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)):
+        super().__init__()
+        self.stages = []
+        self.stages = nn.ModuleList(
+            [self._make_stage(in_channels, channels, size) for size in sizes])
+        self.bottleneck = Conv2d(in_channels + len(sizes) * channels,
+                                 in_channels, 1)
+
+    def _make_stage(self, features, out_features, size):
+        prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
+        conv = Conv2d(features, out_features, 1)
+        return nn.Sequential(prior, conv)
+
+    def forward(self, feats):
+        h, w = feats.size(2), feats.size(3)
+        priors = [
+            F.interpolate(
+                input=F.relu_(stage(feats)),
+                size=(h, w),
+                mode='bilinear',
+                align_corners=False) for stage in self.stages
+        ] + [feats]
+        out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
+        return out
+
+
+class PyramidPoolingModuleFPN(BaseFPN):
+
+    def __init__(
+        self,
+        input_shape,
+        *,
+        convs_dim: int,
+        mask_dim: int,
+        norm: Optional[Union[str, Callable]] = None,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            input_shape: shapes (channels and stride) of the input features
+            convs_dim: number of output channels for the intermediate conv layers.
+            mask_dim: number of output channels for the final conv layer.
+            norm (str or callable): normalization for all conv layers
+        """
+        super().__init__(
+            input_shape, convs_dim=convs_dim, mask_dim=mask_dim, norm=norm)
+        self.ppm = PyramidPoolingModule(convs_dim, convs_dim // 4)
+
+    def forward_features(self, features):
+        multi_scale_features = []
+        num_cur_levels = 0
+        # Reverse feature maps into top-down order (from low to high resolution)
+        for idx, f in enumerate(self.in_features[::-1]):
+            x = features[f]
+            lateral_conv = self.lateral_convs[idx]
+            output_conv = self.output_convs[idx]
+            if idx == 0:
+                y = self.ppm(lateral_conv(x))
+            else:
+                cur_fpn = lateral_conv(x)
+                y = cur_fpn + F.interpolate(
+                    y,
+                    size=cur_fpn.shape[-2:],
+                    mode='bilinear',
+                    align_corners=False)
+                y = output_conv(y)
+
+            if num_cur_levels < self.num_feature_levels:
+                multi_scale_features.append(y)
+                num_cur_levels += 1
+
+        return None, multi_scale_features
@@ -0,0 +1,221 @@
+# Part of implementation is borrowed and modified from Mask2Former, publicly available at
+# https://github.com/facebookresearch/Mask2Former.
+import os
+from typing import Any, Dict, List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modelscope.metainfo import Models
+from modelscope.models.base import TorchModel
+from modelscope.models.builder import MODELS
+from modelscope.models.cv.image_instance_segmentation.maskdino_swin import \
+    ImageList
+from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.logger import get_logger
+from .backbones import build_resnet_backbone
+from .fastinst.fastinst_decoder import FastInstDecoder
+from .fastinst.fastinst_encoder import PyramidPoolingModuleFPN
+
+logger = get_logger()
+
+
+@MODELS.register_module(Tasks.image_segmentation, module_name=Models.fastinst)
+class FastInst(TorchModel):
+
+    def __init__(self,
+                 model_dir,
+                 backbone=None,
+                 encoder=None,
+                 decoder=None,
+                 pretrained=None,
+                 classes=None,
+                 **kwargs):
+        """
+        Deep Learning Technique for Human Parsing: A Survey and Outlook. See https://arxiv.org/abs/2301.00394
+        Args:
+            backbone (dict): backbone config.
+            encoder (dict): encoder config.
+            decoder (dict): decoder config.
+            pretrained (bool): whether to use pretrained model
+            classes (list): class names
+        """
+        super(FastInst, self).__init__(model_dir, **kwargs)
+
+        self.backbone = build_resnet_backbone(
+            **backbone, input_shape={'channels': 3})
+        in_features = encoder.pop('in_features')
+        input_shape = {
+            k: v
+            for k, v in self.backbone.output_shape().items()
+            if k in in_features
+        }
+        encoder = PyramidPoolingModuleFPN(input_shape=input_shape, **encoder)
+        decoder = FastInstDecoder(in_channels=encoder.convs_dim, **decoder)
+        self.sem_seg_head = FastInstHead(
+            pixel_decoder=encoder, transformer_predictor=decoder)
+
+        self.num_classes = decoder.num_classes
+        self.num_queries = decoder.num_queries
+        self.size_divisibility = 32
+        self.register_buffer(
+            'pixel_mean',
+            torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False)
+        self.register_buffer(
+            'pixel_std',
+            torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False)
+        self.classes = classes
+        self.test_topk_per_image = 100
+
+        if pretrained:
+            model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
+            logger.info(f'loading model from {model_path}')
+            weight = torch.load(model_path, map_location='cpu')['model']
+            tgt_weight = self.state_dict()
+            for name in list(weight.keys()):
+                if name in tgt_weight:
+                    load_size = weight[name].size()
+                    tgt_size = tgt_weight[name].size()
+                    mis_match = False
+                    if len(load_size) != len(tgt_size):
+                        mis_match = True
+                    else:
+                        for n1, n2 in zip(load_size, tgt_size):
+                            if n1 != n2:
+                                mis_match = True
+                                break
+                    if mis_match:
+                        logger.info(
+                            f'size mismatch for {name} '
+                            f'({load_size} -> {tgt_size}), skip loading.')
+                        del weight[name]
+                else:
+                    logger.info(
+                        f'{name} doesn\'t exist in current model, skip loading.'
+                    )
+
+            self.load_state_dict(weight, strict=False)
+            logger.info('load model done')
+
+    def forward(self, batched_inputs: List[dict]) -> Dict[str, Any]:
+        images = [x['image'].to(self.device) for x in batched_inputs]
+        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+        images = ImageList.from_tensors(images, self.size_divisibility)
+
+        features = self.backbone(images.tensor)
+        outputs = self.sem_seg_head(features)
+
+        return dict(
+            outputs=outputs, batched_inputs=batched_inputs, images=images)
+
+    def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
+        outputs = input['outputs']
+        batched_inputs = input['batched_inputs']
+        images = input['images']
+        if self.training:
+            raise NotImplementedError
+        else:
+            mask_cls_results = outputs['pred_logits']  # (B, Q, C+1)
+            mask_pred_results = outputs['pred_masks']  # (B, Q, H, W)
+            # upsample masks
+            mask_pred_results = F.interpolate(
+                mask_pred_results,
+                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
+                mode='bilinear',
+                align_corners=False,
+            )
+
+            del outputs
+
+            processed_results = []
+            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
+                    mask_cls_results, mask_pred_results, batched_inputs,
+                    images.image_sizes):
+                height = input_per_image.get('height', image_size[0])
+                width = input_per_image.get('width', image_size[1])
+                processed_results.append({})  # for each image
+
+                mask_pred_result = self.sem_seg_postprocess(
+                    mask_pred_result, image_size, height, width)
+                mask_cls_result = mask_cls_result.to(mask_pred_result)
+
+                instance_r = self.instance_inference(mask_cls_result,
+                                                     mask_pred_result)
+                processed_results[-1]['instances'] = instance_r
+
+            return dict(eval_result=processed_results)
+
+    @property
+    def device(self):
+        return self.pixel_mean.device
+
+    def sem_seg_postprocess(self, result, img_size, output_height,
+                            output_width):
+        result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
+        result = F.interpolate(
+            result,
+            size=(output_height, output_width),
+            mode='bilinear',
+            align_corners=False)[0]
+        return result
+
+    def instance_inference(self, mask_cls, mask_pred):
+        # mask_pred is already processed to have the same shape as original input
+        image_size = mask_pred.shape[-2:]
+
+        # [Q, K]
+        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+        labels = torch.arange(
+            self.num_classes,
+            device=self.device).unsqueeze(0).repeat(self.num_queries,
+                                                    1).flatten(0, 1)
+        scores_per_image, topk_indices = scores.flatten(0, 1).topk(
+            self.test_topk_per_image, sorted=False)
+        labels_per_image = labels[topk_indices]
+
+        topk_indices = topk_indices // self.num_classes
+        mask_pred = mask_pred[topk_indices]
+
+        result = {'image_size': image_size}
+        # mask (before sigmoid)
+        mask_pred_sigmoid = mask_pred.sigmoid()
+        result['pred_masks'] = (mask_pred_sigmoid > 0.5).float()
+
+        # calculate average mask prob
+        mask_scores_per_image = (mask_pred_sigmoid.flatten(1)
+                                 * result['pred_masks'].flatten(1)).sum(1) / (
+                                     result['pred_masks'].flatten(1).sum(1)
+                                     + 1e-6)
+        result['scores'] = scores_per_image * mask_scores_per_image
+        result['pred_classes'] = labels_per_image
+        return result
+
+
+class FastInstHead(nn.Module):
+
+    def __init__(
+            self,
+            *,
+            pixel_decoder: nn.Module,
+            # extra parameters
+            transformer_predictor: nn.Module):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            pixel_decoder: the pixel decoder module
+            transformer_predictor: the transformer decoder that makes prediction
+        """
+        super().__init__()
+        self.pixel_decoder = pixel_decoder
+        self.predictor = transformer_predictor
+
+    def forward(self, features, targets=None):
+        return self.layers(features, targets)
+
+    def layers(self, features, targets=None):
+        mask_features, multi_scale_features = self.pixel_decoder.forward_features(
+            features)
+        predictions = self.predictor(multi_scale_features, mask_features,
+                                     targets)
+        return predictions
@@ -108,16 +108,16 @@ def get_img_ins_seg_result(img_seg_result=None,
     for seg_result in img_seg_result:
 
         box = [
-            np.int(seg_result[0]),
-            np.int(seg_result[1]),
-            np.int(seg_result[2]),
-            np.int(seg_result[3])
+            int(seg_result[0]),
+            int(seg_result[1]),
+            int(seg_result[2]),
+            int(seg_result[3])
         ]
-        score = np.float(seg_result[4])
+        score = float(seg_result[4])
         category = seg_result[5]
 
         mask = np.array(seg_result[6], order='F', dtype='uint8')
-        mask = mask.astype(np.float)
+        mask = mask.astype(float)
 
         results_dict[OutputKeys.BOXES].append(box)
         results_dict[OutputKeys.MASKS].append(mask)
@@ -382,7 +382,7 @@ def processing_single_scene(args):
                     points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1],
                     points3d[p3d_id].xyz[2], 1
                 ])
-                zs.append(np.asscalar(transformed[2]))
+                zs.append(transformed[2].item())
             zs_sorted = sorted(zs)
             # relaxed depth range
             max_ratio = 0.1
@@ -40,7 +40,7 @@ def read_mask(filename):
 
 # save a binary mask
 def save_mask(filename, mask):
-    assert mask.dtype == np.bool
+    assert mask.dtype == bool
     mask = mask.astype(np.uint8) * 255
     Image.fromarray(mask).save(filename)
 
@@ -5,7 +5,6 @@ from modelscope.utils.import_utils import LazyImportModule
 
 if TYPE_CHECKING:
     from .panseg_model import SwinLPanopticSegmentation
-    from .r50_panseg_model import R50PanopticSegmentation
 
 else:
     _import_structure = {
@@ -1,18 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-from easycv.models.segmentation import Mask2Former
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_segmentation,
-    module_name=Models.r50_panoptic_segmentation)
-class R50PanopticSegmentation(EasyCVBaseModel, Mask2Former):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        Mask2Former.__init__(self, *args, **kwargs)
@@ -1,16 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.segmentation import EncoderDecoder
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_segmentation, module_name=Models.segformer)
-class Segformer(EasyCVBaseModel, EncoderDecoder):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        EncoderDecoder.__init__(self, *args, **kwargs)
@@ -60,7 +60,7 @@ class SemanticSegmentation(TorchModel):
         ids = ids[legal_indices]
 
         segms = (semantic_result[None] == ids[:, None, None])
-        masks = [it.astype(np.int) for it in segms]
+        masks = [it.astype(int) for it in segms]
         labels_txt = np.array(self.CLASSES)[ids].tolist()
 
         results = {
@@ -458,7 +458,7 @@ class HrnetBackBone(nn.Module):
         self.stage4, pre_stage_channels = self._make_stage(
             self.stage4_cfg, num_channels, multi_scale_output=True)
 
-        self.backbone_last_inp_channels = np.int(np.sum(pre_stage_channels))
+        self.backbone_last_inp_channels = int(np.sum(pre_stage_channels))
 
     def _make_transition_layer(self, num_channels_pre_layer,
                                num_channels_cur_layer):
@@ -259,7 +259,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
         num_channels = [64, last_inp_channels]
         self.stage_super, super_stage_channels = self._make_stage(
             self.super_dict, num_channels)
-        last_inp_channels = np.int(np.sum(super_stage_channels))
+        last_inp_channels = int(np.sum(super_stage_channels))
 
         if self.is_contain_aspp:
             aspp_param = kwargs['aspp']
@@ -372,7 +372,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
         num_channels = [64, ocr_mid_channels]
         self.stage_super, super_stage_channels = self._make_stage(
             self.super_dict, num_channels)
-        last_inp_channels = np.int(np.sum(super_stage_channels))
+        last_inp_channels = int(np.sum(super_stage_channels))
 
         self.cls_head = nn.Sequential(
             nn.Conv2d(
@@ -13,7 +13,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torchvision.transforms as TF
 from PIL import Image
-from shotdetect_scenedetect_lgss import shot_detect
+from shotdetect_scenedetect_lgss import shot_detector
+from tqdm import tqdm
 
 from modelscope.metainfo import Models
 from modelscope.models.base.base_torch_model import TorchModel
@@ -60,6 +61,9 @@ class MovieSceneSegmentationModel(TorchModel):
         self.head_sbd = nn.Linear(hdim, 2)
         load_param_with_prefix('head_sbd', self.head_sbd, params)
 
+        self.shot_detector = shot_detector()
+        self.shot_detector.init(**self.cfg.preprocessor.shot_detect)
+
         self.test_transform = TF.Compose([
             TF.Resize(size=256, interpolation=Image.BICUBIC),
             TF.CenterCrop(224),
@@ -98,29 +102,45 @@ class MovieSceneSegmentationModel(TorchModel):
     def inference(self, batch):
         logger.info('Begin scene detect ......')
         bs = self.cfg.pipeline.batch_size_per_gpu
-        sids = batch['sid']
-        inputs = batch['shot_feat']
+        device = self.crn.attention_mask.device
 
-        shot_num = len(sids)
+        shot_timecode_lst = batch['shot_timecode_lst']
+        shot_idx_lst = batch['shot_idx_lst']
+
+        shot_num = len(shot_timecode_lst)
         cnt = math.ceil(shot_num / bs)
 
-        infer_sid, infer_pred = [], []
+        infer_pred = []
         infer_result = {}
-        for i in range(cnt):
+        self.shot_detector.start()
+
+        for i in tqdm(range(cnt)):
             start = i * bs
             end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
-            input_ = inputs[start:end]
-            sid_ = sids[start:end]
-            input_ = torch.stack(input_)
+            batch_shot_idx_lst = shot_idx_lst[start:end]
+            shot_start_idx = batch_shot_idx_lst[0][0]
+            shot_end_idx = batch_shot_idx_lst[-1][-1]
+            batch_timecode_lst = {
+                i: shot_timecode_lst[i]
+                for i in range(shot_start_idx, shot_end_idx + 1)
+            }
+            batch_shot_keyf_lst = self.shot_detector.get_frame_img(
+                batch_timecode_lst, shot_start_idx, shot_num)
+            inputs = self.get_batch_input(batch_shot_keyf_lst, shot_start_idx,
+                                          batch_shot_idx_lst)
+
+            input_ = torch.stack(inputs).to(device)
             outputs = self.shared_step(input_)  # shape [b,2]
             prob = F.softmax(outputs, dim=1)
-            infer_sid.extend(sid_.cpu().detach().numpy())
             infer_pred.extend(prob[:, 1].cpu().detach().numpy())
-        infer_result.update({'pred': np.stack(infer_pred)})
-        infer_result.update({'sid': infer_sid})
 
-        assert len(infer_result['sid']) == len(sids)
-        assert len(infer_result['pred']) == len(inputs)
+        infer_result.update({'pred': np.stack(infer_pred)})
+        infer_result.update({'sid': np.arange(shot_num)})
+
+        assert len(infer_result['pred']) == shot_num
+        self.shot_detector.release()
         return infer_result
 
     def shared_step(self, inputs):
@@ -162,38 +182,48 @@ class MovieSceneSegmentationModel(TorchModel):
         logger.info('Generate scene .......')
 
         pred_dict = inputs['feat']
+        shot2keyf = inputs['shot2keyf']
         thres = self.cfg.pipeline.save_threshold
 
         anno_dict = get_pred_boundary(pred_dict, thres)
         scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
-            self.shot2keyf, anno_dict)
+            shot2keyf, anno_dict)
         if self.cfg.pipeline.save_split_scene:
             re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
             print(f'Split scene video saved to {re_dir}')
         return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
 
-    def preprocess(self, inputs):
-        logger.info('Begin shot detect......')
-        shot_keyf_lst, anno, shot2keyf = shot_detect(
-            inputs, **self.cfg.preprocessor.shot_detect)
-        logger.info('Shot detect done!')
-
-        single_shot_feat, sid = [], []
+    def get_batch_input(self, shot_keyf_lst, shot_start_idx, shot_idx_lst):
+        single_shot_feat = []
         for idx, one_shot in enumerate(shot_keyf_lst):
             one_shot = [
                 self.test_transform(one_frame) for one_frame in one_shot
             ]
             one_shot = torch.stack(one_shot, dim=0)
             single_shot_feat.append(one_shot)
-            sid.append(idx)
         single_shot_feat = torch.stack(single_shot_feat, dim=0)
 
         shot_feat = []
+        for idx, shot_idx in enumerate(shot_idx_lst):
+            shot_idx_ = shot_idx - shot_start_idx
+            _one_shot = single_shot_feat[shot_idx_]
+            shot_feat.append(_one_shot)
+
+        return shot_feat
+
+    def preprocess(self, inputs):
+        logger.info('Begin shot detect......')
+        shot_timecode_lst, anno, shot2keyf = self.shot_detector.shot_detect(
+            inputs, **self.cfg.preprocessor.shot_detect)
+        logger.info('Shot detect done!')
+
+        shot_idx_lst = []
         for idx, one_shot in enumerate(anno):
             shot_idx = int(one_shot['shot_id']) + np.arange(
                 -self.neighbor_size, self.neighbor_size + 1)
-            shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])
-            _one_shot = single_shot_feat[shot_idx]
-            shot_feat.append(_one_shot)
-        self.shot2keyf = shot2keyf
-        self.anno = anno
-        return shot_feat, sid
+            shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'] - 1)
+            shot_idx_lst.append(shot_idx)
+
+        return shot2keyf, anno, shot_timecode_lst, shot_idx_lst
@@ -10,11 +10,12 @@ from tqdm import tqdm
 
 
 def get_pred_boundary(pred_dict, threshold=0.5):
-    pred = pred_dict['pred']
+    pred = pred_dict['pred'].cpu().numpy()
+    sid = pred_dict['sid'].cpu().numpy().astype(np.int32)
     tmp = (pred > threshold).astype(np.int32)
     anno_dict = {}
     for idx in range(len(tmp)):
-        anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])})
+        anno_dict.update({str(sid[idx]).zfill(4): int(tmp[idx])})
     return anno_dict
 
 
@@ -31,7 +31,7 @@ class ObjectSegmenter(object):
         elif img.shape[2] == 4:
             img = img[:, :, :3]
         img = img[:, :, ::-1]
-        img = img.astype(np.float)
+        img = img.astype(float)
         return img
 
     def run_mask(self, img):
@@ -1,16 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.detection.detectors import Detection as _Detection
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_object_detection, module_name=Models.dino)
-class DINO(EasyCVBaseModel, _Detection):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        _Detection.__init__(self, *args, **kwargs)
@@ -1,21 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from easycv.models.detection.detectors import YOLOX as _YOLOX
-
-from modelscope.metainfo import Models
-from modelscope.models.builder import MODELS
-from modelscope.models.cv.easycv_base import EasyCVBaseModel
-from modelscope.utils.constant import Tasks
-
-
-@MODELS.register_module(
-    group_key=Tasks.image_object_detection, module_name=Models.yolox)
-@MODELS.register_module(
-    group_key=Tasks.image_object_detection,
-    module_name=Models.image_object_detection_auto)
-@MODELS.register_module(
-    group_key=Tasks.domain_specific_object_detection, module_name=Models.yolox)
-class YOLOX(EasyCVBaseModel, _YOLOX):
-
-    def __init__(self, model_dir=None, *args, **kwargs):
-        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
-        _YOLOX.__init__(self, *args, **kwargs)
@@ -30,7 +30,7 @@ def depth2color(depth):
     if gray == 1:
         return tuple(colors[-1].tolist())
     num_rank = len(colors) - 1
-    rank = np.floor(gray * num_rank).astype(np.int)
+    rank = np.floor(gray * num_rank).astype(int)
    diff = (gray - rank / num_rank) * num_rank
    tmp = colors[rank + 1] - colors[rank]
    return tuple((colors[rank] + tmp * diff).tolist())
@@ -136,7 +136,7 @@ def plot_result(res_path,
     l2g = get_lidar2global(infos)
     corners_lidar = corners_global @ np.linalg.inv(l2g).T
     corners_lidar = corners_lidar[:, :3]
-    pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=np.bool)
+    pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=bool)
     scores = [
         pred_res[rid]['detection_score'] for rid in range(len(pred_res))
     ]
@@ -151,7 +151,7 @@ def plot_result(res_path,
             origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
         corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
                                        axis=0)
-        gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=np.bool)
+        gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=bool)
         pred_flag = np.concatenate(
             [pred_flag, np.logical_not(gt_flag)], axis=0)
         scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
@@ -169,7 +169,7 @@ def plot_result(res_path,
             check_point_in_img(corners_img, img.shape[0], img.shape[1]))
         valid = valid.reshape(
             -1, 8)  # valid means: d>0 and visible in current view
-        corners_img = corners_img.reshape(-1, 8, 2).astype(np.int)
+        corners_img = corners_img.reshape(-1, 8, 2).astype(int)
         for aid in range(valid.shape[0]):
             if scores[aid] < vis_thred and pred_flag[aid]:
                 continue
@@ -90,8 +90,15 @@ class OCRRecognition(TorchModel):
                 f'recognizer should be either ConvNextViT, CRNN, but got {cfgs.model.recognizer}'
             )
         if model_path != '':
-            self.recognizer.load_state_dict(
-                torch.load(model_path, map_location='cpu'))
+            params_pretrained = torch.load(model_path, map_location='cpu')
+            model_dict = self.recognizer.state_dict()
+            # remove prefix for finetuned models
+            check_point = {
+                k.replace('recognizer.', ''): v
+                for k, v in params_pretrained.items()
+            }
+            model_dict.update(check_point)
+            self.recognizer.load_state_dict(model_dict)

         dict_path = os.path.join(model_dir, ModelFile.VOCAB_FILE)
         self.labelMapping = dict()
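The OCR hunk above now routes the checkpoint through the model's own state dict so that a leftover 'recognizer.' key prefix from fine-tuned checkpoints can be stripped before loading. A standalone sketch of the same idea (the function name and arguments here are illustrative, not part of the patch):

import torch


def load_stripped(module: torch.nn.Module, ckpt_path: str, prefix: str = 'recognizer.'):
    # Load the raw checkpoint on CPU, drop the prefix from every key, then
    # merge into the module's state dict so unmatched keys keep their values.
    params = torch.load(ckpt_path, map_location='cpu')
    state = module.state_dict()
    state.update({k.replace(prefix, ''): v for k, v in params.items()})
    module.load_state_dict(state)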
@@ -176,8 +176,7 @@ class OpenVocabularyDetectionViLD(Model):
         # Filter out invalid rois (nmsed rois)
         valid_indices = np.where(
             np.logical_and(
-                np.isin(
-                    np.arange(len(roi_scores), dtype=np.int), nmsed_indices),
+                np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
                 np.logical_and(
                     np.logical_not(np.all(roi_boxes == 0., axis=-1)),
                     np.logical_and(roi_scores >= min_rpn_score_thresh,
@@ -72,7 +72,7 @@ class Cube2Equirec(nn.Module):
             self.equ_h, 0), 3 * self.equ_w // 8, 1)

         # Prepare ceil mask
-        mask = np.zeros((self.equ_h, self.equ_w // 4), np.bool)
+        mask = np.zeros((self.equ_h, self.equ_w // 4), bool)
         idx = np.linspace(-np.pi, np.pi, self.equ_w // 4) / 4
         idx = self.equ_h // 2 - np.round(
             np.arctan(np.cos(idx)) * self.equ_h / np.pi).astype(int)
@@ -29,7 +29,7 @@ def load_depth(file):
     elif file.endswith('png'):
         depth_png = np.array(load_image(file), dtype=int)
         assert (np.max(depth_png) > 255), 'Wrong .png depth file'
-        return depth_png.astype(np.float) / 256.
+        return depth_png.astype(float) / 256.
     else:
         raise NotImplementedError('Depth extension not supported.')

@@ -85,7 +85,7 @@ def do_scene_detect(F01_tensor, F10_tensor, img0_tensor, img1_tensor):
     img_diff = ori_img.float() - ref_img.float()
     img_diff = torch.abs(img_diff)

-    kernel = np.ones([8, 8], np.float) / 64
+    kernel = np.ones([8, 8], float) / 64
     kernel = torch.FloatTensor(kernel).to(device).unsqueeze(0).unsqueeze(0)
     diff = F.conv2d(img_diff, kernel, padding=4)

@@ -27,7 +27,7 @@ def linear_assignment(cost_matrix, thresh):


 def ious(atlbrs, btlbrs):
-    ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float)
+    ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=float)
     if ious.size == 0:
         return ious

@@ -60,13 +60,13 @@ def embedding_distance(tracks, detections, metric='cosine'):
|
|||||||
cost_matrix: np.ndarray
|
cost_matrix: np.ndarray
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float)
|
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
|
||||||
if cost_matrix.size == 0:
|
if cost_matrix.size == 0:
|
||||||
return cost_matrix
|
return cost_matrix
|
||||||
det_features = np.asarray([track.curr_feat for track in detections],
|
det_features = np.asarray([track.curr_feat for track in detections],
|
||||||
dtype=np.float)
|
dtype=float)
|
||||||
track_features = np.asarray([track.smooth_feat for track in tracks],
|
track_features = np.asarray([track.smooth_feat for track in tracks],
|
||||||
dtype=np.float)
|
dtype=float)
|
||||||
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
|
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
|
||||||
return cost_matrix
|
return cost_matrix
|
||||||
|
|
||||||
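For reference, the embedding_distance change above only swaps the dtype; the cost matrix itself is still a cosine-distance table built with SciPy. A self-contained sketch with dummy feature vectors (shapes are illustrative):

import numpy as np
from scipy.spatial.distance import cdist

track_features = np.random.rand(5, 128)   # smoothed embeddings of 5 tracks
det_features = np.random.rand(3, 128)     # embeddings of 3 new detections

# Cosine distance lies in [0, 2]; clip tiny negatives from rounding error.
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, 'cosine'))
print(cost_matrix.shape)  # (5, 3)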
@@ -28,7 +28,7 @@ class STrack(BaseTrack):
     def __init__(self, tlwh, score, temp_feat, buffer_size=30):

         # wait activate
-        self._tlwh = np.asarray(tlwh, dtype=np.float)
+        self._tlwh = np.asarray(tlwh, dtype=float)
         self.kalman_filter = None
         self.mean, self.covariance = None, None
         self.is_activated = False
@@ -20,6 +20,8 @@ if TYPE_CHECKING:
     from .vldoc import VLDocForDocVLEmbedding
     from .video_synthesis import TextToVideoSynthesis
     from .efficient_diffusion_tuning import EfficientStableDiffusion
+    from .mplug_owl import MplugOwlForConditionalGeneration
+    from .clip_interrogator import CLIP_Interrogator

 else:
     _import_structure = {
@@ -37,7 +39,9 @@ else:
         ['MultiStageDiffusionForTextToImageSynthesis'],
         'vldoc': ['VLDocForDocVLEmbedding'],
         'video_synthesis': ['TextToVideoSynthesis'],
-        'efficient_diffusion_tuning': ['EfficientStableDiffusion']
+        'efficient_diffusion_tuning': ['EfficientStableDiffusion'],
+        'mplug_owl': ['MplugOwlForConditionalGeneration'],
+        'clip_interrogator': ['CLIP_Interrogator'],
     }

     import sys
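Both hunks above register the new mplug_owl and clip_interrogator modules in the TYPE_CHECKING branch and in _import_structure; an entry has to appear in both places for the lazy-import machinery to resolve it. A condensed sketch of the pattern, assuming the LazyImportModule wiring already used by the existing __init__ files:

from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .clip_interrogator import CLIP_Interrogator
    from .mplug_owl import MplugOwlForConditionalGeneration
else:
    _import_structure = {
        'clip_interrogator': ['CLIP_Interrogator'],
        'mplug_owl': ['MplugOwlForConditionalGeneration'],
    }

    import sys

    # Replace this package module with a lazy proxy that imports submodules
    # only when their attributes are first accessed.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )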
@@ -0,0 +1 @@
+from .model import CLIP_Interrogator
modelscope/models/multi_modal/clip_interrogator/model.py (new file, 599 lines)
@@ -0,0 +1,599 @@
|
|||||||
|
# This implementation is adopted from CLIP-Interrogator, made pubicly available under the MIT License at
|
||||||
|
# https://github.com/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator/clip_interrogator.py
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import open_clip
|
||||||
|
import requests
|
||||||
|
import torch
|
||||||
|
import torchvision.transforms as transforms
|
||||||
|
from PIL import Image
|
||||||
|
from safetensors.numpy import load_file, save_file
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import (AutoModelForCausalLM, AutoProcessor,
|
||||||
|
Blip2ForConditionalGeneration,
|
||||||
|
BlipForConditionalGeneration)
|
||||||
|
|
||||||
|
from modelscope.metainfo import Models
|
||||||
|
from modelscope.models.base import TorchModel
|
||||||
|
from modelscope.models.builder import MODELS
|
||||||
|
from modelscope.outputs import OutputKeys
|
||||||
|
from modelscope.preprocessors import LoadImage
|
||||||
|
from modelscope.utils.constant import ModelFile, Tasks
|
||||||
|
from modelscope.utils.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
__all__ = ['CLIP_Interrogator']
|
||||||
|
|
||||||
|
CAPTION_MODELS = {
|
||||||
|
'blip-base': 'blip-image-captioning-base',
|
||||||
|
'blip-large': 'blip-image-captioning-large',
|
||||||
|
'blip2-2.7b': 'blip2-opt-2.7b',
|
||||||
|
'blip2-flan-t5-xl': 'blip2-flan-t5-xl',
|
||||||
|
'git-large-coco': 'git-large-coco',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Config:
|
||||||
|
# models can optionally be passed in directly
|
||||||
|
caption_model = None
|
||||||
|
caption_processor = None
|
||||||
|
clip_model = None
|
||||||
|
clip_preprocess = None
|
||||||
|
|
||||||
|
# blip settings
|
||||||
|
caption_max_length: int = 32
|
||||||
|
caption_model_name: Optional[
|
||||||
|
str] = 'blip-large' # use a key from CAPTION_MODELS or None
|
||||||
|
caption_offload: bool = False
|
||||||
|
|
||||||
|
# clip settings
|
||||||
|
clip_model_name: str = 'ViT-L-14/openai'
|
||||||
|
clip_model_path: Optional[str] = None
|
||||||
|
clip_offload: bool = False
|
||||||
|
|
||||||
|
# interrogator settings
|
||||||
|
cache_path: str = 'cache' # path to store cached text embeddings
|
||||||
|
download_cache: bool = False # when true, cached embeds are downloaded from huggingface
|
||||||
|
chunk_size: int = 2048 # batch size for CLIP, use smaller for lower VRAM
|
||||||
|
data_path: str = os.path.join(os.path.dirname(__file__), 'data')
|
||||||
|
device: str = ('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
|
flavor_intermediate_count: int = 2048
|
||||||
|
quiet: bool = False # when quiet progress bars are not shown
|
||||||
|
|
||||||
|
def apply_low_vram_defaults(self):
|
||||||
|
self.caption_model_name = 'blip-base'
|
||||||
|
self.caption_offload = True
|
||||||
|
self.clip_offload = True
|
||||||
|
self.chunk_size = 1024
|
||||||
|
self.flavor_intermediate_count = 1024
|
||||||
|
|
||||||
|
|
||||||
|
# CLIP-Interrogator utilize CLIP and BLIP to generate rich caption for images.
|
||||||
|
# CLIP is a zero-shot image classifier which can be used to generate image and text embeddings.
|
||||||
|
# BLIP is a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks.
|
||||||
|
# BLIP effectively utilizes the noisy web data by bootstrapping the captions, where
|
||||||
|
# a captioner generates synthetic captions and a filter removes the noisy ones.
|
||||||
|
# Please infer to the paper CLIP: Learning Transferable Visual Models From Natural Language Supervision
|
||||||
|
# https://arxiv.org/abs/2103.00020
|
||||||
|
# BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
|
||||||
|
# https://arxiv.org/abs/2201.12086
|
||||||
|
|
||||||
|
|
||||||
|
class Interrogator():
|
||||||
|
|
||||||
|
def __init__(self, config: Config):
|
||||||
|
self.config = config
|
||||||
|
self.device = config.device
|
||||||
|
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||||
|
self.caption_offloaded = True
|
||||||
|
self.clip_offloaded = True
|
||||||
|
self.load_caption_model()
|
||||||
|
self.load_clip_model()
|
||||||
|
|
||||||
|
def load_caption_model(self):
|
||||||
|
if self.config.caption_model is None and self.config.caption_model_name:
|
||||||
|
if not self.config.quiet:
|
||||||
|
print(
|
||||||
|
f'Loading caption model {self.config.caption_model_name}...'
|
||||||
|
)
|
||||||
|
|
||||||
|
model_path = CAPTION_MODELS[self.config.caption_model_name]
|
||||||
|
if self.config.caption_model_name.startswith('git-'):
|
||||||
|
caption_model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path),
|
||||||
|
torch_dtype=torch.float32)
|
||||||
|
elif self.config.caption_model_name.startswith('blip2-'):
|
||||||
|
caption_model = Blip2ForConditionalGeneration.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path),
|
||||||
|
torch_dtype=self.dtype)
|
||||||
|
else:
|
||||||
|
caption_model = BlipForConditionalGeneration.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path),
|
||||||
|
torch_dtype=self.dtype)
|
||||||
|
self.caption_processor = AutoProcessor.from_pretrained(
|
||||||
|
os.path.join(self.config.cache_path, model_path))
|
||||||
|
|
||||||
|
caption_model.eval()
|
||||||
|
if not self.config.caption_offload:
|
||||||
|
caption_model = caption_model.to(self.config.device)
|
||||||
|
self.caption_model = caption_model
|
||||||
|
else:
|
||||||
|
self.caption_model = self.config.caption_model
|
||||||
|
self.caption_processor = self.config.caption_processor
|
||||||
|
|
||||||
|
def load_clip_model(self):
|
||||||
|
start_time = time.time()
|
||||||
|
config = self.config
|
||||||
|
|
||||||
|
clip_model_name, clip_model_pretrained_name = config.clip_model_name.split(
|
||||||
|
'/', 2)
|
||||||
|
|
||||||
|
if config.clip_model is None:
|
||||||
|
if not config.quiet:
|
||||||
|
print(f'Loading CLIP model {config.clip_model_name}...')
|
||||||
|
|
||||||
|
self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
|
||||||
|
clip_model_name,
|
||||||
|
pretrained=clip_model_pretrained_name,
|
||||||
|
precision='fp16' if config.device == 'cuda' else 'fp32',
|
||||||
|
device=config.device,
|
||||||
|
jit=False,
|
||||||
|
cache_dir=config.clip_model_path)
|
||||||
|
self.clip_model.eval()
|
||||||
|
else:
|
||||||
|
self.clip_model = config.clip_model
|
||||||
|
self.clip_preprocess = config.clip_preprocess
|
||||||
|
self.tokenize = open_clip.get_tokenizer(clip_model_name)
|
||||||
|
|
||||||
|
sites = [
|
||||||
|
'Artstation', 'behance', 'cg society', 'cgsociety', 'deviantart',
|
||||||
|
'dribbble', 'flickr', 'instagram', 'pexels', 'pinterest',
|
||||||
|
'pixabay', 'pixiv', 'polycount', 'reddit', 'shutterstock',
|
||||||
|
'tumblr', 'unsplash', 'zbrush central'
|
||||||
|
]
|
||||||
|
trending_list = [site for site in sites]
|
||||||
|
trending_list.extend(['trending on ' + site for site in sites])
|
||||||
|
trending_list.extend(['featured on ' + site for site in sites])
|
||||||
|
trending_list.extend([site + ' contest winner' for site in sites])
|
||||||
|
|
||||||
|
raw_artists = load_list(config.data_path, 'artists.txt')
|
||||||
|
artists = [f'by {a}' for a in raw_artists]
|
||||||
|
artists.extend([f'inspired by {a}' for a in raw_artists])
|
||||||
|
|
||||||
|
self._prepare_clip()
|
||||||
|
self.artists = LabelTable(artists, 'artists', self)
|
||||||
|
self.flavors = LabelTable(
|
||||||
|
load_list(config.data_path, 'flavors.txt'), 'flavors', self)
|
||||||
|
self.mediums = LabelTable(
|
||||||
|
load_list(config.data_path, 'mediums.txt'), 'mediums', self)
|
||||||
|
self.movements = LabelTable(
|
||||||
|
load_list(config.data_path, 'movements.txt'), 'movements', self)
|
||||||
|
self.trendings = LabelTable(trending_list, 'trendings', self)
|
||||||
|
self.negative = LabelTable(
|
||||||
|
load_list(config.data_path, 'negative.txt'), 'negative', self)
|
||||||
|
|
||||||
|
end_time = time.time()
|
||||||
|
if not config.quiet:
|
||||||
|
print(
|
||||||
|
f'Loaded CLIP model and data in {end_time-start_time:.2f} seconds.'
|
||||||
|
)
|
||||||
|
|
||||||
|
def chain(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
phrases: List[str],
|
||||||
|
best_prompt: str = '',
|
||||||
|
best_sim: float = 0,
|
||||||
|
min_count: int = 8,
|
||||||
|
max_count: int = 32,
|
||||||
|
desc='Chaining',
|
||||||
|
reverse: bool = False) -> str:
|
||||||
|
self._prepare_clip()
|
||||||
|
|
||||||
|
phrases = set(phrases)
|
||||||
|
if not best_prompt:
|
||||||
|
best_prompt = self.rank_top(
|
||||||
|
image_features, [f for f in phrases], reverse=reverse)
|
||||||
|
best_sim = self.similarity(image_features, best_prompt)
|
||||||
|
phrases.remove(best_prompt)
|
||||||
|
curr_prompt, curr_sim = best_prompt, best_sim
|
||||||
|
|
||||||
|
def check(addition: str, idx: int) -> bool:
|
||||||
|
nonlocal best_prompt, best_sim, curr_prompt, curr_sim
|
||||||
|
prompt = curr_prompt + ', ' + addition
|
||||||
|
sim = self.similarity(image_features, prompt)
|
||||||
|
if reverse:
|
||||||
|
sim = -sim
|
||||||
|
|
||||||
|
if sim > best_sim:
|
||||||
|
best_prompt, best_sim = prompt, sim
|
||||||
|
if sim > curr_sim or idx < min_count:
|
||||||
|
curr_prompt, curr_sim = prompt, sim
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
for idx in tqdm(
|
||||||
|
range(max_count), desc=desc, disable=self.config.quiet):
|
||||||
|
best = self.rank_top(
|
||||||
|
image_features, [f'{curr_prompt}, {f}' for f in phrases],
|
||||||
|
reverse=reverse)
|
||||||
|
flave = best[len(curr_prompt) + 2:]
|
||||||
|
if not check(flave, idx):
|
||||||
|
break
|
||||||
|
if _prompt_at_max_len(curr_prompt, self.tokenize):
|
||||||
|
break
|
||||||
|
phrases.remove(flave)
|
||||||
|
|
||||||
|
return best_prompt
|
||||||
|
|
||||||
|
def generate_caption(self, pil_image: Image) -> str:
|
||||||
|
assert self.caption_model is not None, 'No caption model loaded.'
|
||||||
|
self._prepare_caption()
|
||||||
|
inputs = self.caption_processor(
|
||||||
|
images=pil_image, return_tensors='pt').to(self.device)
|
||||||
|
if not self.config.caption_model_name.startswith('git-'):
|
||||||
|
inputs = inputs.to(self.dtype)
|
||||||
|
tokens = self.caption_model.generate(
|
||||||
|
**inputs, max_new_tokens=self.config.caption_max_length)
|
||||||
|
return self.caption_processor.batch_decode(
|
||||||
|
tokens, skip_special_tokens=True)[0].strip()
|
||||||
|
|
||||||
|
def image_to_features(self, image: Image) -> torch.Tensor:
|
||||||
|
self._prepare_clip()
|
||||||
|
images = self.clip_preprocess(image).unsqueeze(0).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
image_features = self.clip_model.encode_image(images)
|
||||||
|
image_features /= image_features.norm(dim=-1, keepdim=True)
|
||||||
|
return image_features
|
||||||
|
|
||||||
|
def interrogate_classic(self,
|
||||||
|
image: Image,
|
||||||
|
max_flavors: int = 3,
|
||||||
|
caption: Optional[str] = None) -> str:
|
||||||
|
"""Classic mode creates a prompt in a standard format first describing the image,
|
||||||
|
then listing the artist, trending, movement, and flavor text modifiers."""
|
||||||
|
caption = caption or self.generate_caption(image)
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
|
||||||
|
medium = self.mediums.rank(image_features, 1)[0]
|
||||||
|
artist = self.artists.rank(image_features, 1)[0]
|
||||||
|
trending = self.trendings.rank(image_features, 1)[0]
|
||||||
|
movement = self.movements.rank(image_features, 1)[0]
|
||||||
|
flaves = ', '.join(self.flavors.rank(image_features, max_flavors))
|
||||||
|
|
||||||
|
if caption.startswith(medium):
|
||||||
|
prompt = f'{caption} {artist}, {trending}, {movement}, {flaves}'
|
||||||
|
else:
|
||||||
|
prompt = f'{caption}, {medium} {artist}, {trending}, {movement}, {flaves}'
|
||||||
|
|
||||||
|
return _truncate_to_fit(prompt, self.tokenize)
|
||||||
|
|
||||||
|
def interrogate_fast(self,
|
||||||
|
image: Image,
|
||||||
|
max_flavors: int = 32,
|
||||||
|
caption: Optional[str] = None) -> str:
|
||||||
|
"""Fast mode simply adds the top ranked terms after a caption. It generally results in
|
||||||
|
better similarity between generated prompt and image than classic mode, but the prompts
|
||||||
|
are less readable."""
|
||||||
|
caption = caption or self.generate_caption(image)
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
merged = _merge_tables([
|
||||||
|
self.artists, self.flavors, self.mediums, self.movements,
|
||||||
|
self.trendings
|
||||||
|
], self)
|
||||||
|
tops = merged.rank(image_features, max_flavors)
|
||||||
|
return _truncate_to_fit(caption + ', ' + ', '.join(tops),
|
||||||
|
self.tokenize)
|
||||||
|
|
||||||
|
def interrogate_negative(self, image: Image, max_flavors: int = 32) -> str:
|
||||||
|
"""Negative mode chains together the most dissimilar terms to the image. It can be used
|
||||||
|
to help build a negative prompt to pair with the regular positive prompt and often
|
||||||
|
improve the results of generated images particularly with Stable Diffusion 2."""
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
flaves = self.flavors.rank(
|
||||||
|
image_features,
|
||||||
|
self.config.flavor_intermediate_count,
|
||||||
|
reverse=True)
|
||||||
|
flaves = flaves + self.negative.labels
|
||||||
|
return self.chain(
|
||||||
|
image_features,
|
||||||
|
flaves,
|
||||||
|
max_count=max_flavors,
|
||||||
|
reverse=True,
|
||||||
|
desc='Negative chain')
|
||||||
|
|
||||||
|
def interrogate(self,
|
||||||
|
image: Image,
|
||||||
|
min_flavors: int = 8,
|
||||||
|
max_flavors: int = 32,
|
||||||
|
caption: Optional[str] = None) -> str:
|
||||||
|
caption = caption or self.generate_caption(image)
|
||||||
|
image_features = self.image_to_features(image)
|
||||||
|
|
||||||
|
merged = _merge_tables([
|
||||||
|
self.artists, self.flavors, self.mediums, self.movements,
|
||||||
|
self.trendings
|
||||||
|
], self)
|
||||||
|
flaves = merged.rank(image_features,
|
||||||
|
self.config.flavor_intermediate_count)
|
||||||
|
best_prompt, best_sim = caption, self.similarity(
|
||||||
|
image_features, caption)
|
||||||
|
best_prompt = self.chain(
|
||||||
|
image_features,
|
||||||
|
flaves,
|
||||||
|
best_prompt,
|
||||||
|
best_sim,
|
||||||
|
min_count=min_flavors,
|
||||||
|
max_count=max_flavors,
|
||||||
|
desc='Flavor chain')
|
||||||
|
|
||||||
|
fast_prompt = self.interrogate_fast(
|
||||||
|
image, max_flavors, caption=caption)
|
||||||
|
classic_prompt = self.interrogate_classic(
|
||||||
|
image, max_flavors, caption=caption)
|
||||||
|
candidates = [caption, classic_prompt, fast_prompt, best_prompt]
|
||||||
|
return candidates[np.argmax(
|
||||||
|
self.similarities(image_features, candidates))]
|
||||||
|
|
||||||
|
def rank_top(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
text_array: List[str],
|
||||||
|
reverse: bool = False) -> str:
|
||||||
|
self._prepare_clip()
|
||||||
|
text_tokens = self.tokenize([text
|
||||||
|
for text in text_array]).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = self.clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
similarity = text_features @ image_features.T
|
||||||
|
if reverse:
|
||||||
|
similarity = -similarity
|
||||||
|
return text_array[similarity.argmax().item()]
|
||||||
|
|
||||||
|
def similarity(self, image_features: torch.Tensor, text: str) -> float:
|
||||||
|
self._prepare_clip()
|
||||||
|
text_tokens = self.tokenize([text]).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = self.clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
similarity = text_features @ image_features.T
|
||||||
|
return similarity[0][0].item()
|
||||||
|
|
||||||
|
def similarities(self, image_features: torch.Tensor,
|
||||||
|
text_array: List[str]) -> List[float]:
|
||||||
|
self._prepare_clip()
|
||||||
|
text_tokens = self.tokenize([text
|
||||||
|
for text in text_array]).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = self.clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
similarity = text_features @ image_features.T
|
||||||
|
return similarity.T[0].tolist()
|
||||||
|
|
||||||
|
def _prepare_caption(self):
|
||||||
|
if self.config.clip_offload and not self.clip_offloaded:
|
||||||
|
self.clip_model = self.clip_model.to('cpu')
|
||||||
|
self.clip_offloaded = True
|
||||||
|
if self.caption_offloaded:
|
||||||
|
self.caption_model = self.caption_model.to(self.device)
|
||||||
|
self.caption_offloaded = False
|
||||||
|
|
||||||
|
def _prepare_clip(self):
|
||||||
|
if self.config.caption_offload and not self.caption_offloaded:
|
||||||
|
self.caption_model = self.caption_model.to('cpu')
|
||||||
|
self.caption_offloaded = True
|
||||||
|
if self.clip_offloaded:
|
||||||
|
self.clip_model = self.clip_model.to(self.device)
|
||||||
|
self.clip_offloaded = False
|
||||||
|
|
||||||
|
|
||||||
|
class LabelTable():
|
||||||
|
|
||||||
|
def __init__(self, labels: List[str], desc: str, ci: Interrogator):
|
||||||
|
clip_model, config = ci.clip_model, ci.config
|
||||||
|
self.chunk_size = config.chunk_size
|
||||||
|
self.config = config
|
||||||
|
self.device = config.device
|
||||||
|
self.embeds = []
|
||||||
|
self.labels = labels
|
||||||
|
self.tokenize = ci.tokenize
|
||||||
|
|
||||||
|
hash = hashlib.sha256(','.join(labels).encode()).hexdigest()
|
||||||
|
sanitized_name = self.config.clip_model_name.replace('/', '_').replace(
|
||||||
|
'@', '_')
|
||||||
|
self._load_cached(desc, hash, sanitized_name)
|
||||||
|
|
||||||
|
if len(self.labels) != len(self.embeds):
|
||||||
|
self.embeds = []
|
||||||
|
chunks = np.array_split(
|
||||||
|
self.labels, max(1,
|
||||||
|
len(self.labels) / config.chunk_size))
|
||||||
|
for chunk in tqdm(
|
||||||
|
chunks,
|
||||||
|
desc=f'Preprocessing {desc}' if desc else None,
|
||||||
|
disable=self.config.quiet):
|
||||||
|
text_tokens = self.tokenize(chunk).to(self.device)
|
||||||
|
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||||
|
text_features = clip_model.encode_text(text_tokens)
|
||||||
|
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||||
|
text_features = text_features.half().cpu().numpy()
|
||||||
|
for i in range(text_features.shape[0]):
|
||||||
|
self.embeds.append(text_features[i])
|
||||||
|
|
||||||
|
if desc and self.config.cache_path:
|
||||||
|
os.makedirs(self.config.cache_path, exist_ok=True)
|
||||||
|
cache_filepath = os.path.join(
|
||||||
|
self.config.cache_path,
|
||||||
|
f'{sanitized_name}_{desc}.safetensors')
|
||||||
|
tensors = {
|
||||||
|
'embeds': np.stack(self.embeds),
|
||||||
|
'hash': np.array([ord(c) for c in hash], dtype=np.int8)
|
||||||
|
}
|
||||||
|
save_file(tensors, cache_filepath)
|
||||||
|
|
||||||
|
if self.device == 'cpu' or self.device == torch.device('cpu'):
|
||||||
|
self.embeds = [e.astype(np.float32) for e in self.embeds]
|
||||||
|
|
||||||
|
def _load_cached(self, desc: str, hash: str, sanitized_name: str) -> bool:
|
||||||
|
if self.config.cache_path is None or desc is None:
|
||||||
|
return False
|
||||||
|
|
||||||
|
cached_safetensors = os.path.join(
|
||||||
|
self.config.cache_path, f'{sanitized_name}_{desc}.safetensors')
|
||||||
|
|
||||||
|
if os.path.exists(cached_safetensors):
|
||||||
|
try:
|
||||||
|
tensors = load_file(cached_safetensors)
|
||||||
|
except Exception as e:
|
||||||
|
print(f'Failed to load {cached_safetensors}')
|
||||||
|
print(e)
|
||||||
|
return False
|
||||||
|
if 'hash' in tensors and 'embeds' in tensors:
|
||||||
|
if np.array_equal(
|
||||||
|
tensors['hash'],
|
||||||
|
np.array([ord(c) for c in hash], dtype=np.int8)):
|
||||||
|
self.embeds = tensors['embeds']
|
||||||
|
if len(self.embeds.shape) == 2:
|
||||||
|
self.embeds = [
|
||||||
|
self.embeds[i] for i in range(self.embeds.shape[0])
|
||||||
|
]
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _rank(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
text_embeds: torch.Tensor,
|
||||||
|
top_count: int = 1,
|
||||||
|
reverse: bool = False) -> str:
|
||||||
|
top_count = min(top_count, len(text_embeds))
|
||||||
|
text_embeds = torch.stack([torch.from_numpy(t)
|
||||||
|
for t in text_embeds]).to(self.device)
|
||||||
|
with torch.cuda.amp.autocast():
|
||||||
|
similarity = image_features @ text_embeds.T
|
||||||
|
if reverse:
|
||||||
|
similarity = -similarity
|
||||||
|
_, top_labels = similarity.float().cpu().topk(top_count, dim=-1)
|
||||||
|
return [top_labels[0][i].numpy() for i in range(top_count)]
|
||||||
|
|
||||||
|
def rank(self,
|
||||||
|
image_features: torch.Tensor,
|
||||||
|
top_count: int = 1,
|
||||||
|
reverse: bool = False) -> List[str]:
|
||||||
|
if len(self.labels) <= self.chunk_size:
|
||||||
|
tops = self._rank(
|
||||||
|
image_features,
|
||||||
|
self.embeds,
|
||||||
|
top_count=top_count,
|
||||||
|
reverse=reverse)
|
||||||
|
return [self.labels[i] for i in tops]
|
||||||
|
|
||||||
|
num_chunks = int(math.ceil(len(self.labels) / self.chunk_size))
|
||||||
|
keep_per_chunk = int(self.chunk_size / num_chunks)
|
||||||
|
|
||||||
|
top_labels, top_embeds = [], []
|
||||||
|
for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet):
|
||||||
|
start = chunk_idx * self.chunk_size
|
||||||
|
stop = min(start + self.chunk_size, len(self.embeds))
|
||||||
|
tops = self._rank(
|
||||||
|
image_features,
|
||||||
|
self.embeds[start:stop],
|
||||||
|
top_count=keep_per_chunk,
|
||||||
|
reverse=reverse)
|
||||||
|
top_labels.extend([self.labels[start + i] for i in tops])
|
||||||
|
top_embeds.extend([self.embeds[start + i] for i in tops])
|
||||||
|
|
||||||
|
tops = self._rank(image_features, top_embeds, top_count=top_count)
|
||||||
|
return [top_labels[i] for i in tops]
|
||||||
|
|
||||||
|
|
||||||
|
def _download_file(url: str,
|
||||||
|
filepath: str,
|
||||||
|
chunk_size: int = 4 * 1024 * 1024,
|
||||||
|
quiet: bool = False):
|
||||||
|
r = requests.get(url, stream=True)
|
||||||
|
if r.status_code != 200:
|
||||||
|
return
|
||||||
|
|
||||||
|
file_size = int(r.headers.get('Content-Length', 0))
|
||||||
|
filename = url.split('/')[-1]
|
||||||
|
progress = tqdm(
|
||||||
|
total=file_size,
|
||||||
|
unit='B',
|
||||||
|
unit_scale=True,
|
||||||
|
desc=filename,
|
||||||
|
disable=quiet)
|
||||||
|
with open(filepath, 'wb') as f:
|
||||||
|
for chunk in r.iter_content(chunk_size=chunk_size):
|
||||||
|
if chunk:
|
||||||
|
f.write(chunk)
|
||||||
|
progress.update(len(chunk))
|
||||||
|
progress.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_tables(tables: List[LabelTable], ci: Interrogator) -> LabelTable:
|
||||||
|
m = LabelTable([], None, ci)
|
||||||
|
for table in tables:
|
||||||
|
m.labels.extend(table.labels)
|
||||||
|
m.embeds.extend(table.embeds)
|
||||||
|
return m
|
||||||
|
|
||||||
|
|
||||||
|
def _prompt_at_max_len(text: str, tokenize) -> bool:
|
||||||
|
tokens = tokenize([text])
|
||||||
|
return tokens[0][-1] != 0
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate_to_fit(text: str, tokenize) -> str:
|
||||||
|
parts = text.split(', ')
|
||||||
|
new_text = parts[0]
|
||||||
|
for part in parts[1:]:
|
||||||
|
if _prompt_at_max_len(new_text + part, tokenize):
|
||||||
|
break
|
||||||
|
new_text += ', ' + part
|
||||||
|
return new_text
|
||||||
|
|
||||||
|
|
||||||
|
def list_caption_models() -> List[str]:
|
||||||
|
return list(CAPTION_MODELS.keys())
|
||||||
|
|
||||||
|
|
||||||
|
def list_clip_models() -> List[str]:
|
||||||
|
return ['/'.join(x) for x in open_clip.list_pretrained()]
|
||||||
|
|
||||||
|
|
||||||
|
def load_list(data_path: str, filename: Optional[str] = None) -> List[str]:
|
||||||
|
"""Load a list of strings from a file."""
|
||||||
|
if filename is not None:
|
||||||
|
data_path = os.path.join(data_path, filename)
|
||||||
|
with open(data_path, 'r', encoding='utf-8', errors='replace') as f:
|
||||||
|
items = [line.strip() for line in f.readlines()]
|
||||||
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
@MODELS.register_module(
|
||||||
|
Tasks.image_captioning, module_name=Models.clip_interrogator)
|
||||||
|
class CLIP_Interrogator(TorchModel):
|
||||||
|
|
||||||
|
def __init__(self, model_dir, device='cuda', device_id=0, *args, **kwargs):
|
||||||
|
super().__init__(
|
||||||
|
model_dir=model_dir, device_id=device_id, *args, **kwargs)
|
||||||
|
self.device = device
|
||||||
|
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||||
|
cf = Config(clip_model_name='ViT-L-14/openai')
|
||||||
|
cf.data_path = os.path.join(model_dir, 'data')
|
||||||
|
cf.clip_model_path = model_dir
|
||||||
|
cf.cache_path = model_dir
|
||||||
|
self.ci = Interrogator(cf)
|
||||||
|
|
||||||
|
def forward(self, inputs):
|
||||||
|
image = transforms.ToPILImage()(inputs)
|
||||||
|
return {'caption': self.ci.interrogate(image)}
|
||||||
@@ -128,13 +128,13 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
                           local_transform,
                           s=None,
                           e=None):
-        video_mask = np.zeros(self.max_frames, dtype=np.long)
+        video_mask = np.zeros(self.max_frames, dtype=int)
         max_video_length = 0

         # T x 3 x H x W
         video = np.zeros((self.max_frames, 3, rawVideoExtractor.size,
                           rawVideoExtractor.size),
-                         dtype=np.float)
+                         dtype=float)

         if s is None:
             start_time, end_time = None, None
modelscope/models/multi_modal/mplug_owl/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig,
+                                      MplugOwlVisualAbstractorConfig)
+from .modeling_mplug_owl import MplugOwlForConditionalGeneration
@@ -0,0 +1,257 @@
|
|||||||
|
# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors.
|
||||||
|
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||||
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
""" MPLUG OWL model configuration """
|
||||||
|
import copy
|
||||||
|
import os
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from transformers import PretrainedConfig
|
||||||
|
from transformers.models.auto import CONFIG_MAPPING
|
||||||
|
from transformers.utils import logging
|
||||||
|
|
||||||
|
from modelscope.utils.constant import Tasks
|
||||||
|
|
||||||
|
logger = logging.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class MplugOwlVisionConfig(PretrainedConfig):
|
||||||
|
r"""
|
||||||
|
Args:
|
||||||
|
hidden_size (`int`, *optional*, defaults to 768):
|
||||||
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
|
intermediate_size (`int`, *optional*, defaults to 3072):
|
||||||
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
|
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||||
|
Number of hidden layers in the Transformer encoder.
|
||||||
|
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||||
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
image_size (`int`, *optional*, defaults to 224):
|
||||||
|
The size (resolution) of each image.
|
||||||
|
patch_size (`int`, *optional*, defaults to 32):
|
||||||
|
The size (resolution) of each patch.
|
||||||
|
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
||||||
|
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||||
|
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
|
||||||
|
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||||
|
The epsilon used by the layer normalization layers.
|
||||||
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
|
The dropout ratio for the attention probabilities.
|
||||||
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
initializer_factor (`float`, *optional*, defaults to 1):
|
||||||
|
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||||
|
testing).
|
||||||
|
```"""
|
||||||
|
|
||||||
|
model_type = 'mplug_owl_vision_model'
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size=1024,
|
||||||
|
intermediate_size=4096,
|
||||||
|
projection_dim=768,
|
||||||
|
num_hidden_layers=24,
|
||||||
|
num_attention_heads=16,
|
||||||
|
num_channels=3,
|
||||||
|
image_size=224,
|
||||||
|
patch_size=14,
|
||||||
|
hidden_act='quick_gelu',
|
||||||
|
layer_norm_eps=1e-6,
|
||||||
|
attention_dropout=0.0,
|
||||||
|
initializer_range=0.02,
|
||||||
|
initializer_factor=1.0,
|
||||||
|
use_flash_attn=False,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.projection_dim = projection_dim
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.num_channels = num_channels
|
||||||
|
self.patch_size = patch_size
|
||||||
|
self.image_size = image_size
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.initializer_factor = initializer_factor
|
||||||
|
self.attention_dropout = attention_dropout
|
||||||
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
self.hidden_act = hidden_act
|
||||||
|
self.use_flash_attn = use_flash_attn
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||||
|
os.PathLike],
|
||||||
|
**kwargs) -> 'PretrainedConfig':
|
||||||
|
config_dict, kwargs = cls.get_config_dict(
|
||||||
|
pretrained_model_name_or_path, **kwargs)
|
||||||
|
|
||||||
|
# get the vision config dict if we are loading from MplugOwlConfig
|
||||||
|
if config_dict.get('model_type') == 'mplug_owl':
|
||||||
|
config_dict = config_dict['vision_config']
|
||||||
|
|
||||||
|
if 'model_type' in config_dict and hasattr(
|
||||||
|
cls,
|
||||||
|
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||||
|
logger.warning(
|
||||||
|
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||||
|
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||||
|
)
|
||||||
|
|
||||||
|
return cls.from_dict(config_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class MplugOwlVisualAbstractorConfig(PretrainedConfig):
|
||||||
|
|
||||||
|
model_type = 'MPlugOwlVisualAbstractor'
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size=1024,
|
||||||
|
num_hidden_layers=6,
|
||||||
|
num_attention_heads=16,
|
||||||
|
intermediate_size=4096,
|
||||||
|
attention_probs_dropout_prob=0.1,
|
||||||
|
initializer_range=0.02,
|
||||||
|
layer_norm_eps=1e-6,
|
||||||
|
encoder_hidden_size=1024,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_hidden_layers = num_hidden_layers
|
||||||
|
self.num_attention_heads = num_attention_heads
|
||||||
|
self.intermediate_size = intermediate_size
|
||||||
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
self.layer_norm_eps = layer_norm_eps
|
||||||
|
self.encoder_hidden_size = encoder_hidden_size
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||||
|
os.PathLike],
|
||||||
|
**kwargs) -> 'PretrainedConfig':
|
||||||
|
config_dict, kwargs = cls.get_config_dict(
|
||||||
|
pretrained_model_name_or_path, **kwargs)
|
||||||
|
|
||||||
|
# get the qformer config dict if we are loading from MplugOwlConfig
|
||||||
|
if config_dict.get('model_type') == 'mplug_owl':
|
||||||
|
config_dict = config_dict['abstractor_config']
|
||||||
|
|
||||||
|
if 'model_type' in config_dict and hasattr(
|
||||||
|
cls,
|
||||||
|
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||||
|
logger.warning(
|
||||||
|
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||||
|
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||||
|
)
|
||||||
|
|
||||||
|
return cls.from_dict(config_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class MplugOwlConfig(PretrainedConfig):
|
||||||
|
r"""
|
||||||
|
Args:
|
||||||
|
vision_config (`dict`, *optional*):
|
||||||
|
Dictionary of configuration options used to initialize [`MplugOwlVisionConfig`].
|
||||||
|
qformer_config (`dict`, *optional*):
|
||||||
|
Dictionary of configuration options used to initialize [`MplugOwlVisualAbstractorConfig`].
|
||||||
|
text_config (`dict`, *optional*):
|
||||||
|
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
|
||||||
|
num_query_tokens (`int`, *optional*, defaults to 32):
|
||||||
|
The number of query tokens passed through the Transformer.
|
||||||
|
|
||||||
|
kwargs (*optional*):
|
||||||
|
Dictionary of keyword arguments.
|
||||||
|
"""
|
||||||
|
|
||||||
|
model_type = 'mplug_owl'
|
||||||
|
is_composition = True
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
task=Tasks.multimodal_dialogue,
|
||||||
|
vision_config=None,
|
||||||
|
visual_abstractor_config=None,
|
||||||
|
text_config=None,
|
||||||
|
num_query_tokens=64,
|
||||||
|
**kwargs):
|
||||||
|
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.task = task
|
||||||
|
if vision_config is None:
|
||||||
|
vision_config = MplugOwlVisionConfig().to_dict()
|
||||||
|
logger.info('vision_config is None.')
|
||||||
|
|
||||||
|
if visual_abstractor_config is None:
|
||||||
|
visual_abstractor_config = {}
|
||||||
|
logger.info('abstractor_config is None. ')
|
||||||
|
|
||||||
|
if text_config is None:
|
||||||
|
# we use LLAMA 7b by default
|
||||||
|
from transformers.models.llama.configuration_llama import \
|
||||||
|
LlamaConfig
|
||||||
|
text_config = LlamaConfig(pad_token_id=2).to_dict()
|
||||||
|
logger.info('text_config is None.')
|
||||||
|
|
||||||
|
self.vision_config = MplugOwlVisionConfig(**vision_config)
|
||||||
|
self.visual_abstractor_config = MplugOwlVisualAbstractorConfig(
|
||||||
|
**visual_abstractor_config)
|
||||||
|
text_model_type = text_config[
|
||||||
|
'model_type'] if 'model_type' in text_config else 'llama'
|
||||||
|
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
|
||||||
|
|
||||||
|
self.tie_word_embeddings = self.text_config.tie_word_embeddings
|
||||||
|
|
||||||
|
self.num_query_tokens = num_query_tokens
|
||||||
|
self.initializer_factor = 1.0
|
||||||
|
self.initializer_range = 0.02
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_vision_abstractor_text_configs(
|
||||||
|
cls,
|
||||||
|
vision_config: MplugOwlVisionConfig,
|
||||||
|
visual_abstractor_config: MplugOwlVisualAbstractorConfig,
|
||||||
|
text_config: PretrainedConfig,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
r"""
|
||||||
|
Returns:
|
||||||
|
[`MplugOwlConfig`]: An instance of a configuration object
|
||||||
|
"""
|
||||||
|
|
||||||
|
return cls(
|
||||||
|
vision_config=vision_config.to_dict(),
|
||||||
|
visual_abstractor_config=visual_abstractor_config.to_dict(),
|
||||||
|
text_config=text_config.to_dict(),
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
def to_dict(self):
|
||||||
|
"""
|
||||||
|
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
|
||||||
|
"""
|
||||||
|
output = copy.deepcopy(self.__dict__)
|
||||||
|
output['vision_config'] = self.vision_config.to_dict()
|
||||||
|
tmp = self.visual_abstractor_config.to_dict()
|
||||||
|
output['visual_abstractor_config'] = tmp
|
||||||
|
output['text_config'] = self.text_config.to_dict()
|
||||||
|
output['model_type'] = self.__class__.model_type
|
||||||
|
return output
|
||||||
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py (new file, 1551 lines; diff suppressed because it is too large)
@@ -212,10 +212,10 @@ class ConstructBlockStrategy:
                        block_spans,
                        rng,
                        task='bert'):
-        position_ids = np.arange(len(tokens), dtype=np.long)
+        position_ids = np.arange(len(tokens), dtype=int)
         targets = copy.deepcopy(tokens)
         mask_id = self.tokenizer.get_command('MASK').Id
-        mlm_masks = np.zeros(len(tokens), dtype=np.long)
+        mlm_masks = np.zeros(len(tokens), dtype=int)
         for start, end in block_spans:
             for idx in range(start, end):
                 tokens[idx] = mask_id
@@ -231,7 +231,7 @@ class ConstructBlockStrategy:
                        rng,
                        task='bert'):
         text_length = len(tokens)
-        position_ids = np.ones(len(tokens), dtype=np.long)
+        position_ids = np.ones(len(tokens), dtype=int)
         for start, end in block_spans:
             position_ids[start + 1:end] = 0
         position_ids = np.cumsum(position_ids) - 1
@@ -270,7 +270,7 @@ class ConstructBlockStrategy:
                                           (end - start + 1))
             if self.block_position_encoding:
                 target_block_position_ids.append(
-                    np.arange(1, end - start + 2, dtype=np.long))
+                    np.arange(1, end - start + 2, dtype=int))
             else:
                 target_block_position_ids.append([1] * (end - start + 1))
         block_spans.sort(key=lambda x: x[0])
@@ -307,7 +307,7 @@ class ConstructBlockStrategy:
             target_tokens = target_tokens + [
                 self.tokenizer.get_command('eop').Id
             ]
-            loss_masks = np.ones(len(target_tokens), dtype=np.long)
+            loss_masks = np.ones(len(target_tokens), dtype=int)
             return source_tokens, target_tokens, loss_masks
         else:
             tokens = np.concatenate(source_tokens + target_tokens)
@@ -326,12 +326,12 @@ class ConstructBlockStrategy:
             for pos in mask_pos:
                 tokens[pos] = self.tokenizer.get_command('dBLOCK').Id
             targets = np.concatenate(source_tokens + targets)
-            loss_masks = np.ones(len(tokens), dtype=np.long)
+            loss_masks = np.ones(len(tokens), dtype=int)
             loss_masks[:source_length] = 0
             position_ids = np.concatenate(source_position_ids
                                           + target_position_ids)
             block_position_ids = np.concatenate(
-                [np.zeros(source_length, dtype=np.long)]
+                [np.zeros(source_length, dtype=int)]
                 + target_block_position_ids)
             position_ids = np.stack([position_ids, block_position_ids], axis=0)
             if attention_mask is not None:
@@ -539,22 +539,21 @@ class ConstructBlockStrategy:
                     (source_tokens, [self.generation_mask], target_tokens))
                 loss_masks = np.concatenate(
                     (np.zeros(len(source_tokens) + 1,
-                              dtype=np.long), target_masks))
+                              dtype=int), target_masks))
                 token_batch.append(tokens)
                 target_batch.append(targets)
                 loss_mask_batch.append(loss_masks)
                 position_ids = np.arange(
-                    len(source_tokens) + len(target_tokens) + 1,
-                    dtype=np.long)
+                    len(source_tokens) + len(target_tokens) + 1, dtype=int)
                 position_ids[len(source_tokens) + 1:] = len(source_tokens)
                 if self.block_position_encoding:
                     block_position_ids = np.concatenate(
-                        (np.zeros(len(source_tokens), dtype=np.long),
-                         np.arange(len(target_tokens) + 1, dtype=np.long)))
+                        (np.zeros(len(source_tokens), dtype=int),
+                         np.arange(len(target_tokens) + 1, dtype=int)))
                 else:
                     block_position_ids = np.concatenate(
-                        (np.zeros(len(source_tokens) + 1, dtype=np.long),
-                         np.ones(len(target_tokens) + 1, dtype=np.long)))
+                        (np.zeros(len(source_tokens) + 1, dtype=int),
+                         np.ones(len(target_tokens) + 1, dtype=int)))
                 position_id_batch.append(
                     np.stack([position_ids, block_position_ids], axis=0))
             else:
@@ -597,27 +596,25 @@ class ConstructBlockStrategy:
         max_length = max(seq_lengths)
         token_batch = [
             np.concatenate(
-                (tokens, np.zeros(max_length - len(tokens),
-                                  dtype=np.long)))
+                (tokens, np.zeros(max_length - len(tokens), dtype=int)))
             for tokens in token_batch
         ]
         target_batch = [
             np.concatenate(
-                (targets,
-                 np.zeros(max_length - len(targets), dtype=np.long)))
+                (targets, np.zeros(max_length - len(targets), dtype=int)))
             for targets in target_batch
         ]
         loss_mask_batch = [
             np.concatenate(
                 (loss_masks,
-                 np.zeros(max_length - len(loss_masks), dtype=np.long)))
+                 np.zeros(max_length - len(loss_masks), dtype=int)))
             for loss_masks in loss_mask_batch
         ]
         position_id_batch = [
-            np.concatenate((position_ids,
-                            np.zeros(
-                                (2, max_length - position_ids.shape[1]),
-                                dtype=np.long)),
+            np.concatenate(
+                (position_ids,
+                 np.zeros(
+                     (2, max_length - position_ids.shape[1]), dtype=int)),
                 axis=1) for position_ids in position_id_batch
         ]
         return token_batch, target_batch, loss_mask_batch, position_id_batch
@@ -583,8 +583,8 @@ class XLDataset(data.Dataset):
     def getidx(self, idx):
         tokens, targets, loss_masks = [], [], []
         attention_mask = np.concatenate(
-            (np.zeros((self.max_seq_len, self.mem_len), dtype=np.long),
-             np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)),
+            (np.zeros((self.max_seq_len, self.mem_len), dtype=int),
+             np.ones((self.max_seq_len, self.max_seq_len), dtype=int)),
            axis=1)
         sample_idx = bisect_right(self.indices, idx * self.max_seq_len)
         last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1]
@@ -28,7 +28,7 @@ def main():
     counts = np.array([0] * 10)
     for _ in range(10000):
         spans = strategy.sample_span_in_document(
-            np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1],
+            np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=int), [1, 1],
             random.Random())
         for start, end in spans:
             counts[start:end] += 1
@@ -17,7 +17,7 @@ def main():
         num_iters=300000,
         decay_style='cosine',
         decay_ratio=0.1)
-    steps = np.arange(0, 400000, 10, dtype=np.long)
+    steps = np.arange(0, 400000, 10, dtype=int)
     rates = []
     for step in steps:
         lr_scheduler.num_iters = step
@@ -5,12 +5,12 @@ from typing import TYPE_CHECKING
 from modelscope.utils.import_utils import LazyImportModule

 if TYPE_CHECKING:
-    from .configuration_unite import UniTEConfig
-    from .modeling_unite import UniTEForTranslationEvaluation
+    from .configuration import UniTEConfig
+    from .translation_evaluation import UniTEForTranslationEvaluation
 else:
     _import_structure = {
-        'configuration_unite': ['UniTEConfig'],
-        'modeling_unite': ['UniTEForTranslationEvaluation'],
+        'configuration': ['UniTEConfig'],
+        'translation_evaluation': ['UniTEForTranslationEvaluation'],
     }

     import sys
@@ -9,7 +9,7 @@ from modelscope.utils.config import Config
 logger = logging.get_logger()


-class EvaluationMode(Enum):
+class InputFormat(Enum):
     SRC = 'src'
     REF = 'ref'
     SRC_REF = 'src-ref'
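The rename above keeps the enum values ('src', 'ref', 'src-ref') and only changes the class name to reflect that it selects the metric's input format rather than an evaluation mode. A hedged sketch of how such an enum is typically consumed (the helper below is illustrative, not the pipeline's actual code):

from enum import Enum


class InputFormat(Enum):
    SRC = 'src'
    REF = 'ref'
    SRC_REF = 'src-ref'


def build_metric_inputs(hyp, src, ref, input_format: InputFormat):
    # Pick which signals accompany the hypothesis, mirroring the
    # src / ref / src-ref input formats of UniTE-style metrics.
    if input_format is InputFormat.SRC:
        return {'hyp': hyp, 'src': src}
    if input_format is InputFormat.REF:
        return {'hyp': hyp, 'ref': ref}
    return {'hyp': hyp, 'src': src, 'ref': ref}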
Some files were not shown because too many files have changed in this diff.