Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-16 16:27:45 +01:00)

Commit: add 1.6
@@ -1,13 +1,12 @@
import os
from dataclasses import dataclass, field

from modelscope import MsDataset, TrainingArgs
from modelscope.metainfo import Trainers
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.trainers.builder import build_trainer
from modelscope.trainers.training_args import TrainingArgs


@dataclass
@dataclass(init=False)
class ImageClassificationTrainingArgs(TrainingArgs):
num_classes: int = field(
default=None,
@@ -46,26 +45,35 @@ def create_dataset(name, split):
dataset_name, namespace=namespace, subset_name='default', split=split)


def train():
args = ImageClassificationTrainingArgs.from_cli(
training_args = ImageClassificationTrainingArgs(
model='damo/cv_vit-base_image-classification_ImageNet-labels',
max_epochs=1,
lr=1e-4,
optimizer='AdamW',
warmup_iters=1,
topk=(1, ))
if args.dataset_name is not None:
train_dataset = create_dataset(args.dataset_name, split='train')
val_dataset = create_dataset(args.dataset_name, split='validation')
topk=(1, )).parse_cli()
config, args = training_args.to_config()


def cfg_modify_fn(cfg):
if args.use_model_config:
cfg.merge_from_dict(config)
else:
train_dataset = create_dataset(args.train_dataset_name, split='train')
val_dataset = create_dataset(args.val_dataset_name, split='validation')
cfg = config
return cfg


def train():
train_dataset = create_dataset(
training_args.train_dataset_name, split=training_args.train_split)
val_dataset = create_dataset(
training_args.val_dataset_name, split=training_args.val_split)

kwargs = dict(
model=args.model, # model id
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset, # validation dataset
cfg_modify_fn=args # callback to modify configuration
cfg_modify_fn=cfg_modify_fn # callback to modify configuration
)

# in distributed training, specify pytorch launcher

@@ -2,4 +2,7 @@ PYTHONPATH=. python -m torch.distributed.launch --nproc_per_node=2 \
examples/pytorch/image_classification/finetune_image_classification.py \
--num_classes 2 \
--train_dataset_name 'tany0699/cats_and_dogs' \
--val_dataset_name 'tany0699/cats_and_dogs'
--val_dataset_name 'tany0699/cats_and_dogs' \
--train_split train \
--val_split validation \
--use_model_config true \

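A quick orientation sketch of the pattern these examples migrate to: `TrainingArgs.from_cli()` is replaced by constructing the argument dataclass, calling `parse_cli()`, splitting the result with `to_config()`, and handing a `cfg_modify_fn` callback to the trainer. The dataclass name below is illustrative; the model id is the one used in the hunk above.

from dataclasses import dataclass, field
from modelscope import TrainingArgs


@dataclass(init=False)
class MyImageClsArgs(TrainingArgs):
    # Illustrative extra CLI field, mirroring num_classes in the hunk above.
    num_classes: int = field(
        default=None, metadata={'help': 'The number of classes'})


# Defaults set in code, then overridden by whatever flags appear on the CLI.
training_args = MyImageClsArgs(
    model='damo/cv_vit-base_image-classification_ImageNet-labels',
    max_epochs=1).parse_cli()
# to_config() splits the parsed arguments into a Config object plus the plain
# argument namespace.
config, args = training_args.to_config()


def cfg_modify_fn(cfg):
    # Either keep the model's own configuration and merge the CLI overrides in,
    # or replace it wholesale with the CLI-built config.
    if args.use_model_config:
        cfg.merge_from_dict(config)
    else:
        cfg = config
    return cfg
# cfg_modify_fn is then passed to build_trainer(...) via default_args,
# exactly as the kwargs block above shows.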
@@ -1,15 +1,13 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
|
||||
from modelscope import MsDataset, TrainingArgs
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import build_trainer
|
||||
from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
|
||||
set_flatten_value)
|
||||
from modelscope.trainers.training_args import set_flatten_value
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
|
||||
trainer: str = field(
|
||||
@@ -17,6 +15,12 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
'help': 'The trainer used',
|
||||
})
|
||||
|
||||
work_dir: str = field(
|
||||
default='./tmp',
|
||||
metadata={
|
||||
'help': 'The working path for saving checkpoint',
|
||||
})
|
||||
|
||||
use_fp16: bool = field(
|
||||
default=None,
|
||||
metadata={
|
||||
@@ -35,7 +39,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'train.optimizer_hparams',
|
||||
'cfg_getter': partial(get_flatten_value, exclusions=['lr']),
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The optimizer init params except `lr`',
|
||||
})
|
||||
@@ -51,7 +54,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'dataset.column_map',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The column map for dataset',
|
||||
})
|
||||
@@ -67,7 +69,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'train.lr_scheduler_hook',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The parameters for lr scheduler hook',
|
||||
})
|
||||
@@ -76,7 +77,6 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'train.optimizer_hook',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'help': 'The parameters for optimizer hook',
|
||||
})
|
||||
@@ -92,23 +92,28 @@ class MultiModalEmbeddingArguments(TrainingArgs):
|
||||
'help': 'The data parallel world size',
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
config.merge_from_dict({'pretrained_model.model_name': self.model})
|
||||
if self.clip_clamp:
|
||||
config.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
|
||||
if self.world_size > 1:
|
||||
config.train.launcher = 'pytorch'
|
||||
return config
|
||||
|
||||
config, args = MultiModalEmbeddingArguments().parse_cli().to_config()
|
||||
print(config, args)
|
||||
|
||||
|
||||
args = MultiModalEmbeddingArguments.from_cli(task='multi-modal-embedding')
|
||||
print(args)
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
cfg.merge_from_dict({'pretrained_model.model_name': args.model})
|
||||
if args.clip_clamp:
|
||||
cfg.train.hooks.append({'type': 'ClipClampLogitScaleHook'})
|
||||
if args.world_size > 1:
|
||||
cfg.train.launcher = 'pytorch'
|
||||
return cfg
|
||||
|
||||
|
||||
train_dataset = MsDataset.load(
|
||||
args.dataset_name, namespace='modelscope', split='train')
|
||||
args.train_dataset_name, namespace='modelscope', split='train')
|
||||
eval_dataset = MsDataset.load(
|
||||
args.dataset_name, namespace='modelscope', split='validation')
|
||||
args.train_dataset_name, namespace='modelscope', split='validation')
|
||||
|
||||
os.makedirs(args.work_dir, exist_ok=True)
|
||||
kwargs = dict(
|
||||
@@ -116,6 +121,6 @@ kwargs = dict(
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
work_dir=args.work_dir,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
trainer = build_trainer(name=args.trainer, default_args=kwargs)
|
||||
trainer.train()
|
||||
|
||||
@@ -6,14 +6,16 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
|
||||
--trainer 'clip-multi-modal-embedding' \
|
||||
--work_dir './workspace/ckpts/clip' \
|
||||
--model 'damo/multi-modal_clip-vit-base-patch16_zh' \
|
||||
--dataset_name 'muge' \
|
||||
--train_dataset_name 'muge' \
|
||||
--dataset_column_map 'img=image,text=query' \
|
||||
--max_epochs 1 \
|
||||
--use_fp16 true \
|
||||
--per_device_train_batch_size 180 \
|
||||
--train_data_worker 0 \
|
||||
--train_shuffle true \
|
||||
--train_drop_last true \
|
||||
--per_device_eval_batch_size 128 \
|
||||
--eval_data_worker 0 \
|
||||
--eval_shuffle true \
|
||||
--eval_drop_last true \
|
||||
--save_ckpt_best true \
|
||||
@@ -33,3 +35,4 @@ PYTHONPATH=. torchrun --nproc_per_node $DATA_PARALLEL_SIZE \
|
||||
--optimizer_hook 'type=TorchAMPOptimizerHook,cumulative_iters=1,loss_keys=loss' \
|
||||
--clip_clamp true \
|
||||
--world_size $DATA_PARALLEL_SIZE \
|
||||
--use_model_config true \
|
||||
|
||||
@@ -4,30 +4,32 @@ from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.training_args import TrainingArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
class StableDiffusionArguments(TrainingArgs):
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
config.train.lr_scheduler.T_max = self.max_epochs
|
||||
config.model.inference = False
|
||||
return config
|
||||
|
||||
|
||||
args = StableDiffusionArguments.from_cli(task='efficient-diffusion-tuning')
|
||||
training_args = TrainingArgs(task='efficient-diffusion-tuning').parse_cli()
|
||||
config, args = training_args.to_config()
|
||||
print(args)
|
||||
|
||||
dataset = MsDataset.load(args.dataset_name, namespace=args.namespace)
|
||||
dataset = MsDataset.load(
|
||||
args.train_dataset_name, namespace=args.train_dataset_namespace)
|
||||
train_dataset = dataset['train']
|
||||
validation_dataset = dataset['validation']
|
||||
|
||||
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
cfg.train.lr_scheduler.T_max = training_args.max_epochs
|
||||
cfg.model.inference = False
|
||||
return cfg
|
||||
|
||||
|
||||
kwargs = dict(
|
||||
model=args.model,
|
||||
work_dir=args.work_dir,
|
||||
model=training_args.model,
|
||||
work_dir=training_args.work_dir,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=validation_dataset,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
|
||||
trainer.train()
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
PYTHONPATH=. torchrun examples/pytorch/stable_diffusion/finetune_stable_diffusion.py \
|
||||
--model 'damo/multi-modal_efficient-diffusion-tuning-lora' \
|
||||
--work_dir './tmp/stable_diffusion_tuning' \
|
||||
--namespace 'damo' \
|
||||
--dataset_name 'buptwq/lora-stable-diffusion-finetune-dog' \
|
||||
--max_epochs 150 \
|
||||
--train_dataset_namespace 'damo' \
|
||||
--train_dataset_name 'controlnet_dataset_condition_fill50k' \
|
||||
--max_epochs 1 \
|
||||
--save_ckpt_strategy 'by_epoch' \
|
||||
--logging_interval 100 \
|
||||
--train.dataloader.workers_per_gpu 0 \
|
||||
--evaluation.dataloader.workers_per_gpu 0 \
|
||||
--train.optimizer.lr 1e-4
|
||||
--train.optimizer.lr 1e-5 \
|
||||
--use_model_config true
|
||||
|
||||
@@ -1,26 +1,18 @@
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.training_args import TrainingArgs
|
||||
from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
|
||||
build_dataset_from_file)
|
||||
from modelscope.trainers import build_trainer
|
||||
|
||||
|
||||
def get_labels(cfg, metadata):
|
||||
label2id = cfg.safe_get(metadata['cfg_node'])
|
||||
if label2id is not None:
|
||||
return ','.join(label2id.keys())
|
||||
|
||||
|
||||
def set_labels(cfg, labels, metadata):
|
||||
def set_labels(labels):
|
||||
if isinstance(labels, str):
|
||||
labels = labels.split(',')
|
||||
cfg.merge_from_dict(
|
||||
{metadata['cfg_node']: {label: id
|
||||
for id, label in enumerate(labels)}})
|
||||
return {label: id for id, label in enumerate(labels)}
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TextClassificationArguments(TrainingArgs):
|
||||
|
||||
first_sequence: str = field(
|
||||
@@ -49,7 +41,6 @@ class TextClassificationArguments(TrainingArgs):
|
||||
metadata={
|
||||
'help': 'The labels of the dataset',
|
||||
'cfg_node': 'preprocessor.label2id',
|
||||
'cfg_getter': get_labels,
|
||||
'cfg_setter': set_labels,
|
||||
})
|
||||
|
||||
@@ -60,30 +51,39 @@ class TextClassificationArguments(TrainingArgs):
|
||||
'cfg_node': 'preprocessor.type'
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
config.model['num_labels'] = len(self.labels)
|
||||
if config.train.lr_scheduler.type == 'LinearLR':
|
||||
config.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
|
||||
return config
|
||||
|
||||
config, args = TextClassificationArguments().parse_cli().to_config()
|
||||
|
||||
print(config, args)
|
||||
|
||||
|
||||
args = TextClassificationArguments.from_cli(
|
||||
task='text-classification', eval_metrics='seq-cls-metric')
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
cfg.model['num_labels'] = len(cfg.preprocessor.label2id)
|
||||
if cfg.train.lr_scheduler.type == 'LinearLR':
|
||||
cfg.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
|
||||
return cfg
|
||||
|
||||
print(args)
|
||||
|
||||
dataset = MsDataset.load(args.dataset_name, subset_name=args.subset_name)
|
||||
train_dataset = dataset['train']
|
||||
validation_dataset = dataset['validation']
|
||||
if args.dataset_json_file is None:
|
||||
dataset = MsDataset.load(
|
||||
args.train_dataset_name, subset_name=args.train_subset_name)
|
||||
train_dataset = dataset['train']
|
||||
validation_dataset = dataset['validation']
|
||||
else:
|
||||
train_dataset, validation_dataset = build_dataset_from_file(
|
||||
args.dataset_json_file)
|
||||
|
||||
kwargs = dict(
|
||||
model=args.model,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=validation_dataset,
|
||||
seed=args.seed,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
os.environ['LOCAL_RANK'] = str(args.local_rank)
|
||||
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
PYTHONPATH=. python examples/pytorch/text_classification/finetune_text_classification.py \
|
||||
--task 'text-classification' \
|
||||
--model 'damo/nlp_structbert_backbone_base_std' \
|
||||
--dataset_name 'clue' \
|
||||
--subset_name 'tnews' \
|
||||
--train_dataset_name 'clue' \
|
||||
--train_subset_name 'tnews' \
|
||||
--first_sequence 'sentence' \
|
||||
--preprocessor.label label \
|
||||
--model.num_labels 15 \
|
||||
--labels '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14' \
|
||||
--preprocessor 'sen-cls-tokenizer' \
|
||||
--use_model_config True \
|
||||
--max_epochs 1 \
|
||||
--train.dataloader.workers_per_gpu 0 \
|
||||
--evaluation.dataloader.workers_per_gpu 0 \
|
||||
--train.optimizer.lr 1e-5 \
|
||||
--eval_metrics 'seq-cls-metric' \
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from modelscope import EpochBasedTrainer, MsDataset, TrainingArgs
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.training_args import TrainingArgs
|
||||
from modelscope.trainers import build_trainer
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TextGenerationArguments(TrainingArgs):
|
||||
|
||||
trainer: str = field(
|
||||
@@ -67,30 +66,35 @@ class TextGenerationArguments(TrainingArgs):
|
||||
'help': 'Whether to use MegatronHook',
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
if config.train.lr_scheduler.type == 'noam':
|
||||
config.train.lr_scheduler = {
|
||||
'type': 'LambdaLR',
|
||||
'lr_lambda': noam_lambda,
|
||||
'options': {
|
||||
'by_epoch': False
|
||||
}
|
||||
}
|
||||
if self.use_megatron:
|
||||
config.train.hooks.append({'type': 'MegatronHook'})
|
||||
return config
|
||||
|
||||
|
||||
def noam_lambda(current_step: int):
|
||||
current_step += 1
|
||||
return min(current_step**(-0.5), current_step * 100**(-1.5))
|
||||
|
||||
|
||||
args = TextGenerationArguments.from_cli(task='text-generation')
|
||||
print(args)
|
||||
config, args = TextGenerationArguments().parse_cli().to_config()
|
||||
print(config, args)
|
||||
|
||||
dataset = MsDataset.load(args.dataset_name)
|
||||
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
if cfg.train.lr_scheduler.type == 'noam':
|
||||
cfg.train.lr_scheduler = {
|
||||
'type': 'LambdaLR',
|
||||
'lr_lambda': noam_lambda,
|
||||
'options': {
|
||||
'by_epoch': False
|
||||
}
|
||||
}
|
||||
if args.use_megatron:
|
||||
cfg.train.hooks.append({'type': 'MegatronHook'})
|
||||
return cfg
|
||||
|
||||
|
||||
dataset = MsDataset.load(args.train_dataset_name)
|
||||
train_dataset = dataset['train']
|
||||
eval_dataset = dataset['validation' if 'validation' in dataset else 'test']
|
||||
|
||||
@@ -100,7 +104,7 @@ kwargs = dict(
|
||||
eval_dataset=eval_dataset,
|
||||
seed=args.seed,
|
||||
work_dir=args.work_dir,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
trainer: EpochBasedTrainer = build_trainer(
|
||||
name=args.trainer, default_args=kwargs)
|
||||
|
||||
@@ -8,7 +8,7 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
|
||||
--trainer 'nlp-gpt3-trainer' \
|
||||
--work_dir './tmp' \
|
||||
--model 'damo/nlp_gpt3_text-generation_1.3B' \
|
||||
--dataset_name 'chinese-poetry-collection' \
|
||||
--train_dataset_name 'chinese-poetry-collection' \
|
||||
--preprocessor 'text-gen-jieba-tokenizer' \
|
||||
--src_txt 'text1' \
|
||||
--tgt_txt 'text2' \
|
||||
@@ -20,4 +20,5 @@ PYTHONPATH=. torchrun --nproc_per_node $WORLD_SIZE examples/pytorch/text_generat
|
||||
--world_size $WORLD_SIZE \
|
||||
--tensor_model_parallel_size $TENSOR_MODEL_PARALLEL_SIZE \
|
||||
--use_megatron true \
|
||||
# --dataset_name 'DuReader_robust-QG' \ # input&output
|
||||
--use_model_config true \
|
||||
# --train_dataset_name 'DuReader_robust-QG' \ # input&output
|
||||
|
||||
examples/pytorch/text_generation/run_train_mt5.sh (new file, 13 lines)
@@ -0,0 +1,13 @@
PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.py \
--trainer 'text-generation-trainer' \
--work_dir './tmp' \
--task 'text2text-generation' \
--model 'damo/nlp_mt5_zero-shot-augment_chinese-base' \
--train_dataset_name 'DuReader_robust-QG' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 1 \
--use_model_config True \
--per_device_train_batch_size 8 \
--lr 1e-3 \
--lr_scheduler 'noam' \
@@ -2,10 +2,11 @@ PYTHONPATH=. torchrun examples/pytorch/text_generation/finetune_text_generation.
--trainer 'text-generation-trainer' \
--work_dir './tmp' \
--model 'damo/nlp_palm2.0_pretrained_chinese-base' \
--dataset_name 'DuReader_robust-QG' \
--train_dataset_name 'DuReader_robust-QG' \
--src_txt 'text1' \
--tgt_txt 'text2' \
--max_epochs 15 \
--max_epochs 1 \
--use_model_config True \
--per_device_train_batch_size 8 \
--lr 1e-3 \
--lr_scheduler 'noam' \

@@ -1,20 +1,22 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from modelscope.metainfo import Trainers
|
||||
from modelscope.msdatasets import MsDataset
|
||||
from modelscope.trainers import build_trainer
|
||||
from modelscope.trainers.training_args import (TrainingArgs, get_flatten_value,
|
||||
set_flatten_value)
|
||||
from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
|
||||
build_dataset_from_file)
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TokenClassificationArguments(TrainingArgs):
|
||||
|
||||
trainer: str = field(
|
||||
default=Trainers.default, metadata={
|
||||
default=None, metadata={
|
||||
'help': 'The trainer used',
|
||||
})
|
||||
|
||||
work_dir: str = field(
|
||||
default='./tmp',
|
||||
metadata={
|
||||
'help': 'The working path for saving checkpoint',
|
||||
})
|
||||
|
||||
preprocessor: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
@@ -29,31 +31,41 @@ class TokenClassificationArguments(TrainingArgs):
|
||||
'cfg_node': 'preprocessor.padding'
|
||||
})
|
||||
|
||||
train_dataset_params: str = field(
|
||||
mode: str = field(
|
||||
default='inference',
|
||||
metadata={
|
||||
'help': 'The preprocessor padding',
|
||||
'cfg_node': 'preprocessor.mode'
|
||||
})
|
||||
|
||||
first_sequence: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'dataset.train',
|
||||
'cfg_getter': get_flatten_value,
|
||||
'cfg_setter': set_flatten_value,
|
||||
'cfg_node': 'preprocessor.first_sequence',
|
||||
'help': 'The parameters for train dataset',
|
||||
})
|
||||
|
||||
def __call__(self, config):
|
||||
config = super().__call__(config)
|
||||
if config.safe_get('dataset.train.label') == 'ner_tags':
|
||||
ner_tags_labels = train_dataset['ner_tags'] + eval_dataset[
|
||||
'ner_tags']
|
||||
label_enumerate_values = self._get_label_list(ner_tags_labels)
|
||||
config.merge_from_dict(
|
||||
{'dataset.train.labels': label_enumerate_values})
|
||||
if config.train.lr_scheduler.type == 'LinearLR':
|
||||
config.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / self.per_device_train_batch_size) * self.max_epochs
|
||||
return config
|
||||
label: str = field(
|
||||
default=None,
|
||||
metadata={
|
||||
'cfg_node': 'preprocessor.label',
|
||||
'help': 'The parameters for train dataset',
|
||||
})
|
||||
|
||||
# TODO: Future performance optimization in MsDataset
|
||||
@staticmethod
|
||||
def _get_label_list(labels):
|
||||
sequence_length: int = field(
|
||||
default=128,
|
||||
metadata={
|
||||
'cfg_node': 'preprocessor.sequence_length',
|
||||
'help': 'The parameters for train dataset',
|
||||
})
|
||||
|
||||
|
||||
training_args = TokenClassificationArguments().parse_cli()
|
||||
config, args = training_args.to_config()
|
||||
print(args)
|
||||
|
||||
|
||||
def get_label_list(labels):
|
||||
unique_labels = set()
|
||||
for label in labels:
|
||||
unique_labels = unique_labels | set(label)
|
||||
@@ -62,27 +74,56 @@ class TokenClassificationArguments(TrainingArgs):
|
||||
return label_list
|
||||
|
||||
|
||||
args = TokenClassificationArguments.from_cli(task='token-classification')
|
||||
print(args)
|
||||
def cfg_modify_fn(cfg):
|
||||
if args.use_model_config:
|
||||
cfg.merge_from_dict(config)
|
||||
else:
|
||||
cfg = config
|
||||
labels = train_dataset[training_args.label] + validation_dataset[
|
||||
training_args.label]
|
||||
label_enumerate_values = get_label_list(labels)
|
||||
cfg.merge_from_dict({
|
||||
'preprocessor.label2id':
|
||||
{label: id
|
||||
for id, label in enumerate(label_enumerate_values)}
|
||||
})
|
||||
cfg.merge_from_dict({'model.num_labels': len(label_enumerate_values)})
|
||||
cfg.merge_from_dict({'preprocessor.use_fast': True})
|
||||
cfg.merge_from_dict({
|
||||
'evaluation.metrics': {
|
||||
'type': 'token-cls-metric',
|
||||
'label2id':
|
||||
{label: id
|
||||
for id, label in enumerate(label_enumerate_values)}
|
||||
}
|
||||
})
|
||||
if cfg.train.lr_scheduler.type == 'LinearLR':
|
||||
cfg.train.lr_scheduler['total_iters'] = \
|
||||
int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
|
||||
return cfg
|
||||
|
||||
# load dataset
|
||||
train_dataset = MsDataset.load(
|
||||
args.dataset_name,
|
||||
subset_name=args.subset_name,
|
||||
|
||||
if args.dataset_json_file is None:
|
||||
train_dataset = MsDataset.load(
|
||||
args.train_dataset_name,
|
||||
subset_name=args.train_subset_name,
|
||||
split='train',
|
||||
namespace='damo')['train']
|
||||
eval_dataset = MsDataset.load(
|
||||
args.dataset_name,
|
||||
subset_name=args.subset_name,
|
||||
namespace=args.train_dataset_namespace)['train']
|
||||
validation_dataset = MsDataset.load(
|
||||
args.train_dataset_name,
|
||||
subset_name=args.train_subset_name,
|
||||
split='validation',
|
||||
namespace='damo')['validation']
|
||||
namespace=args.train_dataset_namespace)['validation']
|
||||
else:
|
||||
train_dataset, validation_dataset = build_dataset_from_file(
|
||||
args.dataset_json_file)
|
||||
|
||||
kwargs = dict(
|
||||
model=args.model,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
eval_dataset=validation_dataset,
|
||||
work_dir=args.work_dir,
|
||||
cfg_modify_fn=args)
|
||||
cfg_modify_fn=cfg_modify_fn)
|
||||
|
||||
trainer = build_trainer(name=args.trainer, default_args=kwargs)
|
||||
trainer = EpochBasedTrainer(**kwargs)
|
||||
trainer.train()
|
||||
|
||||
@@ -1,15 +1,22 @@
|
||||
PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
--task 'token-classification' \
|
||||
--trainer 'nlp-base-trainer' \
|
||||
--work_dir './tmp' \
|
||||
--model 'damo/mgeo_backbone_chinese_base' \
|
||||
--dataset_name 'GeoGLUE' \
|
||||
--subset_name 'GeoETA' \
|
||||
--train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
|
||||
--train_dataset_name 'GeoGLUE' \
|
||||
--train_subset_name 'GeoETA' \
|
||||
--train_dataset_namespace 'damo' \
|
||||
--first_sequence 'tokens' \
|
||||
--eval_strategy by_step \
|
||||
--eval_interval 10 \
|
||||
--label 'ner_tags' \
|
||||
--sequence_length 128 \
|
||||
--preprocessor 'token-cls-tokenizer' \
|
||||
--preprocessor_padding 'max_length' \
|
||||
--max_epochs 1 \
|
||||
--mode 'inference' \
|
||||
--use_model_config True \
|
||||
--per_device_train_batch_size 32 \
|
||||
--train_data_worker 0 \
|
||||
--eval_data_worker 0 \
|
||||
--lr 3e-5 \
|
||||
--save_ckpt_strategy 'by_epoch' \
|
||||
--logging_interval 100 \
|
||||
--eval_strategy 'by_epoch' \
|
||||
|
||||
@@ -1,16 +1,22 @@
|
||||
PYTHONPATH=. torchrun examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
PYTHONPATH=. python examples/pytorch/token_classification/finetune_token_classification.py \
|
||||
--task 'token-classification' \
|
||||
--trainer 'nlp-base-trainer' \
|
||||
--work_dir './tmp' \
|
||||
--model 'damo/nlp_structbert_backbone_base_std' \
|
||||
--dataset_name 'GeoGLUE' \
|
||||
--subset_name 'GeoETA' \
|
||||
--train_dataset_params 'first_sequence=tokens,label=ner_tags,sequence_length=128' \
|
||||
--train_dataset_name 'GeoGLUE' \
|
||||
--train_subset_name 'GeoETA' \
|
||||
--train_dataset_namespace 'damo' \
|
||||
--first_sequence 'tokens' \
|
||||
--eval_strategy by_step \
|
||||
--eval_interval 20 \
|
||||
--label 'ner_tags' \
|
||||
--sequence_length 128 \
|
||||
--preprocessor 'token-cls-tokenizer' \
|
||||
--preprocessor_padding 'max_length' \
|
||||
--max_epochs 2 \
|
||||
--mode 'inference' \
|
||||
--use_model_config True \
|
||||
--per_device_train_batch_size 32 \
|
||||
--train_data_worker 0 \
|
||||
--eval_data_worker 0 \
|
||||
--lr 3e-5 \
|
||||
--save_ckpt_strategy 'by_epoch' \
|
||||
--logging_interval 1 \
|
||||
--eval_strategy 'by_step' \
|
||||
--eval_interval 20 \
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
{"framework":"pytorch","train":{"work_dir":"/tmp","max_epochs":10,"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0},"optimizer":{"type":"SGD","lr":0.001},"lr_scheduler":{"type":"StepLR","step_size":2},"hooks":[{"type":"CheckpointHook","interval":1}]},"evaluation":{"dataloader":{"batch_size_per_gpu":16,"workers_per_gpu":0,"shuffle":false}}}
|
||||
@@ -5,11 +5,11 @@ from datasets import load_dataset
|
||||
from transformers import (BertForSequenceClassification, BertTokenizerFast,
|
||||
default_data_collator)
|
||||
|
||||
from modelscope import TrainingArgs
|
||||
from modelscope.trainers import EpochBasedTrainer, build_trainer
|
||||
from modelscope.trainers.default_config import DEFAULT_CONFIG, TrainingArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
@dataclass(init=False)
|
||||
class TransformersArguments(TrainingArgs):
|
||||
|
||||
num_labels: int = field(
|
||||
@@ -17,13 +17,27 @@ class TransformersArguments(TrainingArgs):
|
||||
'help': 'The number of labels',
|
||||
})
|
||||
|
||||
sentence: str = field(
|
||||
default=None, metadata={
|
||||
'help': 'The sentence key',
|
||||
})
|
||||
|
||||
args = TransformersArguments.from_cli(
|
||||
task='text-classification', eval_metrics='seq-cls-metric')
|
||||
label: str = field(
|
||||
default=None, metadata={
|
||||
'help': 'The label key',
|
||||
})
|
||||
|
||||
print(args)
|
||||
|
||||
dataset = load_dataset(args.dataset_name, args.subset_name)
|
||||
training_args = TransformersArguments(
|
||||
task='text-classification', eval_metrics='seq-cls-metric').parse_cli()
|
||||
config, args = training_args.to_config()
|
||||
|
||||
print(config, args)
|
||||
|
||||
train_dataset = load_dataset(
|
||||
args.train_dataset_name, args.train_subset_name, split=args.train_split)
|
||||
val_dataset = load_dataset(
|
||||
args.val_dataset_name, args.val_subset_name, split=args.val_split)
|
||||
|
||||
model = BertForSequenceClassification.from_pretrained(
|
||||
args.model, num_labels=args.num_labels)
|
||||
@@ -31,26 +45,30 @@ tokenizer = BertTokenizerFast.from_pretrained(args.model)
|
||||
|
||||
|
||||
def tokenize_sentence(row):
|
||||
return tokenizer(row['sentence'], padding='max_length', max_length=128)
|
||||
return tokenizer(
|
||||
row[training_args.sentence], padding='max_length', max_length=128)
|
||||
|
||||
|
||||
# Extra columns, Rename columns
|
||||
dataset = dataset.map(tokenize_sentence).remove_columns(['sentence',
|
||||
'idx']).rename_column(
|
||||
'label', 'labels')
|
||||
train_dataset = train_dataset.map(tokenize_sentence)
|
||||
val_dataset = val_dataset.map(tokenize_sentence)
|
||||
if training_args.label != 'labels':
|
||||
train_dataset = train_dataset.rename_columns(
|
||||
{training_args.label: 'labels'})
|
||||
val_dataset = val_dataset.rename_columns({training_args.label: 'labels'})
|
||||
|
||||
cfg_file = os.path.join(args.work_dir or './', 'configuration.json')
|
||||
DEFAULT_CONFIG.dump(cfg_file)
|
||||
config.dump(cfg_file)
|
||||
|
||||
kwargs = dict(
|
||||
model=model,
|
||||
cfg_file=cfg_file,
|
||||
# data_collator
|
||||
data_collator=default_data_collator,
|
||||
train_dataset=dataset['train'],
|
||||
eval_dataset=dataset['validation'],
|
||||
seed=args.seed,
|
||||
cfg_modify_fn=args)
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=val_dataset,
|
||||
remove_unused_data=True,
|
||||
seed=args.seed)
|
||||
|
||||
os.environ['LOCAL_RANK'] = str(args.local_rank)
|
||||
trainer: EpochBasedTrainer = build_trainer(name='trainer', default_args=kwargs)
|
||||
|
||||
@@ -1,5 +1,14 @@
|
||||
PYTHONPATH=. python examples/pytorch/transformers/finetune_transformers_model.py \
|
||||
--model bert-base-uncased \
|
||||
--num_labels 15 \
|
||||
--dataset_name clue \
|
||||
--subset_name tnews
|
||||
--train_dataset_name clue \
|
||||
--train_subset_name tnews \
|
||||
--train_split train \
|
||||
--val_dataset_name clue \
|
||||
--val_subset_name tnews \
|
||||
--train_split train \
|
||||
--val_split validation \
|
||||
--sentence sentence \
|
||||
--label label \
|
||||
--eval_strategy by_step \
|
||||
--eval_interval 100
|
||||
|
||||
@@ -1,4 +1,79 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from .version import __release_datetime__, __version__
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
__all__ = ['__version__', '__release_datetime__']
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .version import __release_datetime__, __version__
|
||||
from .trainers import EpochBasedTrainer, TrainingArgs, build_dataset_from_file
|
||||
from .trainers import Hook, Priority
|
||||
from .exporters import Exporter
|
||||
from .exporters import TfModelExporter
|
||||
from .exporters import TorchModelExporter
|
||||
from .hub.api import HubApi
|
||||
from .hub.snapshot_download import snapshot_download
|
||||
from .hub.push_to_hub import push_to_hub, push_to_hub_async
|
||||
from .hub.check_model import check_model_is_id, check_local_model_is_latest
|
||||
from .metrics import AudioNoiseMetric, Metric, task_default_metrics, ImageColorEnhanceMetric, ImageDenoiseMetric, \
|
||||
ImageInstanceSegmentationCOCOMetric, ImagePortraitEnhancementMetric, SequenceClassificationMetric, \
|
||||
TextGenerationMetric, TokenClassificationMetric, VideoSummarizationMetric, MovieSceneSegmentationMetric, \
|
||||
AccuracyMetric, BleuMetric, ImageInpaintingMetric, ReferringVideoObjectSegmentationMetric, \
|
||||
VideoFrameInterpolationMetric, VideoStabilizationMetric, VideoSuperResolutionMetric, PplMetric, \
|
||||
ImageQualityAssessmentDegradationMetric, ImageQualityAssessmentMosMetric, TextRankingMetric, \
|
||||
LossMetric, ImageColorizationMetric, OCRRecognitionMetric
|
||||
from .models import Model, TorchModel
|
||||
from .preprocessors import Preprocessor
|
||||
from .pipelines import Pipeline, pipeline
|
||||
from .utils.hub import read_config, create_model_if_not_exist
|
||||
from .utils.logger import get_logger
|
||||
from .msdatasets import MsDataset
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
'version': ['__release_datetime__', '__version__'],
|
||||
'trainers': [
|
||||
'EpochBasedTrainer', 'TrainingArgs', 'Hook', 'Priority',
|
||||
'build_dataset_from_file'
|
||||
],
|
||||
'exporters': [
|
||||
'Exporter',
|
||||
'TfModelExporter',
|
||||
'TorchModelExporter',
|
||||
],
|
||||
'hub.api': ['HubApi'],
|
||||
'hub.snapshot_download': ['snapshot_download'],
|
||||
'hub.push_to_hub': ['push_to_hub', 'push_to_hub_async'],
|
||||
'hub.check_model':
|
||||
['check_model_is_id', 'check_local_model_is_latest'],
|
||||
'metrics': [
|
||||
'AudioNoiseMetric', 'Metric', 'task_default_metrics',
|
||||
'ImageColorEnhanceMetric', 'ImageDenoiseMetric',
|
||||
'ImageInstanceSegmentationCOCOMetric',
|
||||
'ImagePortraitEnhancementMetric', 'SequenceClassificationMetric',
|
||||
'TextGenerationMetric', 'TokenClassificationMetric',
|
||||
'VideoSummarizationMetric', 'MovieSceneSegmentationMetric',
|
||||
'AccuracyMetric', 'BleuMetric', 'ImageInpaintingMetric',
|
||||
'ReferringVideoObjectSegmentationMetric',
|
||||
'VideoFrameInterpolationMetric', 'VideoStabilizationMetric',
|
||||
'VideoSuperResolutionMetric', 'PplMetric',
|
||||
'ImageQualityAssessmentDegradationMetric',
|
||||
'ImageQualityAssessmentMosMetric', 'TextRankingMetric',
|
||||
'LossMetric', 'ImageColorizationMetric', 'OCRRecognitionMetric'
|
||||
],
|
||||
'models': ['Model', 'TorchModel'],
|
||||
'preprocessors': ['Preprocessor'],
|
||||
'pipelines': ['Pipeline', 'pipeline'],
|
||||
'utils.hub': ['read_config', 'create_model_if_not_exist'],
|
||||
'utils.logger': ['get_logger'],
|
||||
'msdatasets': ['MsDataset']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyImportModule(
|
||||
__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
extra_objects={},
|
||||
)
|
||||
|
||||
@@ -122,10 +122,11 @@ class ${pipeline_name}(Pipeline):
|
||||
# Tips: usr_config_path is the temporary save configuration location, after upload modelscope hub, it is the model_id
|
||||
usr_config_path = '${configuration_path}'
|
||||
config = Config({
|
||||
'framework': 'pytorch',
|
||||
'task': '${task_name}',
|
||||
'model': {'type': 'my-custom-model'},
|
||||
"pipeline": {"type": "my-custom-pipeline"}
|
||||
"framework": 'pytorch',
|
||||
"task": '${task_name}',
|
||||
"model": {'type': 'my-custom-model'},
|
||||
"pipeline": {"type": "my-custom-pipeline"},
|
||||
"allow_remote": True
|
||||
})
|
||||
config.dump('${configuration_path}' + 'configuration.json')
|
||||
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .human_wholebody_keypoint import HumanWholeBodyKeypoint
|
||||
|
||||
from .ans_dfsmn_exporter import ANSDFSMNExporter
|
||||
else:
|
||||
_import_structure = {
|
||||
'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
|
||||
'ans_dfsmn_exporter': ['ANSDFSMNExporter'],
|
||||
}
|
||||
|
||||
import sys
|
||||
modelscope/exporters/audio/ans_dfsmn_exporter.py (new file, 62 lines)
@@ -0,0 +1,62 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os

import torch

from modelscope.exporters.builder import EXPORTERS
from modelscope.exporters.torch_model_exporter import TorchModelExporter
from modelscope.metainfo import Models
from modelscope.utils.constant import ModelFile, Tasks

INPUT_NAME = 'input'
OUTPUT_NAME = 'output'


@EXPORTERS.register_module(
    Tasks.acoustic_noise_suppression, module_name=Models.speech_dfsmn_ans)
class ANSDFSMNExporter(TorchModelExporter):

    def export_onnx(self, output_dir: str, opset=9, **kwargs):
        """Export the model as onnx format files.

        Args:
            output_dir: The output dir.
            opset: The version of the ONNX operator set to use.
            kwargs:
                device: The device used to forward.
        Returns:
            A dict containing the model key - model file path pairs.
        """
        model = self.model if 'model' not in kwargs else kwargs.pop('model')
        device_name = 'cpu' if 'device' not in kwargs else kwargs.pop('device')
        model_bin_file = os.path.join(model.model_dir,
                                      ModelFile.TORCH_MODEL_BIN_FILE)
        if os.path.exists(model_bin_file):
            checkpoint = torch.load(model_bin_file, map_location='cpu')
            model.load_state_dict(checkpoint)
        onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE)

        with torch.no_grad():
            model.eval()
            device = torch.device(device_name)
            model.to(device)
            model_script = torch.jit.script(model)
            fbank_input = torch.zeros((1, 3, 120), dtype=torch.float32)
            torch.onnx.export(
                model_script,
                fbank_input,
                onnx_file,
                opset_version=opset,
                input_names=[INPUT_NAME],
                output_names=[OUTPUT_NAME],
                dynamic_axes={
                    INPUT_NAME: {
                        0: 'batch_size',
                        1: 'number_of_frame'
                    },
                    OUTPUT_NAME: {
                        0: 'batch_size',
                        1: 'number_of_frame'
                    }
                })
        return {'model': onnx_file}
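A hedged usage sketch for the new exporter: the model id below is an assumption for illustration, and Exporter.from_model is the generic entry point that resolves this class through the EXPORTERS registry rather than anything added by this commit.

import os

from modelscope.exporters import Exporter
from modelscope.models import Model

# Assumed model id of a DFSMN acoustic-noise-suppression checkpoint.
model = Model.from_pretrained('damo/speech_dfsmn_ans_psm_48k_causal')
os.makedirs('./onnx_export', exist_ok=True)
exporter = Exporter.from_model(model)
# Returns {'model': '<output_dir>/<ModelFile.ONNX_MODEL_FILE>'} on success.
print(exporter.export_onnx(output_dir='./onnx_export', opset=9))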
@@ -6,6 +6,7 @@ import functools
|
||||
import os
|
||||
import pickle
|
||||
import platform
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
@@ -15,10 +16,10 @@ from http.cookiejar import CookieJar
|
||||
from os.path import expanduser
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import requests
|
||||
from requests import Session
|
||||
from requests.adapters import HTTPAdapter, Retry
|
||||
|
||||
from modelscope import __version__
|
||||
from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT,
|
||||
API_RESPONSE_FIELD_DATA,
|
||||
API_RESPONSE_FIELD_EMAIL,
|
||||
@@ -45,7 +46,7 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
|
||||
MASTER_MODEL_BRANCH, DatasetFormations,
|
||||
DatasetMetaFormats,
|
||||
DatasetVisibilityMap, DownloadChannel,
|
||||
ModelFile)
|
||||
ModelFile, VirgoDatasetConfig)
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .utils.utils import (get_endpoint, get_release_datetime,
|
||||
model_id_to_group_owner_name)
|
||||
@@ -160,6 +161,7 @@ class HubApi:
|
||||
'Visibility': visibility, # server check
|
||||
'License': license,
|
||||
'OriginalModelId': original_model_id,
|
||||
'TrainId': os.environ.get('MODELSCOPE_TRAIN_ID', ''),
|
||||
}
|
||||
r = self.session.post(
|
||||
path, json=body, cookies=cookies, headers=self.headers)
|
||||
@@ -236,8 +238,10 @@ class HubApi:
|
||||
license: Optional[str] = Licenses.APACHE_V2,
|
||||
chinese_name: Optional[str] = None,
|
||||
commit_message: Optional[str] = 'upload model',
|
||||
tag: Optional[str] = None,
|
||||
revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
|
||||
original_model_id: Optional[str] = None):
|
||||
original_model_id: Optional[str] = None,
|
||||
ignore_file_pattern: Optional[Union[List[str], str]] = None):
|
||||
"""Upload model from a given directory to given repository. A valid model directory
|
||||
must contain a configuration.json file.
|
||||
|
||||
@@ -268,10 +272,13 @@ class HubApi:
|
||||
chinese name of the new created model.
|
||||
commit_message(`str`, *optional*, defaults to `None`):
|
||||
commit message of the push request.
|
||||
tag(`str`, *optional*, defaults to `None`):
|
||||
The tag on this commit
|
||||
revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION):
|
||||
which branch to push. If the branch is not exists, It will create a new
|
||||
branch and push to it.
|
||||
original_model_id (str, optional): The base model id which this model is trained from
|
||||
ignore_file_pattern (`Union[List[str], str]`, optional): The file pattern to ignore uploading
|
||||
|
||||
Raises:
|
||||
InvalidParameter: Parameter invalid.
|
||||
@@ -292,6 +299,10 @@ class HubApi:
|
||||
if cookies is None:
|
||||
raise NotLoginException('Must login before upload!')
|
||||
files_to_save = os.listdir(model_dir)
|
||||
if ignore_file_pattern is None:
|
||||
ignore_file_pattern = []
|
||||
if isinstance(ignore_file_pattern, str):
|
||||
ignore_file_pattern = [ignore_file_pattern]
|
||||
try:
|
||||
self.get_model(model_id=model_id)
|
||||
except Exception:
|
||||
@@ -325,6 +336,8 @@ class HubApi:
|
||||
shutil.rmtree(src, ignore_errors=True)
|
||||
for f in files_to_save:
|
||||
if f[0] != '.':
|
||||
if any([re.search(pattern, f) is not None for pattern in ignore_file_pattern]):
|
||||
continue
|
||||
src = os.path.join(model_dir, f)
|
||||
if os.path.isdir(src):
|
||||
shutil.copytree(src, os.path.join(tmp_dir, f))
|
||||
@@ -338,6 +351,8 @@ class HubApi:
|
||||
commit_message=commit_message,
|
||||
local_branch=revision,
|
||||
remote_branch=revision)
|
||||
if tag is not None:
|
||||
repo.tag_and_push(tag, tag)
|
||||
except Exception:
|
||||
raise
|
||||
finally:
|
||||
@@ -581,6 +596,17 @@ class HubApi:
|
||||
file_list = file_list['Files']
|
||||
return file_list
|
||||
|
||||
@staticmethod
|
||||
def dump_datatype_file(dataset_type: int, meta_cache_dir: str):
|
||||
"""
|
||||
Dump the data_type as a local file, in order to get the dataset formation without calling the datahub.
|
||||
More details, please refer to the class `modelscope.utils.constant.DatasetFormations`.
|
||||
"""
|
||||
dataset_type_file_path = os.path.join(meta_cache_dir,
|
||||
f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
|
||||
with open(dataset_type_file_path, 'w') as fp:
|
||||
fp.write('*** Automatically-generated file, do not modify ***')
|
||||
|
||||
def get_dataset_meta_files_local_paths(self, dataset_name: str,
|
||||
namespace: str,
|
||||
revision: str,
|
||||
@@ -591,10 +617,7 @@ class HubApi:
|
||||
cookies = ModelScopeConfig.get_cookies()
|
||||
|
||||
# Dump the data_type as a local file
|
||||
dataset_type_file_path = os.path.join(meta_cache_dir,
|
||||
f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}')
|
||||
with open(dataset_type_file_path, 'w') as fp:
|
||||
fp.write('*** Automatically-generated file, do not modify ***')
|
||||
HubApi.dump_datatype_file(dataset_type=dataset_type, meta_cache_dir=meta_cache_dir)
|
||||
|
||||
for file_info in file_list:
|
||||
file_path = file_info['Path']
|
||||
@@ -661,7 +684,6 @@ class HubApi:
|
||||
cookies = self._check_cookie(use_cookies=True)
|
||||
else:
|
||||
cookies = ModelScopeConfig.get_cookies()
|
||||
r = self.session.get(url=datahub_url, cookies=cookies, headers=self.headers)
|
||||
|
||||
r = self.session.get(
|
||||
url=datahub_url, cookies=cookies, headers=self.headers)
|
||||
@@ -669,6 +691,31 @@ class HubApi:
|
||||
raise_on_error(resp)
|
||||
return resp['Data']
|
||||
|
||||
def get_virgo_meta(self, dataset_id: str, version: int = 1) -> dict:
|
||||
"""
|
||||
Get virgo dataset meta info.
|
||||
"""
|
||||
virgo_endpoint = os.environ.get(VirgoDatasetConfig.env_virgo_endpoint, '')
|
||||
if not virgo_endpoint:
|
||||
raise RuntimeError(f'Virgo endpoint is not set in env: {VirgoDatasetConfig.env_virgo_endpoint}')
|
||||
|
||||
virgo_dataset_url = f'{virgo_endpoint}/data/set/download'
|
||||
cookies = requests.utils.dict_from_cookiejar(ModelScopeConfig.get_cookies())
|
||||
|
||||
dataset_info = dict(
|
||||
dataSetId=dataset_id,
|
||||
dataSetVersion=version
|
||||
)
|
||||
data = dict(
|
||||
data=dataset_info,
|
||||
)
|
||||
r = self.session.post(url=virgo_dataset_url, json=data, cookies=cookies, headers=self.headers, timeout=900)
|
||||
resp = r.json()
|
||||
if resp['code'] != 0:
|
||||
raise RuntimeError(f'Failed to get virgo dataset: {resp}')
|
||||
|
||||
return resp['data']
|
||||
|
||||
def get_dataset_access_config_for_unzipped(self,
|
||||
dataset_name: str,
|
||||
namespace: str,
|
||||
@@ -895,6 +942,7 @@ class ModelScopeConfig:
|
||||
if MODELSCOPE_CLOUD_USERNAME in os.environ:
|
||||
user_name = os.environ[MODELSCOPE_CLOUD_USERNAME]
|
||||
|
||||
from modelscope import __version__
|
||||
ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
|
||||
__version__,
|
||||
platform.python_version(),
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from http import HTTPStatus
|
||||
|
||||
import requests
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from modelscope.utils.logger import get_logger
|
||||
@@ -57,13 +58,22 @@ def is_ok(rsp):
|
||||
return rsp['Code'] == HTTPStatus.OK and rsp['Success']
|
||||
|
||||
|
||||
def _decode_response_error(response: requests.Response):
|
||||
if 'application/json' in response.headers.get('content-type', ''):
|
||||
message = response.json()
|
||||
else:
|
||||
message = response.content.decode('utf-8')
|
||||
return message
|
||||
|
||||
|
||||
def handle_http_post_error(response, url, request_body):
|
||||
try:
|
||||
response.raise_for_status()
|
||||
except HTTPError as error:
|
||||
logger.error('Request %s with body: %s exception' %
|
||||
(url, request_body))
|
||||
logger.error('Response details: %s' % response.content)
|
||||
message = _decode_response_error(response)
|
||||
logger.error('Response details: %s' % message)
|
||||
raise error
|
||||
|
||||
|
||||
@@ -75,7 +85,8 @@ def handle_http_response(response, logger, cookies, model_id):
|
||||
logger.error(
|
||||
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
|
||||
private. Please login first.')
|
||||
logger.error('Response details: %s' % response.content)
|
||||
message = _decode_response_error(response)
|
||||
logger.error('Response details: %s' % message)
|
||||
raise error
|
||||
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@ import requests
|
||||
from requests.adapters import Retry
|
||||
from tqdm import tqdm
|
||||
|
||||
from modelscope import __version__
|
||||
from modelscope.hub.api import HubApi, ModelScopeConfig
|
||||
from modelscope.hub.constants import (API_FILE_DOWNLOAD_CHUNK_SIZE,
|
||||
API_FILE_DOWNLOAD_RETRY_TIMES,
|
||||
|
||||
@@ -55,16 +55,10 @@ class GitCommandWrapper(metaclass=Singleton):
|
||||
response.check_returncode()
|
||||
return response
|
||||
except subprocess.CalledProcessError as error:
|
||||
if response.returncode == 1:
|
||||
logger.info('Nothing to commit.')
|
||||
return response
|
||||
else:
|
||||
logger.error(
|
||||
'There are error run git command, you may need to login first.'
|
||||
)
|
||||
raise GitError('stdout: %s, stderr: %s' %
|
||||
(response.stdout.decode('utf8'),
|
||||
error.stderr.decode('utf8')))
|
||||
logger.error('There are error run git command.')
|
||||
raise GitError(
|
||||
'stdout: %s, stderr: %s' %
|
||||
(response.stdout.decode('utf8'), error.stderr.decode('utf8')))
|
||||
|
||||
def config_auth_token(self, repo_dir, auth_token):
|
||||
url = self.get_repo_remote_url(repo_dir)
|
||||
@@ -199,8 +193,11 @@ class GitCommandWrapper(metaclass=Singleton):
|
||||
else:
|
||||
return ['/'.join(line.split('/')[1:]) for line in info[1:]]
|
||||
|
||||
def pull(self, repo_dir: str):
|
||||
cmds = ['-C', repo_dir, 'pull']
|
||||
def pull(self,
|
||||
repo_dir: str,
|
||||
remote: str = 'origin',
|
||||
branch: str = 'master'):
|
||||
cmds = ['-C', repo_dir, 'pull', remote, branch]
|
||||
return self._run_git_command(*cmds)
|
||||
|
||||
def push(self,
|
||||
|
||||
@@ -4,8 +4,8 @@ import concurrent.futures
|
||||
import os
|
||||
|
||||
from modelscope.hub.api import HubApi
|
||||
from modelscope.hub.constants import Licenses, ModelVisibility
|
||||
from modelscope.hub.errors import NotExistError
|
||||
from modelscope.hub.constants import ModelVisibility
|
||||
from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
@@ -18,7 +18,10 @@ def _api_push_to_hub(repo_name,
|
||||
token,
|
||||
private=True,
|
||||
commit_message='',
|
||||
source_repo=''):
|
||||
tag=None,
|
||||
source_repo='',
|
||||
ignore_file_pattern=None,
|
||||
revision=DEFAULT_REPOSITORY_REVISION):
|
||||
try:
|
||||
api = HubApi()
|
||||
api.login(token)
|
||||
@@ -29,7 +32,10 @@ def _api_push_to_hub(repo_name,
|
||||
if not private else ModelVisibility.PRIVATE,
|
||||
chinese_name=repo_name,
|
||||
commit_message=commit_message,
|
||||
original_model_id=source_repo)
|
||||
tag=tag,
|
||||
original_model_id=source_repo,
|
||||
ignore_file_pattern=ignore_file_pattern,
|
||||
revision=revision)
|
||||
commit_message = commit_message or 'No commit message'
|
||||
logger.info(
|
||||
f'Successfully upload the model to {repo_name} with message: {commit_message}'
|
||||
@@ -48,7 +54,10 @@ def push_to_hub(repo_name,
|
||||
private=True,
|
||||
retry=3,
|
||||
commit_message='',
|
||||
source_repo=''):
|
||||
tag=None,
|
||||
source_repo='',
|
||||
ignore_file_pattern=None,
|
||||
revision=DEFAULT_REPOSITORY_REVISION):
|
||||
"""
|
||||
Args:
|
||||
repo_name: The repo name for the modelhub repo
|
||||
@@ -57,13 +66,18 @@ def push_to_hub(repo_name,
|
||||
private: If is a private repo, default True
|
||||
retry: Retry times if something error in uploading, default 3
|
||||
commit_message: The commit message
|
||||
tag: The tag of this commit
|
||||
source_repo: The source repo (model id) which this model comes from
|
||||
|
||||
ignore_file_pattern: The file pattern to be ignored in uploading.
|
||||
revision: The branch to commit to
|
||||
Returns:
|
||||
The boolean value to represent whether the model is uploaded.
|
||||
"""
|
||||
if token is None:
|
||||
token = os.environ.get('MODELSCOPE_API_TOKEN')
|
||||
if ignore_file_pattern is None:
|
||||
ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
|
||||
assert repo_name is not None
|
||||
assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
|
||||
assert os.path.isdir(output_dir)
|
||||
assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
|
||||
@@ -73,7 +87,8 @@ def push_to_hub(repo_name,
|
||||
f'Uploading {output_dir} to {repo_name} with message {commit_message}')
|
||||
for i in range(retry):
|
||||
if _api_push_to_hub(repo_name, output_dir, token, private,
|
||||
commit_message, source_repo):
|
||||
commit_message, tag, source_repo,
|
||||
ignore_file_pattern, revision):
|
||||
return True
|
||||
return False
|
||||
|
||||
@@ -83,7 +98,10 @@ def push_to_hub_async(repo_name,
|
||||
token=None,
|
||||
private=True,
|
||||
commit_message='',
|
||||
source_repo=''):
|
||||
tag=None,
|
||||
source_repo='',
|
||||
ignore_file_pattern=None,
|
||||
revision=DEFAULT_REPOSITORY_REVISION):
|
||||
"""
|
||||
Args:
|
||||
repo_name: The repo name for the modelhub repo
|
||||
@@ -91,13 +109,18 @@ def push_to_hub_async(repo_name,
|
||||
token: The user api token, function will check the `MODELSCOPE_API_TOKEN` variable if this argument is None
|
||||
private: If is a private repo, default True
|
||||
commit_message: The commit message
|
||||
tag: The tag of this commit
|
||||
source_repo: The source repo (model id) which this model comes from
|
||||
|
||||
ignore_file_pattern: The file pattern to be ignored in uploading
|
||||
revision: The branch to commit to
|
||||
Returns:
|
||||
A handler to check the result and the status
|
||||
"""
|
||||
if token is None:
|
||||
token = os.environ.get('MODELSCOPE_API_TOKEN')
|
||||
if ignore_file_pattern is None:
|
||||
ignore_file_pattern = os.environ.get('UPLOAD_IGNORE_FILE_PATTERN')
|
||||
assert repo_name is not None
|
||||
assert token is not None, 'Either pass in a token or to set `MODELSCOPE_API_TOKEN` in the environment variables.'
|
||||
assert os.path.isdir(output_dir)
|
||||
assert 'configuration.json' in os.listdir(output_dir) or 'configuration.yaml' in os.listdir(output_dir) \
|
||||
@@ -106,4 +129,5 @@ def push_to_hub_async(repo_name,
|
||||
logger.info(
|
||||
f'Uploading {output_dir} to {repo_name} with message {commit_message}')
|
||||
return _executor.submit(_api_push_to_hub, repo_name, output_dir, token,
|
||||
private, commit_message, source_repo)
|
||||
private, commit_message, tag, source_repo,
|
||||
ignore_file_pattern, revision)
|
||||
|
||||
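For reference, a sketch of calling push_to_hub with the newly added parameters; all values below are illustrative and the defaults come from the signature in this hunk.

from modelscope.hub.push_to_hub import push_to_hub

push_to_hub(
    repo_name='my-namespace/my-model',  # illustrative repo id
    output_dir='./work_dir/output',  # must contain configuration.json or configuration.yaml
    token=None,  # falls back to the MODELSCOPE_API_TOKEN environment variable
    tag='v1.0',  # new: tag pushed after the commit
    ignore_file_pattern=[r'.*\.safetensors'],  # new: regex patterns skipped during upload
    revision='master')  # new: target branch (defaults to DEFAULT_REPOSITORY_REVISION)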
@@ -88,6 +88,26 @@ class Repository:
|
||||
remote = None
|
||||
return remote
|
||||
|
||||
def pull(self, remote: str = 'origin', branch: str = 'master'):
|
||||
"""Pull remote branch
|
||||
|
||||
Args:
|
||||
remote (str, optional): The remote name. Defaults to 'origin'.
|
||||
branch (str, optional): The remote branch. Defaults to 'master'.
|
||||
"""
|
||||
self.git_wrapper.pull(self.model_dir, remote=remote, branch=branch)
|
||||
|
||||
def add_lfs_type(self, file_name_suffix: str):
|
||||
"""Add file suffix to lfs list.
|
||||
|
||||
Args:
|
||||
file_name_suffix (str): The file name suffix.
|
||||
examples '*.safetensors'
|
||||
"""
|
||||
os.system(
|
||||
"printf '%s filter=lfs diff=lfs merge=lfs -text\n'>>%s" %
|
||||
(file_name_suffix, os.path.join(self.model_dir, '.gitattributes')))
|
||||
|
||||
def push(self,
|
||||
commit_message: str,
|
||||
local_branch: Optional[str] = DEFAULT_REPOSITORY_REVISION,
|
||||
@@ -120,7 +140,6 @@ class Repository:
|
||||
self.model_repo_name)
|
||||
|
||||
url = self.git_wrapper.get_repo_remote_url(self.model_dir)
|
||||
self.git_wrapper.pull(self.model_dir)
|
||||
|
||||
self.git_wrapper.add(self.model_dir, all_files=True)
|
||||
self.git_wrapper.commit(self.model_dir, commit_message)
|
||||
|
||||
@@ -116,15 +116,9 @@ class Models(object):
|
||||
bad_image_detecting = 'bad-image-detecting'
|
||||
controllable_image_generation = 'controllable-image-generation'
|
||||
longshortnet = 'longshortnet'
|
||||
fastinst = 'fastinst'
|
||||
pedestrian_attribute_recognition = 'pedestrian-attribute-recognition'
|
||||
|
||||
# EasyCV models
|
||||
yolox = 'YOLOX'
|
||||
segformer = 'Segformer'
|
||||
hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
|
||||
image_object_detection_auto = 'image-object-detection-auto'
|
||||
dino = 'DINO'
|
||||
|
||||
# nlp models
|
||||
bert = 'bert'
|
||||
palm = 'palm-v2'
|
||||
@@ -177,6 +171,7 @@ class Models(object):
|
||||
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
|
||||
speech_dfsmn_ans = 'speech_dfsmn_ans'
|
||||
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
|
||||
speech_dfsmn_kws_char_farfield_iot = 'speech_dfsmn_kws_char_farfield_iot'
|
||||
speech_kws_fsmn_char_ctc_nearfield = 'speech_kws_fsmn_char_ctc_nearfield'
|
||||
speech_mossformer_separation_temporal_8k = 'speech_mossformer_separation_temporal_8k'
|
||||
kws_kwsbp = 'kws-kwsbp'
|
||||
@@ -187,6 +182,9 @@ class Models(object):
|
||||
generic_sv = 'generic-sv'
|
||||
ecapa_tdnn_sv = 'ecapa-tdnn-sv'
|
||||
campplus_sv = 'cam++-sv'
|
||||
eres2net_sv = 'eres2net-sv'
|
||||
scl_sd = 'scl-sd'
|
||||
rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv'
|
||||
generic_lm = 'generic-lm'
|
||||
|
||||
# multi-modal models
|
||||
@@ -205,6 +203,8 @@ class Models(object):
|
||||
hitea = 'hitea'
|
||||
soonet = 'soonet'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
mplug_owl = 'mplug-owl'
|
||||
clip_interrogator = 'clip-interrogator'
|
||||
|
||||
# science models
|
||||
unifold = 'unifold'
|
||||
@@ -255,6 +255,7 @@ class Pipelines(object):
|
||||
should use task name for this pipeline.
|
||||
For pipeline which suuport only one model, we should use ${Model}-${Task} as its name.
|
||||
"""
|
||||
pipeline_template = 'pipeline-template'
|
||||
# vision tasks
|
||||
portrait_matting = 'unet-image-matting'
|
||||
universal_matting = 'unet-universal-matting'
|
||||
@@ -277,8 +278,6 @@ class Pipelines(object):
|
||||
tbs_detection = 'tbs-detection'
|
||||
object_detection = 'vit-object-detection'
|
||||
abnormal_object_detection = 'abnormal-object-detection'
|
||||
easycv_detection = 'easycv-detection'
|
||||
easycv_segmentation = 'easycv-segmentation'
|
||||
face_2d_keypoints = 'mobilenet_face-2d-keypoints_alignment'
|
||||
salient_detection = 'u2net-salient-detection'
|
||||
salient_boudary_detection = 'res2net-salient-detection'
|
||||
@@ -347,7 +346,6 @@ class Pipelines(object):
|
||||
video_single_object_tracking_procontext = 'procontext-vitb-video-single-object-tracking'
|
||||
video_multi_object_tracking = 'video-multi-object-tracking'
|
||||
image_panoptic_segmentation = 'image-panoptic-segmentation'
|
||||
image_panoptic_segmentation_easycv = 'image-panoptic-segmentation-easycv'
|
||||
video_summarization = 'googlenet_pgl_video_summarization'
|
||||
language_guided_video_summarization = 'clip-it-video-summarization'
|
||||
image_semantic_segmentation = 'image-semantic-segmentation'
|
||||
@@ -402,7 +400,7 @@ class Pipelines(object):
|
||||
nerf_recon_acc = 'nerf-recon-acc'
|
||||
bad_image_detecting = 'bad-image-detecting'
|
||||
controllable_image_generation = 'controllable-image-generation'
|
||||
|
||||
fast_instance_segmentation = 'fast-instance-segmentation'
|
||||
image_quality_assessment_mos = 'image-quality-assessment-mos'
|
||||
image_quality_assessment_man = 'image-quality-assessment-man'
|
||||
image_quality_assessment_degradation = 'image-quality-assessment-degradation'
|
||||
@@ -485,6 +483,9 @@ class Pipelines(object):
|
||||
speaker_diarization_inference = 'speaker-diarization-inference'
|
||||
vad_inference = 'vad-inference'
|
||||
speaker_verification = 'speaker-verification'
|
||||
speaker_verification_rdino = 'speaker-verification-rdino'
|
||||
speaker_verification_eres2net = 'speaker-verification-eres2net'
|
||||
speaker_change_locating = 'speaker-change-locating'
|
||||
lm_inference = 'language-score-prediction'
|
||||
speech_timestamp_inference = 'speech-timestamp-inference'
|
||||
|
||||
@@ -514,6 +515,7 @@ class Pipelines(object):
|
||||
gridvlp_multi_modal_embedding = 'gridvlp-multi-modal-embedding'
|
||||
soonet_video_temporal_grounding = 'soonet-video-temporal-grounding'
|
||||
efficient_diffusion_tuning = 'efficient-diffusion-tuning'
|
||||
multimodal_dialogue = 'multimodal-dialogue'
|
||||
|
||||
# science tasks
|
||||
protein_structure = 'unifold-protein-structure'
|
||||
@@ -881,6 +883,7 @@ class NLPTrainers(object):
|
||||
document_grounded_dialog_rerank_trainer = 'document-grounded-dialog-rerank-trainer'
|
||||
document_grounded_dialog_retrieval_trainer = 'document-grounded-dialog-retrieval-trainer'
|
||||
siamese_uie_trainer = 'siamese-uie-trainer'
|
||||
translation_evaluation_trainer = 'translation-evaluation-trainer'
|
||||
|
||||
|
||||
class MultiModalTrainers(object):
|
||||
@@ -911,7 +914,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
|
||||
"""
|
||||
|
||||
default = 'trainer'
|
||||
easycv = 'easycv'
|
||||
tinynas_damoyolo = 'tinynas-damoyolo'
|
||||
|
||||
@staticmethod
|
||||
@@ -933,8 +935,6 @@ class Trainers(CVTrainers, NLPTrainers, MultiModalTrainers, AudioTrainers):
|
||||
return Fields.multi_modal
|
||||
elif attribute_or_value == Trainers.default:
|
||||
return Trainers.default
|
||||
elif attribute_or_value == Trainers.easycv:
|
||||
return Trainers.easycv
|
||||
else:
|
||||
return 'unknown'
|
||||
|
||||
@@ -1034,6 +1034,8 @@ class Preprocessors(object):
|
||||
vldoc_preprocessor = 'vldoc-preprocessor'
|
||||
hitea_tasks_preprocessor = 'hitea-tasks-preprocessor'
|
||||
diffusion_image_generation_preprocessor = 'diffusion-image-generation-preprocessor'
|
||||
mplug_owl_preprocessor = 'mplug-owl-preprocessor'
|
||||
image_captioning_clip_interrogator_preprocessor = 'image-captioning-clip-interrogator-preprocessor'
|
||||
|
||||
# science preprocessor
|
||||
unifold_preprocessor = 'unifold-preprocessor'
|
||||
@@ -1098,6 +1100,8 @@ class Metrics(object):
|
||||
# metric for image-colorization task
|
||||
image_colorization_metric = 'image-colorization-metric'
|
||||
ocr_recognition_metric = 'ocr-recognition-metric'
|
||||
# metric for translation evaluation
|
||||
translation_evaluation_metric = 'translation-evaluation-metric'
|
||||
|
||||
|
||||
class Optimizers(object):
|
||||
@@ -1165,14 +1169,6 @@ class LR_Schedulers(object):
|
||||
class CustomDatasets(object):
|
||||
""" Names for different datasets.
|
||||
"""
|
||||
ClsDataset = 'ClsDataset'
|
||||
Face2dKeypointsDataset = 'FaceKeypointDataset'
|
||||
HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
|
||||
HumanWholeBodyKeypointDataset = 'WholeBodyCocoTopDownDataset'
|
||||
SegDataset = 'SegDataset'
|
||||
DetDataset = 'DetDataset'
|
||||
DetImagesMixDataset = 'DetImagesMixDataset'
|
||||
PanopticDataset = 'PanopticDataset'
|
||||
PairedDataset = 'PairedDataset'
|
||||
SiddDataset = 'SiddDataset'
|
||||
GoproDataset = 'GoproDataset'
|
||||
|
||||
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
|
||||
from .loss_metric import LossMetric
|
||||
from .image_colorization_metric import ImageColorizationMetric
|
||||
from .ocr_recognition_metric import OCRRecognitionMetric
|
||||
from .translation_evaluation_metric import TranslationEvaluationMetric
|
||||
else:
|
||||
_import_structure = {
|
||||
'audio_noise_metric': ['AudioNoiseMetric'],
|
||||
@@ -62,7 +63,8 @@ else:
|
||||
'text_ranking_metric': ['TextRankingMetric'],
|
||||
'loss_metric': ['LossMetric'],
|
||||
'image_colorization_metric': ['ImageColorizationMetric'],
-        'ocr_recognition_metric': ['OCRRecognitionMetric']
+        'ocr_recognition_metric': ['OCRRecognitionMetric'],
+        'translation_evaluation_metric': ['TranslationEvaluationMetric']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -42,6 +42,7 @@ class MetricKeys(object):
|
||||
NDCG = 'ndcg'
|
||||
AR = 'AR'
|
||||
Colorfulness = 'colorfulness'
|
||||
Kendall_Tau_Correlation = 'kendall_tau_correlation'
|
||||
|
||||
|
||||
task_default_metrics = {
|
||||
@@ -76,6 +77,7 @@ task_default_metrics = {
|
||||
Tasks.bad_image_detecting: [Metrics.accuracy],
|
||||
Tasks.ocr_recognition: [Metrics.ocr_recognition_metric],
|
||||
Tasks.efficient_diffusion_tuning: [Metrics.loss_metric],
|
||||
Tasks.translation_evaluation: [Metrics.translation_evaluation_metric]
|
||||
}
|
||||
|
||||
|
||||
|
||||
174
modelscope/metrics/translation_evaluation_metric.py
Normal file
@@ -0,0 +1,174 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import importlib
|
||||
from typing import Dict, List, Union
|
||||
|
||||
from pandas import DataFrame
|
||||
|
||||
from modelscope.metainfo import Metrics
|
||||
from modelscope.metrics.base import Metric
|
||||
from modelscope.metrics.builder import METRICS, MetricKeys
|
||||
from modelscope.models.nlp.unite.configuration import InputFormat
|
||||
from modelscope.utils.logger import get_logger
|
||||
from modelscope.utils.registry import default_group
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
@METRICS.register_module(
|
||||
group_key=default_group, module_name=Metrics.translation_evaluation_metric)
|
||||
class TranslationEvaluationMetric(Metric):
|
||||
r"""The metric class for translation evaluation.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, gap_threshold: float = 25.0):
|
||||
r"""Build a translation evaluation metric, following the designed
|
||||
Kendall's tau correlation from WMT Metrics Shared Task competitions.
|
||||
|
||||
Args:
|
||||
gap_threshold: The score gap denoting the available hypothesis pair.
|
||||
|
||||
Returns:
|
||||
A metric for translation evaluation.
|
||||
"""
|
||||
self.gap_threshold = gap_threshold
|
||||
|
||||
self.lp = list()
|
||||
self.segment_id = list()
|
||||
self.raw_score = list()
|
||||
self.score = list()
|
||||
self.input_format = list()
|
||||
|
||||
def clear(self) -> None:
|
||||
r"""Clear all the stored variables.
|
||||
"""
|
||||
self.lp.clear()
|
||||
self.segment_id.clear()
|
||||
self.raw_score.clear()
|
||||
self.input_format.clear()
|
||||
|
||||
self.score.clear()
|
||||
|
||||
return
|
||||
|
||||
def add(self, outputs: Dict[str, List[float]],
|
||||
inputs: Dict[str, List[Union[float, int]]]) -> None:
|
||||
r"""Collect the related results for processing.
|
||||
|
||||
Args:
|
||||
            outputs: Dict containing 'score', the predicted scores.
            inputs: Dict containing 'lp', 'segment_id', 'raw_score' and 'input_format'.
|
||||
|
||||
"""
|
||||
|
||||
self.lp += inputs['lp']
|
||||
self.segment_id += inputs['segment_id']
|
||||
self.raw_score += inputs['raw_score']
|
||||
self.input_format += inputs['input_format']
|
||||
|
||||
self.score += outputs['score']
|
||||
|
||||
return
|
||||
|
||||
def evaluate(self) -> Dict[str, Dict[str, float]]:
|
||||
r"""Compute the Kendall's tau correlation.
|
||||
|
||||
Returns:
|
||||
A dict denoting Kendall's tau correlation.
|
||||
|
||||
"""
|
||||
|
||||
data = {
|
||||
'lp': self.lp,
|
||||
'segment_id': self.segment_id,
|
||||
'raw_score': self.raw_score,
|
||||
'input_format': self.input_format,
|
||||
'score': self.score
|
||||
}
|
||||
data = DataFrame(data=data)
|
||||
correlation = dict()
|
||||
|
||||
for input_format in data.input_format.unique():
|
||||
logger.info('Evaluation results for %s input format'
|
||||
% input_format.value)
|
||||
input_format_data = data[data.input_format == input_format]
|
||||
|
||||
temp_correlation = dict()
|
||||
|
||||
for lp in sorted(input_format_data.lp.unique()):
|
||||
sub_data = input_format_data[input_format_data.lp == lp]
|
||||
temp_correlation[input_format.value + '_'
|
||||
+ lp] = self.compute_kendall_tau(sub_data)
|
||||
logger.info(
|
||||
'\t%s: %f' %
|
||||
(lp,
|
||||
temp_correlation[input_format.value + '_' + lp] * 100))
|
||||
|
||||
avg_correlation = sum(
|
||||
temp_correlation.values()) / len(temp_correlation)
|
||||
correlation[input_format.value + '_avg'] = avg_correlation
|
||||
logger.info('Average evaluation result for %s input format: %f' %
|
||||
(input_format.value, avg_correlation))
|
||||
logger.info('')
|
||||
correlation.update(temp_correlation)
|
||||
|
||||
return correlation
|
||||
|
||||
def merge(self, other: 'TranslationEvaluationMetric') -> None:
|
||||
r"""Merge the predictions from other TranslationEvaluationMetric objects.
|
||||
|
||||
Args:
|
||||
other: Another TranslationEvaluationMetric object.
|
||||
|
||||
"""
|
||||
|
||||
self.lp += other.lp
|
||||
        self.segment_id += other.segment_id
|
||||
self.raw_score += other.raw_score
|
||||
self.input_format += other.input_format
|
||||
|
||||
self.score += other.score
|
||||
|
||||
return
|
||||
|
||||
def compute_kendall_tau(self, csv_data: DataFrame) -> float:
|
||||
r"""Compute kendall's tau correlation.
|
||||
|
||||
Args:
|
||||
csv_data: The pandas dataframe.
|
||||
|
||||
Returns:
|
||||
            float: The Kendall's tau correlation.
|
||||
|
||||
"""
|
||||
concor = discor = 0
|
||||
|
||||
for segment_id in sorted(csv_data.segment_id.unique()):
|
||||
group_csv_data = csv_data[csv_data.segment_id == segment_id]
|
||||
|
||||
examples = group_csv_data.to_dict('records')
|
||||
|
||||
            for i in range(0, len(examples)):
                for j in range(i + 1, len(examples)):
                    # compare hypotheses within the current segment group
                    if examples[i]['raw_score'] - examples[j][
                            'raw_score'] >= self.gap_threshold:
                        if examples[i]['score'] > examples[j]['score']:
                            concor += 1
                        elif examples[i]['score'] < examples[j]['score']:
                            discor += 1
                    elif examples[i]['raw_score'] - examples[j][
                            'raw_score'] <= -self.gap_threshold:
                        if examples[i]['score'] < examples[j]['score']:
                            concor += 1
                        elif examples[i]['score'] > examples[j]['score']:
                            discor += 1
|
||||
|
||||
if concor + discor == 0:
|
||||
logger.warning(
|
||||
                'No available hypothesis pairs were found during evaluation. '
|
||||
'Marking the kendall tau correlation as the lowest value (-1.0).'
|
||||
)
|
||||
return -1.0
|
||||
else:
|
||||
return (concor - discor) / (concor + discor)
|
||||
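# Illustrative sketch, not from the original commit: a standalone toy run of the
# pairwise counting that compute_kendall_tau performs for one segment group.
# Numbers are made up; gap_threshold follows the default of 25.0.
gap_threshold = 25.0
group = [
    {'raw_score': 90.0, 'score': 0.82},
    {'raw_score': 60.0, 'score': 0.55},
    {'raw_score': 30.0, 'score': 0.61},
]
concor = discor = 0
for i in range(len(group)):
    for j in range(i + 1, len(group)):
        gap = group[i]['raw_score'] - group[j]['raw_score']
        if gap >= gap_threshold:
            if group[i]['score'] > group[j]['score']:
                concor += 1
            elif group[i]['score'] < group[j]['score']:
                discor += 1
        elif gap <= -gap_threshold:
            if group[i]['score'] < group[j]['score']:
                concor += 1
            elif group[i]['score'] > group[j]['score']:
                discor += 1
print((concor - discor) / (concor + discor))  # 0.333... for this toy data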
@@ -39,7 +39,7 @@ class ConvSTFT(nn.Module):
|
||||
super(ConvSTFT, self).__init__()
|
||||
|
||||
if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
        else:
            self.fft_len = fft_len
|
||||
|
||||
@@ -78,7 +78,7 @@ class ConviSTFT(nn.Module):
|
||||
fix=True):
|
||||
super(ConviSTFT, self).__init__()
|
||||
if fft_len is None:
-            self.fft_len = np.int(2**np.ceil(np.log2(win_len)))
+            self.fft_len = int(2**np.ceil(np.log2(win_len)))
        else:
            self.fft_len = fft_len
|
||||
kernel, window = init_kernels(
|
||||
|
||||
@@ -45,27 +45,5 @@ class GenericAutomaticSpeechRecognition(Model):
|
||||
def forward(self) -> Dict[str, Any]:
|
||||
"""preload model and return the info of the model
|
||||
"""
|
||||
if self.model_cfg['model_config']['type'] == Frameworks.tf:
|
||||
from easyasr import asr_inference_paraformer_tf
|
||||
if hasattr(asr_inference_paraformer_tf, 'preload'):
|
||||
model_workspace = self.model_cfg['model_workspace']
|
||||
model_path = os.path.join(model_workspace,
|
||||
self.model_cfg['am_model'])
|
||||
vocab_path = os.path.join(
|
||||
model_workspace,
|
||||
self.model_cfg['model_config']['vocab_file'])
|
||||
sampled_ids = 'seq2seq/sampled_ids'
|
||||
sampled_lengths = 'seq2seq/sampled_lengths'
|
||||
if 'sampled_ids' in self.model_cfg['model_config']:
|
||||
sampled_ids = self.model_cfg['model_config']['sampled_ids']
|
||||
if 'sampled_lengths' in self.model_cfg['model_config']:
|
||||
sampled_lengths = self.model_cfg['model_config'][
|
||||
'sampled_lengths']
|
||||
asr_inference_paraformer_tf.preload(
|
||||
ngpu=1,
|
||||
asr_model_file=model_path,
|
||||
vocab_file=vocab_path,
|
||||
sampled_ids=sampled_ids,
|
||||
sampled_lengths=sampled_lengths)
|
||||
|
||||
return self.model_cfg
|
||||
|
||||
233
modelscope/models/audio/kws/farfield/fsmn_sele_v3.py
Normal file
@@ -0,0 +1,233 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from .fsmn import AffineTransform, Fsmn, LinearTransform, RectifiedLinear
|
||||
from .model_def import HEADER_BLOCK_SIZE, ActivationType, LayerType, f32ToI32
|
||||
|
||||
|
||||
class DFSMNUnit(nn.Module):
|
||||
""" one multi-channel deep fsmn unit
|
||||
Args:
|
||||
dimin: input dimension
|
||||
dimexpand: feature expansion dimension
|
||||
dimout: output dimension
|
||||
        lorder: left order
|
||||
rorder: right order
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
dimin=64,
|
||||
dimexpand=128,
|
||||
dimout=64,
|
||||
lorder=10,
|
||||
rorder=1):
|
||||
super(DFSMNUnit, self).__init__()
|
||||
|
||||
self.expand = AffineTransform(dimin, dimexpand)
|
||||
self.shrink = LinearTransform(dimexpand, dimout)
|
||||
self.fsmn = Fsmn(dimout, dimout, lorder, rorder, 1, 1)
|
||||
|
||||
self.debug = False
|
||||
self.dataout = None
|
||||
|
||||
def forward(self, input):
|
||||
"""
|
||||
Args:
|
||||
input: [batch, time, feature]
|
||||
"""
|
||||
out1 = F.relu(self.expand(input))
|
||||
out2 = self.shrink(out1)
|
||||
out3 = self.fsmn(out2)
|
||||
|
||||
# add skip connection for matched data
|
||||
if input.shape[-1] == out3.shape[-1]:
|
||||
out3 = input + out3
|
||||
if self.debug:
|
||||
self.dataout = out3
|
||||
return out3
|
||||
|
||||
def print_model(self):
|
||||
self.expand.printModel()
|
||||
self.shrink.printModel()
|
||||
self.fsmn.printModel()
|
||||
|
||||
def to_kaldi_nnet(self):
|
||||
re_str = self.expand.toKaldiNNet()
|
||||
relu = RectifiedLinear(self.expand.linear.out_features,
|
||||
self.expand.linear.out_features)
|
||||
re_str += relu.toKaldiNNet()
|
||||
        re_str += self.shrink.toKaldiNNet()
|
||||
re_str += self.fsmn.toKaldiNNet()
|
||||
return re_str
|
||||
|
||||
|
||||
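def _dfsmn_unit_shape_check():
    # Illustrative sketch, not from the original commit. It assumes the Fsmn layer
    # imported above preserves the time dimension; then a unit whose input and
    # output dims match keeps its input shape and the skip connection in
    # DFSMNUnit.forward applies.
    unit = DFSMNUnit(dimin=64, dimexpand=128, dimout=64, lorder=10, rorder=1)
    out = unit(torch.randn(4, 100, 64))  # (batch, time, feature)
    print(out.shape)  # expected: torch.Size([4, 100, 64])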
class FSMNSeleNetV3(nn.Module):
|
||||
""" Deep FSMN model with channel selection performs multi-channel kws.
|
||||
Zhang, Shiliang, et al. "Deep-FSMN for large vocabulary continuous speech
|
||||
recognition." 2018 IEEE International Conference on Acoustics, Speech and
|
||||
Signal Processing (ICASSP). IEEE, 2018.
|
||||
|
||||
Args:
|
||||
input_dim: input dimension
|
||||
linear_dim: fsmn input dimension
|
||||
proj_dim: fsmn projection dimension
|
||||
lorder: fsmn left order
|
||||
rorder: fsmn right order
|
||||
num_syn: output dimension
|
||||
fsmn_layers: no. of fsmn units
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
input_dim=120,
|
||||
linear_dim=128,
|
||||
proj_dim=64,
|
||||
lorder=10,
|
||||
rorder=1,
|
||||
num_syn=5,
|
||||
fsmn_layers=5):
|
||||
super(FSMNSeleNetV3, self).__init__()
|
||||
|
||||
self.mem = []
|
||||
# the first unit, mapping input dim to proj dim
|
||||
unit = DFSMNUnit(input_dim, linear_dim, proj_dim, lorder, rorder)
|
||||
self.mem.append(unit)
|
||||
self.add_module('mem_{:d}'.format(0), unit)
|
||||
|
||||
# deep fsmn layers with skip connection
|
||||
for i in range(1, fsmn_layers):
|
||||
unit = DFSMNUnit(proj_dim, linear_dim, proj_dim, lorder, rorder)
|
||||
self.mem.append(unit)
|
||||
self.add_module('mem_{:d}'.format(i), unit)
|
||||
|
||||
self.expand2 = AffineTransform(proj_dim, linear_dim)
|
||||
self.decision = AffineTransform(linear_dim, num_syn)
|
||||
|
||||
def forward(self, input):
|
||||
# multi-channel temp space, [batch, time, channel, feature]
|
||||
if torch.cuda.is_available():
|
||||
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
|
||||
self.expand2.linear.out_features).cuda()
|
||||
else:
|
||||
x = torch.zeros(input.shape[0], input.shape[1], input.shape[2],
|
||||
self.expand2.linear.out_features)
|
||||
|
||||
for n in range(input.shape[2]):
|
||||
chin = input[:, :, n, :]
|
||||
|
||||
for unit in self.mem:
|
||||
chout = unit(chin)
|
||||
chin = chout
|
||||
|
||||
x[:, :, n, :] = F.relu(self.expand2(chout))
|
||||
|
||||
# perform max pooling
|
||||
pool = nn.MaxPool2d((x.shape[2], 1), stride=(x.shape[2], 1))
|
||||
y = pool(x)
|
||||
|
||||
# remove channel dimension
|
||||
y = torch.squeeze(y, -2)
|
||||
z = self.decision(y)
|
||||
|
||||
return z
|
||||
|
||||
def print_model(self):
|
||||
for unit in self.mem:
|
||||
unit.print_model()
|
||||
|
||||
self.expand2.printModel()
|
||||
self.decision.printModel()
|
||||
|
||||
def print_header(self):
|
||||
""" get DFSMN params
|
||||
"""
|
||||
input_dim = self.mem[0].expand.linear.in_features
|
||||
linear_dim = self.mem[0].expand.linear.out_features
|
||||
proj_dim = self.mem[0].shrink.linear.out_features
|
||||
lorder = self.mem[0].fsmn.conv_left.kernel_size[0]
|
||||
rorder = 0
|
||||
if self.mem[0].fsmn.conv_right is not None:
|
||||
rorder = self.mem[0].fsmn.conv_right.kernel_size[0]
|
||||
|
||||
num_syn = self.decision.linear.out_features
|
||||
fsmn_layers = len(self.mem)
|
||||
|
||||
# no. of output channels, 0.0 means the same as numins
|
||||
numouts = 1.0
|
||||
|
||||
#
|
||||
# write total header
|
||||
#
|
||||
header = [0.0] * HEADER_BLOCK_SIZE * 5
|
||||
# numins
|
||||
header[0] = 0.0
|
||||
# numouts
|
||||
header[1] = numouts
|
||||
# dimins
|
||||
header[2] = input_dim
|
||||
# dimouts
|
||||
header[3] = num_syn
|
||||
# numlayers
|
||||
header[4] = 4
|
||||
|
||||
#
|
||||
# write each layer's header
|
||||
#
|
||||
hidx = 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_DFSMN.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = input_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 4] = proj_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 5] = lorder
|
||||
header[HEADER_BLOCK_SIZE * hidx + 6] = rorder
|
||||
header[HEADER_BLOCK_SIZE * hidx + 7] = fsmn_layers
|
||||
hidx += 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_DENSE.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = proj_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 3] = linear_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
|
||||
ActivationType.ACTIVATION_RELU.value)
|
||||
hidx += 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_MAX_POOLING.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = 0.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
|
||||
hidx += 1
|
||||
|
||||
header[HEADER_BLOCK_SIZE * hidx + 0] = float(
|
||||
LayerType.LAYER_DENSE.value)
|
||||
header[HEADER_BLOCK_SIZE * hidx + 1] = numouts
|
||||
header[HEADER_BLOCK_SIZE * hidx + 2] = linear_dim
|
||||
header[HEADER_BLOCK_SIZE * hidx + 3] = num_syn
|
||||
header[HEADER_BLOCK_SIZE * hidx + 4] = 1.0
|
||||
header[HEADER_BLOCK_SIZE * hidx + 5] = float(
|
||||
ActivationType.ACTIVATION_SOFTMAX.value)
|
||||
|
||||
for h in header:
|
||||
print(f32ToI32(h))
|
||||
|
||||
def to_kaldi_nnet(self):
|
||||
re_str = '<Nnet>\n'
|
||||
for unit in self.mem:
|
||||
re_str += unit.to_kaldi_nnet()
|
||||
        re_str += self.expand2.toKaldiNNet()
|
||||
relu = RectifiedLinear(self.expand2.linear.out_features,
|
||||
self.expand2.linear.out_features)
|
||||
re_str += relu.toKaldiNNet()
|
||||
re_str += self.decision.toKaldiNNet()
|
||||
re_str += '<Softmax> %d %d\n' % (self.decision.linear.out_features,
|
||||
self.decision.linear.out_features)
|
||||
re_str += '<!EndOfComponent>\n'
|
||||
re_str += '</Nnet>\n'
|
||||
|
||||
return re_str
|
||||
@@ -11,6 +11,7 @@ from modelscope.models.builder import MODELS
|
||||
from modelscope.utils.audio.audio_utils import update_conf
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .fsmn_sele_v2 import FSMNSeleNetV2
|
||||
from .fsmn_sele_v3 import FSMNSeleNetV3
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
@@ -18,6 +19,7 @@ from .fsmn_sele_v2 import FSMNSeleNetV2
|
||||
class FSMNSeleNetV2Decorator(TorchModel):
|
||||
r""" A decorator of FSMNSeleNetV2 for integrating into modelscope framework """
|
||||
|
||||
MODEL_CLASS = FSMNSeleNetV2
|
||||
MODEL_TXT = 'model.txt'
|
||||
SC_CONFIG = 'sound_connect.conf'
|
||||
|
||||
@@ -33,7 +35,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
"""
|
||||
super().__init__(model_dir, *args, **kwargs)
|
||||
if training:
-            self.model = FSMNSeleNetV2(*args, **kwargs)
+            self.model = self.MODEL_CLASS(*args, **kwargs)
|
||||
else:
|
||||
sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
|
||||
model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
|
||||
@@ -42,7 +44,7 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
|
||||
self._sc = None
|
||||
if os.path.exists(model_txt_file):
-                conf_dict = dict(mode=56542, kws_model=model_txt_file)
+                conf_dict = dict(kws_model=model_txt_file)
|
||||
update_conf(sc_config_file, new_config_file, conf_dict)
|
||||
import py_sound_connect
|
||||
self._sc = py_sound_connect.SoundConnect(new_config_file)
|
||||
@@ -50,8 +52,8 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
self.size_out = self._sc.bytesPerBlockOut()
|
||||
else:
|
||||
raise Exception(
-                    f'Invalid model directory! Failed to load model file: {model_txt_file}.'
-                )
+                    f'Invalid model directory! Failed to load model file:'
+                    f' {model_txt_file}.')
|
||||
|
||||
def __del__(self):
|
||||
if hasattr(self, 'tmp_dir'):
|
||||
@@ -73,3 +75,24 @@ class FSMNSeleNetV2Decorator(TorchModel):
|
||||
'confidence': self._sc.kwsConfidence()
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.keyword_spotting,
|
||||
module_name=Models.speech_dfsmn_kws_char_farfield_iot)
|
||||
class FSMNSeleNetV3Decorator(FSMNSeleNetV2Decorator):
|
||||
r""" A decorator of FSMNSeleNetV3 for integrating into modelscope framework """
|
||||
|
||||
MODEL_CLASS = FSMNSeleNetV3
|
||||
|
||||
def __init__(self,
|
||||
model_dir: str,
|
||||
training: Optional[bool] = False,
|
||||
*args,
|
||||
**kwargs):
|
||||
"""initialize the dfsmn model from the `model_dir` path.
|
||||
|
||||
Args:
|
||||
model_dir (str): the model path.
|
||||
"""
|
||||
super().__init__(model_dir, training, *args, **kwargs)
|
||||
|
||||
@@ -76,11 +76,13 @@ class CAMPPlus(nn.Module):
|
||||
bn_size=4,
|
||||
init_channels=128,
|
||||
config_str='batchnorm-relu',
|
||||
-                 memory_efficient=True):
+                 memory_efficient=True,
+                 output_level='segment'):
|
||||
super(CAMPPlus, self).__init__()
|
||||
|
||||
self.head = FCM(feat_dim=feat_dim)
|
||||
channels = self.head.out_channels
|
||||
self.output_level = output_level
|
||||
|
||||
self.xvector = nn.Sequential(
|
||||
OrderedDict([
|
||||
@@ -118,10 +120,14 @@ class CAMPPlus(nn.Module):
|
||||
self.xvector.add_module('out_nonlinear',
|
||||
get_nonlinear(config_str, channels))
|
||||
|
||||
if self.output_level == 'segment':
|
||||
self.xvector.add_module('stats', StatsPool())
|
||||
self.xvector.add_module(
|
||||
'dense',
|
||||
-            DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
+            DenseLayer(
+                channels * 2, embedding_size, config_str='batchnorm_'))
|
||||
else:
|
||||
assert self.output_level == 'frame', '`output_level` should be set to \'segment\' or \'frame\'. '
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, (nn.Conv1d, nn.Linear)):
|
||||
@@ -133,6 +139,8 @@ class CAMPPlus(nn.Module):
|
||||
x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T)
|
||||
x = self.head(x)
|
||||
x = self.xvector(x)
|
||||
if self.output_level == 'frame':
|
||||
x = x.transpose(1, 2)
|
||||
return x
|
||||
|
||||
|
||||
|
||||
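# Illustrative sketch of the new `output_level` switch above, not from the
# original commit. Constructor defaults other than the keywords passed here are
# assumed; the import path matches the one used elsewhere in this commit.
import torch
from modelscope.models.audio.sv.DTDNN import CAMPPlus

feats = torch.randn(2, 200, 80)                             # (batch, frames, fbank bins)
segment_model = CAMPPlus(feat_dim=80)                       # default: one embedding per utterance
frame_model = CAMPPlus(feat_dim=80, output_level='frame')   # per-frame features, (B, T', C)
print(segment_model(feats).shape, frame_model(feats).shape)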
344
modelscope/models/audio/sv/ERes2Net.py
Normal file
@@ -0,0 +1,344 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
""" Res2Net implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||
ERes2Net incorporates both local and global feature fusion techniques to improve the performance. The local feature
|
||||
fusion (LFF) fuses the features within one single residual block to extract the local signal.
|
||||
The global feature fusion (GFF) takes acoustic features of different scales as input to aggregate global signal.
|
||||
"""
|
||||
import math
|
||||
import os
|
||||
from typing import Any, Dict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchaudio.compliance.kaldi as Kaldi
|
||||
|
||||
import modelscope.models.audio.sv.pooling_layers as pooling_layers
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import MODELS, TorchModel
|
||||
from modelscope.models.audio.sv.fusion import AFF
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
class ReLU(nn.Hardtanh):
|
||||
|
||||
def __init__(self, inplace=False):
|
||||
super(ReLU, self).__init__(0, 20, inplace)
|
||||
|
||||
def __repr__(self):
|
||||
inplace_str = 'inplace' if self.inplace else ''
|
||||
return self.__class__.__name__ + ' (' \
|
||||
+ inplace_str + ')'
|
||||
|
||||
|
||||
def conv1x1(in_planes, out_planes, stride=1):
|
||||
'1x1 convolution without padding'
|
||||
return nn.Conv2d(
|
||||
in_planes,
|
||||
out_planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
padding=0,
|
||||
bias=False)
|
||||
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1):
|
||||
'3x3 convolution with padding'
|
||||
return nn.Conv2d(
|
||||
in_planes,
|
||||
out_planes,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
padding=1,
|
||||
bias=False)
|
||||
|
||||
|
||||
class BasicBlockRes2Net(nn.Module):
|
||||
expansion = 2
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||
super(BasicBlockRes2Net, self).__init__()
|
||||
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||
self.conv1 = conv1x1(in_planes, width * scale, stride)
|
||||
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||
self.nums = scale
|
||||
|
||||
convs = []
|
||||
bns = []
|
||||
for i in range(self.nums):
|
||||
convs.append(conv3x3(width, width))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = conv1x1(width * scale, planes * self.expansion)
|
||||
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_planes,
|
||||
self.expansion * planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias=False), nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out, self.width, 1)
|
||||
for i in range(self.nums):
|
||||
if i == 0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = sp + spx[i]
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i == 0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out, sp), 1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class BasicBlockRes2Net_diff_AFF(nn.Module):
|
||||
expansion = 2
|
||||
|
||||
def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2):
|
||||
super(BasicBlockRes2Net_diff_AFF, self).__init__()
|
||||
width = int(math.floor(planes * (baseWidth / 64.0)))
|
||||
self.conv1 = conv1x1(in_planes, width * scale, stride)
|
||||
self.bn1 = nn.BatchNorm2d(width * scale)
|
||||
self.nums = scale
|
||||
|
||||
convs = []
|
||||
fuse_models = []
|
||||
bns = []
|
||||
for i in range(self.nums):
|
||||
convs.append(conv3x3(width, width))
|
||||
bns.append(nn.BatchNorm2d(width))
|
||||
for j in range(self.nums - 1):
|
||||
fuse_models.append(AFF(channels=width))
|
||||
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.bns = nn.ModuleList(bns)
|
||||
self.fuse_models = nn.ModuleList(fuse_models)
|
||||
self.relu = ReLU(inplace=True)
|
||||
|
||||
self.conv3 = conv1x1(width * scale, planes * self.expansion)
|
||||
self.bn3 = nn.BatchNorm2d(planes * self.expansion)
|
||||
self.shortcut = nn.Sequential()
|
||||
if stride != 1 or in_planes != self.expansion * planes:
|
||||
self.shortcut = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_planes,
|
||||
self.expansion * planes,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias=False), nn.BatchNorm2d(self.expansion * planes))
|
||||
self.stride = stride
|
||||
self.width = width
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
spx = torch.split(out, self.width, 1)
|
||||
for i in range(self.nums):
|
||||
if i == 0:
|
||||
sp = spx[i]
|
||||
else:
|
||||
sp = self.fuse_models[i - 1](sp, spx[i])
|
||||
|
||||
sp = self.convs[i](sp)
|
||||
sp = self.relu(self.bns[i](sp))
|
||||
if i == 0:
|
||||
out = sp
|
||||
else:
|
||||
out = torch.cat((out, sp), 1)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
residual = self.shortcut(x)
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ERes2Net(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
block=BasicBlockRes2Net,
|
||||
block_fuse=BasicBlockRes2Net_diff_AFF,
|
||||
num_blocks=[3, 4, 6, 3],
|
||||
m_channels=32,
|
||||
feat_dim=80,
|
||||
embed_dim=192,
|
||||
pooling_func='TSTP',
|
||||
two_emb_layer=False):
|
||||
super(ERes2Net, self).__init__()
|
||||
self.in_planes = m_channels
|
||||
self.feat_dim = feat_dim
|
||||
self.embed_dim = embed_dim
|
||||
self.stats_dim = int(feat_dim / 8) * m_channels * 8
|
||||
self.two_emb_layer = two_emb_layer
|
||||
|
||||
self.conv1 = nn.Conv2d(
|
||||
1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
|
||||
self.bn1 = nn.BatchNorm2d(m_channels)
|
||||
self.layer1 = self._make_layer(
|
||||
block, m_channels, num_blocks[0], stride=1)
|
||||
self.layer2 = self._make_layer(
|
||||
block, m_channels * 2, num_blocks[1], stride=2)
|
||||
self.layer3 = self._make_layer(
|
||||
block_fuse, m_channels * 4, num_blocks[2], stride=2)
|
||||
self.layer4 = self._make_layer(
|
||||
block_fuse, m_channels * 8, num_blocks[3], stride=2)
|
||||
|
||||
# downsampling
|
||||
self.layer1_downsample = nn.Conv2d(
|
||||
m_channels * 2,
|
||||
m_channels * 4,
|
||||
kernel_size=3,
|
||||
stride=2,
|
||||
padding=1,
|
||||
bias=False)
|
||||
self.layer2_downsample = nn.Conv2d(
|
||||
m_channels * 4,
|
||||
m_channels * 8,
|
||||
kernel_size=3,
|
||||
padding=1,
|
||||
stride=2,
|
||||
bias=False)
|
||||
self.layer3_downsample = nn.Conv2d(
|
||||
m_channels * 8,
|
||||
m_channels * 16,
|
||||
kernel_size=3,
|
||||
padding=1,
|
||||
stride=2,
|
||||
bias=False)
|
||||
|
||||
# bottom-up fusion
|
||||
self.fuse_mode12 = AFF(channels=m_channels * 4)
|
||||
self.fuse_mode123 = AFF(channels=m_channels * 8)
|
||||
self.fuse_mode1234 = AFF(channels=m_channels * 16)
|
||||
|
||||
self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2
|
||||
self.pool = getattr(pooling_layers, pooling_func)(
|
||||
in_dim=self.stats_dim * block.expansion)
|
||||
self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats,
|
||||
embed_dim)
|
||||
if self.two_emb_layer:
|
||||
self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False)
|
||||
self.seg_2 = nn.Linear(embed_dim, embed_dim)
|
||||
else:
|
||||
self.seg_bn_1 = nn.Identity()
|
||||
self.seg_2 = nn.Identity()
|
||||
|
||||
def _make_layer(self, block, planes, num_blocks, stride):
|
||||
strides = [stride] + [1] * (num_blocks - 1)
|
||||
layers = []
|
||||
for stride in strides:
|
||||
layers.append(block(self.in_planes, planes, stride))
|
||||
self.in_planes = planes * block.expansion
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = x.permute(0, 2, 1)
|
||||
|
||||
x = x.unsqueeze_(1)
|
||||
out = F.relu(self.bn1(self.conv1(x)))
|
||||
out1 = self.layer1(out)
|
||||
|
||||
# bottom-up fusion
|
||||
out2 = self.layer2(out1)
|
||||
out1_downsample = self.layer1_downsample(out1)
|
||||
fuse_out12 = self.fuse_mode12(out2, out1_downsample)
|
||||
|
||||
out3 = self.layer3(out2)
|
||||
fuse_out12_downsample = self.layer2_downsample(fuse_out12)
|
||||
fuse_out123 = self.fuse_mode123(out3, fuse_out12_downsample)
|
||||
|
||||
out4 = self.layer4(out3)
|
||||
fuse_out123_downsample = self.layer3_downsample(fuse_out123)
|
||||
fuse_out1234 = self.fuse_mode1234(out4, fuse_out123_downsample)
|
||||
stats = self.pool(fuse_out1234)
|
||||
|
||||
embed_a = self.seg_1(stats)
|
||||
if self.two_emb_layer:
|
||||
out = F.relu(embed_a)
|
||||
out = self.seg_bn_1(out)
|
||||
embed_b = self.seg_2(out)
|
||||
return embed_b
|
||||
else:
|
||||
return embed_a
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.speaker_verification, module_name=Models.eres2net_sv)
|
||||
class SpeakerVerificationERes2Net(TorchModel):
|
||||
r"""Enhanced Res2Net architecture with local and global feature fusion. ERes2Net is mainly composed
|
||||
    of LFF and GFF. The LFF extracts localization-preserved speaker features and strengthens the local information
    interaction. The GFF fuses multi-scale feature maps in a bottom-up pathway to obtain global information.
|
||||
Args:
|
||||
model_dir: A model dir.
|
||||
model_config: The model config.
|
||||
"""
|
||||
|
||||
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
|
||||
**kwargs):
|
||||
super().__init__(model_dir, model_config, *args, **kwargs)
|
||||
self.model_config = model_config
|
||||
self.other_config = kwargs
|
||||
self.feature_dim = 80
|
||||
|
||||
self.embedding_model = ERes2Net()
|
||||
|
||||
pretrained_model_name = kwargs['pretrained_model']
|
||||
self.__load_check_point(pretrained_model_name)
|
||||
|
||||
self.embedding_model.eval()
|
||||
|
||||
def forward(self, audio):
|
||||
assert len(audio.shape) == 2 and audio.shape[
|
||||
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
|
||||
# audio shape: [1, T]
|
||||
feature = self.__extract_feature(audio)
|
||||
embedding = self.embedding_model(feature)
|
||||
|
||||
return embedding
|
||||
|
||||
def __extract_feature(self, audio):
|
||||
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
|
||||
feature = feature - feature.mean(dim=0, keepdim=True)
|
||||
feature = feature.unsqueeze(0)
|
||||
return feature
|
||||
|
||||
def __load_check_point(self, pretrained_model_name, device=None):
|
||||
if not device:
|
||||
device = torch.device('cpu')
|
||||
self.embedding_model.load_state_dict(
|
||||
torch.load(
|
||||
os.path.join(self.model_dir, pretrained_model_name),
|
||||
map_location=device),
|
||||
strict=True)
|
||||
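# Illustrative sketch, not from the original commit. With the defaults above
# (feat_dim=80, embed_dim=192, two_emb_layer=False) the backbone maps fbank
# features of shape (B, T, 80) to a (B, 192) utterance embedding.
_net = ERes2Net()
_emb = _net(torch.randn(1, 200, 80))   # (batch, frames, mel bins)
print(_emb.shape)                      # expected: torch.Size([1, 192])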
32
modelscope/models/audio/sv/fusion.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class AFF(nn.Module):
|
||||
|
||||
def __init__(self, channels=64, r=4):
|
||||
super(AFF, self).__init__()
|
||||
inter_channels = int(channels // r)
|
||||
|
||||
self.local_att = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
channels * 2,
|
||||
inter_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0),
|
||||
nn.BatchNorm2d(inter_channels),
|
||||
nn.SiLU(inplace=True),
|
||||
nn.Conv2d(
|
||||
inter_channels, channels, kernel_size=1, stride=1, padding=0),
|
||||
nn.BatchNorm2d(channels),
|
||||
)
|
||||
|
||||
def forward(self, x, ds_y):
|
||||
xa = torch.cat((x, ds_y), dim=1)
|
||||
x_att = self.local_att(xa)
|
||||
x_att = 1.0 + torch.tanh(x_att)
|
||||
xo = torch.mul(x, x_att) + torch.mul(ds_y, 2.0 - x_att)
|
||||
|
||||
return xo
|
||||
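# Illustrative sketch, not from the original commit. AFF blends two feature maps
# of identical shape with a learned, tanh-bounded attention weight.
_aff = AFF(channels=64)
_x = torch.randn(2, 64, 40, 50)      # (B, C, F, T) features from the current stage
_y = torch.randn(2, 64, 40, 50)      # downsampled features projected to the same shape
print(_aff(_x, _y).shape)            # same shape as the inputs: torch.Size([2, 64, 40, 50])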
107
modelscope/models/audio/sv/pooling_layers.py
Normal file
@@ -0,0 +1,107 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
""" This implementation is adapted from https://github.com/wenet-e2e/wespeaker.
|
||||
"""
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class TAP(nn.Module):
|
||||
"""
|
||||
Temporal average pooling, only first-order mean is considered
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(TAP, self).__init__()
|
||||
|
||||
def forward(self, x):
|
||||
pooling_mean = x.mean(dim=-1)
|
||||
        # To be compatible with 2D input
|
||||
pooling_mean = pooling_mean.flatten(start_dim=1)
|
||||
return pooling_mean
|
||||
|
||||
|
||||
class TSDP(nn.Module):
|
||||
"""
|
||||
Temporal standard deviation pooling, only second-order std is considered
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(TSDP, self).__init__()
|
||||
|
||||
def forward(self, x):
|
||||
# The last dimension is the temporal axis
|
||||
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
|
||||
pooling_std = pooling_std.flatten(start_dim=1)
|
||||
return pooling_std
|
||||
|
||||
|
||||
class TSTP(nn.Module):
|
||||
"""
|
||||
Temporal statistics pooling, concatenate mean and std, which is used in
|
||||
x-vector
|
||||
Comment: simple concatenation can not make full use of both statistics
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super(TSTP, self).__init__()
|
||||
|
||||
def forward(self, x):
|
||||
# The last dimension is the temporal axis
|
||||
pooling_mean = x.mean(dim=-1)
|
||||
pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-8)
|
||||
pooling_mean = pooling_mean.flatten(start_dim=1)
|
||||
pooling_std = pooling_std.flatten(start_dim=1)
|
||||
|
||||
stats = torch.cat((pooling_mean, pooling_std), 1)
|
||||
return stats
|
||||
|
||||
|
||||
class ASTP(nn.Module):
|
||||
""" Attentive statistics pooling: Channel- and context-dependent
|
||||
statistics pooling, first used in ECAPA_TDNN.
|
||||
"""
|
||||
|
||||
def __init__(self, in_dim, bottleneck_dim=128, global_context_att=False):
|
||||
super(ASTP, self).__init__()
|
||||
self.global_context_att = global_context_att
|
||||
|
||||
# Use Conv1d with stride == 1 rather than Linear, then we don't
|
||||
# need to transpose inputs.
|
||||
if global_context_att:
|
||||
self.linear1 = nn.Conv1d(
|
||||
in_dim * 3, bottleneck_dim,
|
||||
kernel_size=1) # equals W and b in the paper
|
||||
else:
|
||||
self.linear1 = nn.Conv1d(
|
||||
in_dim, bottleneck_dim,
|
||||
kernel_size=1) # equals W and b in the paper
|
||||
self.linear2 = nn.Conv1d(
|
||||
bottleneck_dim, in_dim,
|
||||
kernel_size=1) # equals V and k in the paper
|
||||
|
||||
def forward(self, x):
|
||||
"""
|
||||
x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
|
||||
or a 4-dimensional tensor in resnet architecture (B,C,F,T)
|
||||
0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
|
||||
"""
|
||||
if len(x.shape) == 4:
|
||||
x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
|
||||
assert len(x.shape) == 3
|
||||
|
||||
if self.global_context_att:
|
||||
context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
|
||||
context_std = torch.sqrt(
|
||||
torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
|
||||
x_in = torch.cat((x, context_mean, context_std), dim=1)
|
||||
else:
|
||||
x_in = x
|
||||
|
||||
# DON'T use ReLU here! ReLU may be hard to converge.
|
||||
alpha = torch.tanh(
|
||||
self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
|
||||
alpha = torch.softmax(self.linear2(alpha), dim=2)
|
||||
mean = torch.sum(alpha * x, dim=2)
|
||||
var = torch.sum(alpha * (x**2), dim=2) - mean**2
|
||||
std = torch.sqrt(var.clamp(min=1e-10))
|
||||
return torch.cat([mean, std], dim=1)
|
||||
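# Illustrative shape summary, not from the original commit. For a (B, F, T)
# input with F=256 feature channels, the pooling layers above return:
#   TAP  -> (B, 256)   mean only
#   TSDP -> (B, 256)   std only
#   TSTP -> (B, 512)   mean and std concatenated
#   ASTP -> (B, 512)   attention-weighted mean and std
_x = torch.randn(4, 256, 100)
print(TSTP()(_x).shape, ASTP(in_dim=256)(_x).shape)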
573
modelscope/models/audio/sv/rdino.py
Normal file
@@ -0,0 +1,573 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain.
|
||||
RDINOHead implementation is adapted from DINO framework.
|
||||
"""
|
||||
import math
|
||||
import os
|
||||
from typing import Any, Dict, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchaudio.compliance.kaldi as Kaldi
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import MODELS, TorchModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
def length_to_mask(length, max_len=None, dtype=None, device=None):
|
||||
assert len(length.shape) == 1
|
||||
|
||||
if max_len is None:
|
||||
max_len = length.max().long().item()
|
||||
mask = torch.arange(
|
||||
max_len, device=length.device, dtype=length.dtype).expand(
|
||||
len(length), max_len) < length.unsqueeze(1)
|
||||
|
||||
if dtype is None:
|
||||
dtype = length.dtype
|
||||
|
||||
if device is None:
|
||||
device = length.device
|
||||
|
||||
mask = torch.as_tensor(mask, dtype=dtype, device=device)
|
||||
return mask
|
||||
|
||||
|
||||
def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
|
||||
if stride > 1:
|
||||
n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
|
||||
L_out = stride * (n_steps - 1) + kernel_size * dilation
|
||||
padding = [kernel_size // 2, kernel_size // 2]
|
||||
|
||||
else:
|
||||
L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
|
||||
|
||||
padding = [(L_in - L_out) // 2, (L_in - L_out) // 2]
|
||||
return padding
|
||||
|
||||
|
||||
class Conv1d(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
in_channels,
|
||||
stride=1,
|
||||
dilation=1,
|
||||
padding='same',
|
||||
groups=1,
|
||||
bias=True,
|
||||
padding_mode='reflect',
|
||||
):
|
||||
super().__init__()
|
||||
self.kernel_size = kernel_size
|
||||
self.stride = stride
|
||||
self.dilation = dilation
|
||||
self.padding = padding
|
||||
self.padding_mode = padding_mode
|
||||
|
||||
self.conv = nn.Conv1d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
self.kernel_size,
|
||||
stride=self.stride,
|
||||
dilation=self.dilation,
|
||||
padding=0,
|
||||
groups=groups,
|
||||
bias=bias,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
if self.padding == 'same':
|
||||
x = self._manage_padding(x, self.kernel_size, self.dilation,
|
||||
self.stride)
|
||||
|
||||
elif self.padding == 'causal':
|
||||
num_pad = (self.kernel_size - 1) * self.dilation
|
||||
x = F.pad(x, (num_pad, 0))
|
||||
|
||||
elif self.padding == 'valid':
|
||||
pass
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"Padding must be 'same', 'valid' or 'causal'. Got "
|
||||
+ self.padding)
|
||||
|
||||
wx = self.conv(x)
|
||||
|
||||
return wx
|
||||
|
||||
def _manage_padding(
|
||||
self,
|
||||
x,
|
||||
kernel_size: int,
|
||||
dilation: int,
|
||||
stride: int,
|
||||
):
|
||||
L_in = x.shape[-1]
|
||||
padding = get_padding_elem(L_in, stride, kernel_size, dilation)
|
||||
x = F.pad(x, padding, mode=self.padding_mode)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
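def _conv1d_same_padding_check():
    # Illustrative sketch, not from the original commit. With padding='same' and
    # stride=1, the Conv1d wrapper above pads (via get_padding_elem) so that the
    # time dimension is preserved, even with dilation > 1.
    conv = Conv1d(out_channels=8, kernel_size=5, in_channels=4, dilation=2)
    x = torch.randn(2, 4, 100)   # (batch, channels, time)
    print(conv(x).shape)         # expected: torch.Size([2, 8, 100])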
class BatchNorm1d(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size,
|
||||
eps=1e-05,
|
||||
momentum=0.1,
|
||||
):
|
||||
super().__init__()
|
||||
self.norm = nn.BatchNorm1d(
|
||||
input_size,
|
||||
eps=eps,
|
||||
momentum=momentum,
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.norm(x)
|
||||
|
||||
|
||||
class TDNNBlock(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
dilation,
|
||||
activation=nn.ReLU,
|
||||
groups=1,
|
||||
):
|
||||
super(TDNNBlock, self).__init__()
|
||||
self.conv = Conv1d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
)
|
||||
self.activation = activation()
|
||||
self.norm = BatchNorm1d(input_size=out_channels)
|
||||
|
||||
def forward(self, x):
|
||||
return self.norm(self.activation(self.conv(x)))
|
||||
|
||||
|
||||
class Res2NetBlock(torch.nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
scale=8,
|
||||
kernel_size=3,
|
||||
dilation=1):
|
||||
super(Res2NetBlock, self).__init__()
|
||||
assert in_channels % scale == 0
|
||||
assert out_channels % scale == 0
|
||||
|
||||
in_channel = in_channels // scale
|
||||
hidden_channel = out_channels // scale
|
||||
|
||||
self.blocks = nn.ModuleList([
|
||||
TDNNBlock(
|
||||
in_channel,
|
||||
hidden_channel,
|
||||
kernel_size=kernel_size,
|
||||
dilation=dilation,
|
||||
) for i in range(scale - 1)
|
||||
])
|
||||
self.scale = scale
|
||||
|
||||
def forward(self, x):
|
||||
y = []
|
||||
for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
|
||||
if i == 0:
|
||||
y_i = x_i
|
||||
elif i == 1:
|
||||
y_i = self.blocks[i - 1](x_i)
|
||||
else:
|
||||
y_i = self.blocks[i - 1](x_i + y_i)
|
||||
y.append(y_i)
|
||||
y = torch.cat(y, dim=1)
|
||||
return y
|
||||
|
||||
|
||||
class SEBlock(nn.Module):
|
||||
|
||||
def __init__(self, in_channels, se_channels, out_channels):
|
||||
super(SEBlock, self).__init__()
|
||||
|
||||
self.conv1 = Conv1d(
|
||||
in_channels=in_channels, out_channels=se_channels, kernel_size=1)
|
||||
self.relu = torch.nn.ReLU(inplace=True)
|
||||
self.conv2 = Conv1d(
|
||||
in_channels=se_channels, out_channels=out_channels, kernel_size=1)
|
||||
self.sigmoid = torch.nn.Sigmoid()
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
L = x.shape[-1]
|
||||
if lengths is not None:
|
||||
mask = length_to_mask(lengths * L, max_len=L, device=x.device)
|
||||
mask = mask.unsqueeze(1)
|
||||
total = mask.sum(dim=2, keepdim=True)
|
||||
s = (x * mask).sum(dim=2, keepdim=True) / total
|
||||
else:
|
||||
s = x.mean(dim=2, keepdim=True)
|
||||
|
||||
s = self.relu(self.conv1(s))
|
||||
s = self.sigmoid(self.conv2(s))
|
||||
|
||||
return s * x
|
||||
|
||||
|
||||
class AttentiveStatisticsPooling(nn.Module):
|
||||
|
||||
def __init__(self, channels, attention_channels=128, global_context=True):
|
||||
super().__init__()
|
||||
|
||||
self.eps = 1e-12
|
||||
self.global_context = global_context
|
||||
if global_context:
|
||||
self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
|
||||
else:
|
||||
self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)
|
||||
self.tanh = nn.Tanh()
|
||||
self.conv = Conv1d(
|
||||
in_channels=attention_channels,
|
||||
out_channels=channels,
|
||||
kernel_size=1)
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
L = x.shape[-1]
|
||||
|
||||
def _compute_statistics(x, m, dim=2, eps=self.eps):
|
||||
mean = (m * x).sum(dim)
|
||||
std = torch.sqrt(
|
||||
(m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
|
||||
return mean, std
|
||||
|
||||
if lengths is None:
|
||||
lengths = torch.ones(x.shape[0], device=x.device)
|
||||
|
||||
# Make binary mask of shape [N, 1, L]
|
||||
mask = length_to_mask(lengths * L, max_len=L, device=x.device)
|
||||
mask = mask.unsqueeze(1)
|
||||
|
||||
# Expand the temporal context of the pooling layer by allowing the
|
||||
# self-attention to look at global properties of the utterance.
|
||||
if self.global_context:
|
||||
# torch.std is unstable for backward computation
|
||||
# https://github.com/pytorch/pytorch/issues/4320
|
||||
total = mask.sum(dim=2, keepdim=True).float()
|
||||
mean, std = _compute_statistics(x, mask / total)
|
||||
mean = mean.unsqueeze(2).repeat(1, 1, L)
|
||||
std = std.unsqueeze(2).repeat(1, 1, L)
|
||||
attn = torch.cat([x, mean, std], dim=1)
|
||||
else:
|
||||
attn = x
|
||||
|
||||
# Apply layers
|
||||
attn = self.conv(self.tanh(self.tdnn(attn)))
|
||||
|
||||
# Filter out zero-paddings
|
||||
attn = attn.masked_fill(mask == 0, float('-inf'))
|
||||
|
||||
attn = F.softmax(attn, dim=2)
|
||||
mean, std = _compute_statistics(x, attn)
|
||||
# Append mean and std of the batch
|
||||
pooled_stats = torch.cat((mean, std), dim=1)
|
||||
pooled_stats = pooled_stats.unsqueeze(2)
|
||||
|
||||
return pooled_stats
|
||||
|
||||
|
||||
class SERes2NetBlock(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
res2net_scale=8,
|
||||
se_channels=128,
|
||||
kernel_size=1,
|
||||
dilation=1,
|
||||
activation=torch.nn.ReLU,
|
||||
groups=1,
|
||||
):
|
||||
super().__init__()
|
||||
self.out_channels = out_channels
|
||||
self.tdnn1 = TDNNBlock(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=1,
|
||||
dilation=1,
|
||||
activation=activation,
|
||||
groups=groups,
|
||||
)
|
||||
self.res2net_block = Res2NetBlock(out_channels, out_channels,
|
||||
res2net_scale, kernel_size, dilation)
|
||||
self.tdnn2 = TDNNBlock(
|
||||
out_channels,
|
||||
out_channels,
|
||||
kernel_size=1,
|
||||
dilation=1,
|
||||
activation=activation,
|
||||
groups=groups,
|
||||
)
|
||||
self.se_block = SEBlock(out_channels, se_channels, out_channels)
|
||||
|
||||
self.shortcut = None
|
||||
if in_channels != out_channels:
|
||||
self.shortcut = Conv1d(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=1,
|
||||
)
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
residual = x
|
||||
if self.shortcut:
|
||||
residual = self.shortcut(x)
|
||||
|
||||
x = self.tdnn1(x)
|
||||
x = self.res2net_block(x)
|
||||
x = self.tdnn2(x)
|
||||
x = self.se_block(x, lengths)
|
||||
|
||||
return x + residual
|
||||
|
||||
|
||||
class ECAPA_TDNN(nn.Module):
|
||||
"""An implementation of the speaker embedding model in a paper.
|
||||
"ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
|
||||
TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_size,
|
||||
device='cpu',
|
||||
lin_neurons=512,
|
||||
activation=torch.nn.ReLU,
|
||||
channels=[512, 512, 512, 512, 1536],
|
||||
kernel_sizes=[5, 3, 3, 3, 1],
|
||||
dilations=[1, 2, 3, 4, 1],
|
||||
attention_channels=128,
|
||||
res2net_scale=8,
|
||||
se_channels=128,
|
||||
global_context=True,
|
||||
groups=[1, 1, 1, 1, 1],
|
||||
):
|
||||
|
||||
super().__init__()
|
||||
assert len(channels) == len(kernel_sizes)
|
||||
assert len(channels) == len(dilations)
|
||||
self.channels = channels
|
||||
self.blocks = nn.ModuleList()
|
||||
|
||||
# The initial TDNN layer
|
||||
self.blocks.append(
|
||||
TDNNBlock(
|
||||
input_size,
|
||||
channels[0],
|
||||
kernel_sizes[0],
|
||||
dilations[0],
|
||||
activation,
|
||||
groups[0],
|
||||
))
|
||||
|
||||
# SE-Res2Net layers
|
||||
for i in range(1, len(channels) - 1):
|
||||
self.blocks.append(
|
||||
SERes2NetBlock(
|
||||
channels[i - 1],
|
||||
channels[i],
|
||||
res2net_scale=res2net_scale,
|
||||
se_channels=se_channels,
|
||||
kernel_size=kernel_sizes[i],
|
||||
dilation=dilations[i],
|
||||
activation=activation,
|
||||
groups=groups[i],
|
||||
))
|
||||
|
||||
# Multi-layer feature aggregation
|
||||
self.mfa = TDNNBlock(
|
||||
channels[-1],
|
||||
channels[-1],
|
||||
kernel_sizes[-1],
|
||||
dilations[-1],
|
||||
activation,
|
||||
groups=groups[-1],
|
||||
)
|
||||
|
||||
# Attentive Statistical Pooling
|
||||
self.asp = AttentiveStatisticsPooling(
|
||||
channels[-1],
|
||||
attention_channels=attention_channels,
|
||||
global_context=global_context,
|
||||
)
|
||||
self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)
|
||||
|
||||
# Final linear transformation
|
||||
self.fc = Conv1d(
|
||||
in_channels=channels[-1] * 2,
|
||||
out_channels=lin_neurons,
|
||||
kernel_size=1,
|
||||
)
|
||||
|
||||
def forward(self, x, lengths=None):
|
||||
"""Returns the embedding vector.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
x : torch.Tensor
|
||||
Tensor of shape (batch, time, channel).
|
||||
"""
|
||||
x = x.transpose(1, 2)
|
||||
|
||||
xl = []
|
||||
for layer in self.blocks:
|
||||
try:
|
||||
x = layer(x, lengths=lengths)
|
||||
except TypeError:
|
||||
x = layer(x)
|
||||
xl.append(x)
|
||||
|
||||
# Multi-layer feature aggregation
|
||||
x = torch.cat(xl[1:], dim=1)
|
||||
x = self.mfa(x)
|
||||
|
||||
# Attentive Statistical Pooling
|
||||
x = self.asp(x, lengths=lengths)
|
||||
x = self.asp_bn(x)
|
||||
|
||||
# Final linear transformation
|
||||
x = self.fc(x)
|
||||
|
||||
x = x.transpose(1, 2).squeeze(1)
|
||||
return x
|
||||
|
||||
|
||||
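def _ecapa_tdnn_shape_check():
    # Illustrative sketch, not from the original commit. With the default
    # lin_neurons=512, the backbone maps (batch, time, feature) fbank input to a
    # (batch, 512) utterance embedding.
    model = ECAPA_TDNN(input_size=80)
    emb = model(torch.randn(2, 200, 80))
    print(emb.shape)   # expected: torch.Size([2, 512])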
class RDINOHead(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
in_dim,
|
||||
out_dim,
|
||||
use_bn=False,
|
||||
norm_last_layer=True,
|
||||
nlayers=3,
|
||||
hidden_dim=2048,
|
||||
bottleneck_dim=256,
|
||||
add_dim=8192):
|
||||
super().__init__()
|
||||
nlayers = max(nlayers, 1)
|
||||
if nlayers == 1:
|
||||
self.mlp = nn.Linear(in_dim, bottleneck_dim)
|
||||
else:
|
||||
layers = [nn.Linear(in_dim, hidden_dim)]
|
||||
if use_bn:
|
||||
layers.append(nn.BatchNorm1d(hidden_dim))
|
||||
layers.append(nn.GELU())
|
||||
for _ in range(nlayers - 2):
|
||||
layers.append(nn.Linear(hidden_dim, hidden_dim))
|
||||
if use_bn:
|
||||
layers.append(nn.BatchNorm1d(hidden_dim))
|
||||
layers.append(nn.GELU())
|
||||
|
||||
layers.append(nn.Linear(hidden_dim, add_dim))
|
||||
self.mlp = nn.Sequential(*layers)
|
||||
self.add_layer = nn.Linear(add_dim, bottleneck_dim)
|
||||
self.apply(self._init_weights)
|
||||
self.last_layer = nn.utils.weight_norm(
|
||||
nn.Linear(bottleneck_dim, out_dim, bias=False))
|
||||
self.last_layer.weight_g.data.fill_(1)
|
||||
if norm_last_layer:
|
||||
self.last_layer.weight_g.requires_grad = False
|
||||
|
||||
def _init_weights(self, m):
|
||||
if isinstance(m, nn.Linear):
|
||||
torch.nn.init.trunc_normal_(m.weight, std=.02)
|
||||
if isinstance(m, nn.Linear) and m.bias is not None:
|
||||
nn.init.constant_(m.bias, 0)
|
||||
|
||||
def forward(self, x):
|
||||
vicr_out = self.mlp(x)
|
||||
x = self.add_layer(vicr_out)
|
||||
x = nn.functional.normalize(x, dim=-1, p=2)
|
||||
x = self.last_layer(x)
|
||||
return vicr_out, x
|
||||
|
||||
|
||||
class Combine(nn.Module):
|
||||
|
||||
def __init__(self, backbone, head):
|
||||
super(Combine, self).__init__()
|
||||
self.backbone = backbone
|
||||
self.head = head
|
||||
|
||||
def forward(self, x):
|
||||
x = self.backbone(x)
|
||||
output = self.head(x)
|
||||
return output
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.speaker_verification, module_name=Models.rdino_tdnn_sv)
|
||||
class SpeakerVerification_RDINO(TorchModel):
|
||||
|
||||
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
|
||||
**kwargs):
|
||||
super().__init__(model_dir, model_config, *args, **kwargs)
|
||||
self.model_config = model_config
|
||||
self.other_config = kwargs
|
||||
if self.model_config['channel'] != 1024:
|
||||
raise ValueError(
|
||||
'modelscope error: Currently only 1024-channel ecapa tdnn is supported.'
|
||||
)
|
||||
|
||||
self.feature_dim = 80
|
||||
channels_config = [1024, 1024, 1024, 1024, 3072]
|
||||
|
||||
self.embedding_model = ECAPA_TDNN(
|
||||
self.feature_dim, channels=channels_config)
|
||||
self.embedding_model = Combine(self.embedding_model,
|
||||
RDINOHead(512, 65536, True))
|
||||
|
||||
pretrained_model_name = kwargs['pretrained_model']
|
||||
self.__load_check_point(pretrained_model_name)
|
||||
|
||||
self.embedding_model.eval()
|
||||
|
||||
def forward(self, audio):
|
||||
assert len(audio.shape) == 2 and audio.shape[
|
||||
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
|
||||
# audio shape: [1, T]
|
||||
feature = self.__extract_feature(audio)
|
||||
embedding = self.embedding_model.backbone(feature)
|
||||
|
||||
return embedding
|
||||
|
||||
def __extract_feature(self, audio):
|
||||
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
|
||||
feature = feature - feature.mean(dim=0, keepdim=True)
|
||||
feature = feature.unsqueeze(0)
|
||||
return feature
|
||||
|
||||
def __load_check_point(self, pretrained_model_name, device=None):
|
||||
if not device:
|
||||
device = torch.device('cpu')
|
||||
state_dict = torch.load(
|
||||
os.path.join(self.model_dir, pretrained_model_name),
|
||||
map_location=device)
|
||||
state_dict_tea = {
|
||||
k.replace('module.', ''): v
|
||||
for k, v in state_dict['teacher'].items()
|
||||
}
|
||||
self.embedding_model.load_state_dict(state_dict_tea, strict=True)
|
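The private __extract_feature above amounts to a Kaldi fbank plus per-utterance mean normalization; a standalone sketch with a dummy 16 kHz mono waveform:

import torch
import torchaudio.compliance.kaldi as Kaldi

audio = torch.randn(1, 16000)                  # [1, T] waveform, as required by forward()
feat = Kaldi.fbank(audio, num_mel_bins=80)     # (num_frames, 80) filterbank features
feat = feat - feat.mean(dim=0, keepdim=True)   # per-utterance mean normalization
feat = feat.unsqueeze(0)                       # (1, num_frames, 80), fed to the encoder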
||||
modelscope/models/audio/sv/speaker_change_locator.py  (new file, 319 lines)
@@ -0,0 +1,319 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import Any, Dict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchaudio.compliance.kaldi as Kaldi
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import MODELS, TorchModel
|
||||
from modelscope.models.audio.sv.DTDNN import CAMPPlus
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
class MultiHeadSelfAttention(nn.Module):
|
||||
|
||||
def __init__(self, n_units, h=8, dropout=0.1):
|
||||
super(MultiHeadSelfAttention, self).__init__()
|
||||
self.linearQ = nn.Linear(n_units, n_units)
|
||||
self.linearK = nn.Linear(n_units, n_units)
|
||||
self.linearV = nn.Linear(n_units, n_units)
|
||||
self.linearO = nn.Linear(n_units, n_units)
|
||||
self.d_k = n_units // h
|
||||
self.h = h
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
self.att = None
|
||||
|
||||
def forward(self, x, batch_size):
|
||||
# x: (BT, F)
|
||||
q = self.linearQ(x).reshape(batch_size, -1, self.h, self.d_k)
|
||||
k = self.linearK(x).reshape(batch_size, -1, self.h, self.d_k)
|
||||
v = self.linearV(x).reshape(batch_size, -1, self.h, self.d_k)
|
||||
scores = torch.matmul(q.transpose(1, 2), k.permute(
|
||||
0, 2, 3, 1)) / np.sqrt(self.d_k)
|
||||
# scores: (B, h, T, T)
|
||||
self.att = F.softmax(scores, dim=3)
|
||||
p_att = self.dropout(self.att)
|
||||
# v : (B, T, h, d_k)
|
||||
# p_att : (B, h, T, T)
|
||||
x = torch.matmul(p_att, v.transpose(1, 2))
|
||||
# x : (B, h, T, d_k)
|
||||
x = x.transpose(1, 2).reshape(-1, self.h * self.d_k)
|
||||
return self.linearO(x)
|
||||
|
||||
|
||||
class PositionwiseFeedForward(nn.Module):
|
||||
|
||||
def __init__(self, n_units, d_units, dropout):
|
||||
super(PositionwiseFeedForward, self).__init__()
|
||||
self.linear1 = nn.Linear(n_units, d_units)
|
||||
self.linear2 = nn.Linear(d_units, n_units)
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear2(self.dropout(F.relu(self.linear1(x))))
|
||||
|
||||
|
||||
class PosEncoding(nn.Module):
|
||||
|
||||
def __init__(self, max_seq_len, d_word_vec):
|
||||
super(PosEncoding, self).__init__()
|
||||
pos_enc = np.array([[
|
||||
pos / np.power(10000, 2.0 * (j // 2) / d_word_vec)
|
||||
for j in range(d_word_vec)
|
||||
] for pos in range(max_seq_len)])
|
||||
pos_enc[:, 0::2] = np.sin(pos_enc[:, 0::2])
|
||||
pos_enc[:, 1::2] = np.cos(pos_enc[:, 1::2])
|
||||
pad_row = np.zeros([1, d_word_vec])
|
||||
pos_enc = np.concatenate([pad_row, pos_enc]).astype(np.float32)
|
||||
|
||||
self.pos_enc = torch.nn.Embedding(max_seq_len + 1, d_word_vec)
|
||||
self.pos_enc.weight = torch.nn.Parameter(
|
||||
torch.from_numpy(pos_enc), requires_grad=False)
|
||||
|
||||
def forward(self, input_len):
|
||||
max_len = torch.max(input_len)
|
||||
input_pos = torch.LongTensor([
|
||||
list(range(1, len + 1)) + [0] * (max_len - len)
|
||||
for len in input_len
|
||||
])
|
||||
|
||||
return self.pos_enc(input_pos)
|
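A quick sketch of the positional table lookup; index 0 doubles as the padding position, so shorter sequences map to zero vectors beyond their length:

import torch

pos_enc = PosEncoding(max_seq_len=1000, d_word_vec=256)
emb = pos_enc(torch.tensor([5, 3]))   # -> (2, 5, 256); positions past length 3 in the
                                      #    second item hit the zero pad embedding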
||||
|
||||
|
||||
class TransformerEncoder(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
idim,
|
||||
n_units=256,
|
||||
n_layers=2,
|
||||
e_units=512,
|
||||
h=4,
|
||||
dropout=0.1):
|
||||
super(TransformerEncoder, self).__init__()
|
||||
self.linear_in = nn.Linear(idim, n_units)
|
||||
self.lnorm_in = nn.LayerNorm(n_units)
|
||||
|
||||
self.n_layers = n_layers
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
for i in range(n_layers):
|
||||
setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('self_att_', i),
|
||||
MultiHeadSelfAttention(n_units, h))
|
||||
setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('ff_', i),
|
||||
PositionwiseFeedForward(n_units, e_units, dropout))
|
||||
self.lnorm_out = nn.LayerNorm(n_units)
|
||||
|
||||
def forward(self, x):
|
||||
# x: [B, num_anchors, T, n_in]
|
||||
bs, num, tframe, dim = x.size()
|
||||
x = x.reshape(bs * num, tframe, -1) # [B*num_anchors, T, dim]
|
||||
# x: (B, T, F) ... batch, time, (mel)freq
|
||||
B_size, T_size, _ = x.shape
|
||||
# e: (BT, F)
|
||||
e = self.linear_in(x.reshape(B_size * T_size, -1))
|
||||
# Encoder stack
|
||||
for i in range(self.n_layers):
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
|
||||
# self-attention
|
||||
s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
|
||||
# positionwise feed-forward
|
||||
s = getattr(self, '{}{:d}'.format('ff_', i))(e)
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# final layer normalization
|
||||
# output: (BT, F)
|
||||
# output: (B, F, T)
|
||||
output = self.lnorm_out(e).reshape(B_size, T_size, -1)
|
||||
output = output.reshape(bs, num, tframe,
|
||||
-1) # [B, num_anchors, T, dim]
|
||||
return output
|
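Shape sketch for the anchor-conditioned encoder; idim matches the frame_dim + anchor_dim concatenation used by TransformerDetector below, and all sizes here are illustrative:

import torch

enc = TransformerEncoder(idim=512 + 192, n_units=256)
x = torch.randn(2, 2, 100, 512 + 192)   # [B, num_anchors, T, frame_dim + anchor_dim]
y = enc(x)                              # -> [2, 2, 100, 256]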
||||
|
||||
|
||||
class TransformerEncoder_out(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
idim,
|
||||
n_units=256,
|
||||
n_layers=2,
|
||||
e_units=512,
|
||||
h=4,
|
||||
dropout=0.1):
|
||||
super(TransformerEncoder_out, self).__init__()
|
||||
self.linear_in = nn.Linear(idim, n_units)
|
||||
self.lnorm_in = nn.LayerNorm(n_units)
|
||||
|
||||
self.n_layers = n_layers
|
||||
self.dropout = nn.Dropout(p=dropout)
|
||||
for i in range(n_layers):
|
||||
setattr(self, '{}{:d}'.format('lnorm1_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('self_att_', i),
|
||||
MultiHeadSelfAttention(n_units, h))
|
||||
setattr(self, '{}{:d}'.format('lnorm2_', i), nn.LayerNorm(n_units))
|
||||
setattr(self, '{}{:d}'.format('ff_', i),
|
||||
PositionwiseFeedForward(n_units, e_units, dropout))
|
||||
self.lnorm_out = nn.LayerNorm(n_units)
|
||||
|
||||
def forward(self, x):
|
||||
# x: (B, T, F)
|
||||
B_size, T_size, _ = x.shape
|
||||
# e: (BT, F)
|
||||
e = self.linear_in(x.reshape(B_size * T_size, -1))
|
||||
# Encoder stack
|
||||
for i in range(self.n_layers):
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm1_', i))(e)
|
||||
# self-attention
|
||||
s = getattr(self, '{}{:d}'.format('self_att_', i))(e, x.shape[0])
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# layer normalization
|
||||
e = getattr(self, '{}{:d}'.format('lnorm2_', i))(e)
|
||||
# positionwise feed-forward
|
||||
s = getattr(self, '{}{:d}'.format('ff_', i))(e)
|
||||
# residual
|
||||
e = e + self.dropout(s)
|
||||
# final layer normalization
|
||||
# output: (BT, F)
|
||||
# output: (B, T, F)
|
||||
output = self.lnorm_out(e).reshape(B_size, T_size, -1)
|
||||
return output
|
||||
|
||||
|
||||
class OutLayer(nn.Module):
|
||||
|
||||
def __init__(self, n_units=256, num_anchors=2):
|
||||
super(OutLayer, self).__init__()
|
||||
self.combine = TransformerEncoder_out(num_anchors * n_units, n_units)
|
||||
self.out_linear = nn.Linear(n_units // num_anchors, 1)
|
||||
|
||||
def forward(self, input):
|
||||
# input: [B, num_anchors, T, dim]
|
||||
bs, num, tframe, dim = input.size()
|
||||
output = input.permute(0, 2, 1,
|
||||
3).reshape(bs, tframe,
|
||||
-1) # [Bs, t, num_anchors*dim]
|
||||
output = self.combine(output) # [Bs, t, n_units]
|
||||
output = output.reshape(
|
||||
bs, tframe, num, -1) # [Bs, t, num_anchors, n_units//num_anchors]
|
||||
output = self.out_linear(output).squeeze(-1) # [Bs, t, num_anchors]
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class TransformerDetector(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
frame_dim=512,
|
||||
anchor_dim=192,
|
||||
hidden_dim=256,
|
||||
max_seq_len=1000):
|
||||
super(TransformerDetector, self).__init__()
|
||||
self.detection = TransformerEncoder(
|
||||
idim=frame_dim + anchor_dim, n_units=hidden_dim)
|
||||
self.output = OutLayer(n_units=hidden_dim)
|
||||
self.pos_enc = PosEncoding(max_seq_len, hidden_dim)
|
||||
|
||||
def forward(self, feats, anchors):
|
||||
# feats: [1, t, fdim]
|
||||
num_frames = feats.shape[1]
|
||||
num_anchors = anchors.shape[1]
|
||||
bs = feats.shape[0]
|
||||
feats = feats.unsqueeze(1).repeat(
|
||||
1, num_anchors, 1, 1) # shape: [Bs, num_anchors, t, fdim]
|
||||
anchors = anchors.unsqueeze(2).repeat(
|
||||
1, 1, num_frames, 1) # shape: [Bs, num_anchors, t, xdim]
|
||||
sd_in = torch.cat((feats, anchors),
|
||||
dim=-1) # shape: [Bs, num_anchors, t, fdim+xdim]
|
||||
sd_out = self.detection(sd_in) # shape: [Bs, num_anchors, t, sd_dim]
|
||||
|
||||
# pos
|
||||
pos_emb = self.pos_enc(torch.tensor([num_frames] * (bs * num_anchors)))
|
||||
pos_emb = pos_emb.reshape(bs, num_anchors, num_frames, -1)
|
||||
sd_out += pos_emb
|
||||
|
||||
# output
|
||||
output = self.output(sd_out) # shape: [Bs, t, num_anchors]
|
||||
|
||||
return output
|
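End-to-end shape sketch for the detector backend with random tensors; in the model class below the frame features come from the CAMPPlus encoder instead:

import torch

detector = TransformerDetector(frame_dim=512, anchor_dim=192)
feats = torch.randn(1, 100, 512)     # [Bs, t, frame_dim] frame-level features
anchors = torch.randn(1, 2, 192)     # [Bs, num_anchors, anchor_dim] speaker anchors
scores = detector(feats, anchors)    # -> [1, 100, 2] per-frame, per-anchor logits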
||||
|
||||
|
||||
@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.scl_sd)
|
||||
class SpeakerChangeLocatorTransformer(TorchModel):
|
||||
r"""A speaekr change locator using the transformer architecture as the backbone.
|
||||
Args:
|
||||
model_dir: A model dir.
|
||||
model_config: The model config.
|
||||
"""
|
||||
|
||||
def __init__(self, model_dir, model_config: Dict[str, Any], *args,
|
||||
**kwargs):
|
||||
super().__init__(model_dir, model_config, *args, **kwargs)
|
||||
self.model_config = model_config
|
||||
|
||||
self.feature_dim = self.model_config['fbank_dim']
|
||||
frame_size = self.model_config['frame_size']
|
||||
anchor_size = self.model_config['anchor_size']
|
||||
|
||||
self.encoder = CAMPPlus(self.feature_dim, output_level='frame')
|
||||
self.backend = TransformerDetector(
|
||||
frame_dim=frame_size, anchor_dim=anchor_size)
|
||||
|
||||
pretrained_encoder = kwargs['pretrained_encoder']
|
||||
pretrained_backend = kwargs['pretrained_backend']
|
||||
|
||||
self.__load_check_point(pretrained_encoder, pretrained_backend)
|
||||
|
||||
self.encoder.eval()
|
||||
self.backend.eval()
|
||||
|
||||
def forward(self, audio, anchors):
|
||||
assert len(audio.shape) == 2 and audio.shape[
|
||||
0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]'
|
||||
assert len(
|
||||
anchors.shape
|
||||
) == 3 and anchors.shape[0] == 1 and anchors.shape[
|
||||
1] == 2, 'modelscope error: the shape of input anchors to model needs to be [1, 2, D]'
|
||||
# audio shape: [1, T]
|
||||
feature = self.__extract_feature(audio)
|
||||
frame_state = self.encoder(feature)
|
||||
output = self.backend(frame_state, anchors)
|
||||
output = output.squeeze(0).detach().cpu().sigmoid()
|
||||
|
||||
time_scale_factor = int(np.ceil(feature.shape[1] / output.shape[0]))
|
||||
output = output.unsqueeze(1).expand(-1, time_scale_factor,
|
||||
-1).reshape(-1, output.shape[-1])
|
||||
return output
|
||||
|
||||
def __extract_feature(self, audio):
|
||||
feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim)
|
||||
feature = feature - feature.mean(dim=0, keepdim=True)
|
||||
feature = feature.unsqueeze(0)
|
||||
return feature
|
||||
|
||||
def __load_check_point(self,
|
||||
pretrained_encoder,
|
||||
pretrained_backend,
|
||||
device=None):
|
||||
if not device:
|
||||
device = torch.device('cpu')
|
||||
self.encoder.load_state_dict(
|
||||
torch.load(
|
||||
os.path.join(self.model_dir, pretrained_encoder),
|
||||
map_location=device))
|
||||
|
||||
self.backend.load_state_dict(
|
||||
torch.load(
|
||||
os.path.join(self.model_dir, pretrained_backend),
|
||||
map_location=device))
|
||||
@@ -17,11 +17,9 @@ from kantts.train.trainer import GAN_Trainer, Sambert_Trainer, distributed_init
|
||||
from kantts.utils.ling_unit.ling_unit import KanTtsLinguisticUnit
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from modelscope import __version__
|
||||
from modelscope.utils.audio.audio_utils import TtsCustomParams
|
||||
from modelscope.utils.audio.tts_exceptions import (
|
||||
TtsModelConfigurationException, TtsModelNotExistsException)
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
@@ -394,6 +392,7 @@ class Voice:
|
||||
logger.info(f'TRAINING steps: {train_max_steps}')
|
||||
config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
|
||||
time.localtime())
|
||||
from modelscope import __version__
|
||||
config['modelscope_version'] = __version__
|
||||
|
||||
with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
|
||||
@@ -558,6 +557,7 @@ class Voice:
|
||||
logger.info(f'resume from: {resume_from}')
|
||||
config['create_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
|
||||
time.localtime())
|
||||
from modelscope import __version__
|
||||
config['modelscope_version'] = __version__
|
||||
|
||||
with open(os.path.join(stage_dir, 'config.yaml'), 'w') as f:
|
||||
|
||||
@@ -4,9 +4,8 @@
|
||||
from . import (action_recognition, animal_recognition, bad_image_detecting,
|
||||
body_2d_keypoints, body_3d_keypoints, cartoon,
|
||||
cmdssl_video_embedding, controllable_image_generation,
|
||||
crowd_counting, face_2d_keypoints, face_detection,
|
||||
face_generation, face_reconstruction, human_reconstruction,
|
||||
human_wholebody_keypoint, image_classification,
|
||||
crowd_counting, face_detection, face_generation,
|
||||
face_reconstruction, human_reconstruction, image_classification,
|
||||
image_color_enhance, image_colorization, image_defrcn_fewshot,
|
||||
image_denoise, image_inpainting, image_instance_segmentation,
|
||||
image_matching, image_mvs_depth_estimation,
|
||||
|
||||
@@ -72,7 +72,7 @@ class PoseHighResolutionNetV2(TorchModel):
|
||||
self.stage4, pre_stage_channels = self._make_stage(
|
||||
self.stage4_cfg, num_channels, multi_scale_output=True)
|
||||
"""final four layers"""
|
||||
last_inp_channels = np.int(np.sum(pre_stage_channels))
|
||||
last_inp_channels = int(np.sum(pre_stage_channels))
|
||||
self.final_layer = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_channels=last_inp_channels,
|
||||
|
||||
@@ -81,7 +81,7 @@ class FaceLandmark:
|
||||
bbox[2] = center[0] + one_edge // 2
|
||||
bbox[3] = center[1] + one_edge // 2
|
||||
|
||||
bbox = bbox.astype(np.int)
|
||||
bbox = bbox.astype(int)
|
||||
crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
|
||||
h, w, _ = crop_image.shape
|
||||
crop_image = cv2.resize(
|
||||
|
||||
@@ -356,7 +356,7 @@ class HighResolutionNet(nn.Module):
|
||||
num_channels)
|
||||
self.stage3, pre_stage_channels = self._make_stage(
|
||||
self.stage3_cfg, num_channels)
|
||||
last_inp_channels = np.int(np.sum(pre_stage_channels)) + 256
|
||||
last_inp_channels = int(np.sum(pre_stage_channels)) + 256
|
||||
self.redc_layer = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_channels=last_inp_channels,
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.base import BaseModel
|
||||
from easycv.utils.ms_utils import EasyCVMeta
|
||||
|
||||
from modelscope.models.base import TorchModel
|
||||
|
||||
|
||||
class EasyCVBaseModel(BaseModel, TorchModel):
|
||||
"""Base model for EasyCV."""
|
||||
|
||||
def __init__(self, model_dir=None, args=(), kwargs={}):
|
||||
kwargs.pop(EasyCVMeta.ARCH, None) # pop useless keys
|
||||
BaseModel.__init__(self)
|
||||
TorchModel.__init__(self, model_dir=model_dir)
|
||||
|
||||
def forward(self, img, mode='train', **kwargs):
|
||||
if self.training:
|
||||
losses = self.forward_train(img, **kwargs)
|
||||
loss, log_vars = self._parse_losses(losses)
|
||||
return dict(loss=loss, log_vars=log_vars)
|
||||
else:
|
||||
return self.forward_test(img, **kwargs)
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.forward(*args, **kwargs)
|
||||
@@ -1,20 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .face_2d_keypoints_align import Face2DKeypoints
|
||||
|
||||
else:
|
||||
_import_structure = {'face_2d_keypoints_align': ['Face2DKeypoints']}
|
||||
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyImportModule(
|
||||
__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
extra_objects={},
|
||||
)
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.face.face_keypoint import FaceKeypoint
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.face_2d_keypoints, module_name=Models.face_2d_keypoints)
|
||||
class Face2DKeypoints(EasyCVBaseModel, FaceKeypoint):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
FaceKeypoint.__init__(self, *args, **kwargs)
|
||||
@@ -82,7 +82,7 @@ class FaceLandmark:
|
||||
bbox[2] = center[0] + one_edge // 2
|
||||
bbox[3] = center[1] + one_edge // 2
|
||||
|
||||
bbox = bbox.astype(np.int)
|
||||
bbox = bbox.astype(int)
|
||||
crop_image = bimg[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
|
||||
h, w, _ = crop_image.shape
|
||||
crop_image = cv2.resize(crop_image,
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .hand_2d_keypoints import Hand2dKeyPoints
|
||||
|
||||
else:
|
||||
_import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}
|
||||
|
||||
import sys
|
||||
|
||||
sys.modules[__name__] = LazyImportModule(
|
||||
__name__,
|
||||
globals()['__file__'],
|
||||
_import_structure,
|
||||
module_spec=__spec__,
|
||||
extra_objects={},
|
||||
)
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.pose import TopDown
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
|
||||
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
TopDown.__init__(self, *args, **kwargs)
|
||||
@@ -31,7 +31,7 @@ class human_segmenter(object):
|
||||
img = np.dstack((img, img, img))
|
||||
elif img.shape[2] == 4:
|
||||
img = img[:, :, :3]
|
||||
img = img.astype(np.float)
|
||||
img = img.astype(float)
|
||||
return img
|
||||
|
||||
def run(self, img):
|
||||
|
||||
@@ -69,8 +69,8 @@ def eval_grid(coords,
|
||||
num_samples=512 * 512 * 512):
|
||||
resolution = coords.shape[1:4]
|
||||
sdf = np.zeros(resolution)
|
||||
dirty = np.ones(resolution, dtype=np.bool)
|
||||
grid_mask = np.zeros(resolution, dtype=np.bool)
|
||||
dirty = np.ones(resolution, dtype=bool)
|
||||
grid_mask = np.zeros(resolution, dtype=bool)
|
||||
reso = resolution[0] // init_resolution
|
||||
|
||||
while reso > 0:
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.pose.top_down import TopDown
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.human_wholebody_keypoint,
|
||||
module_name=Models.human_wholebody_keypoint)
|
||||
class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
TopDown.__init__(self, *args, **kwargs)
|
||||
@@ -163,7 +163,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
|
||||
os.path.join(split_dir,
|
||||
'box_{}shot_{}_train.txt'.format(shot,
|
||||
cls))) as f:
|
||||
fileids_ = np.loadtxt(f, dtype=np.str).tolist()
|
||||
fileids_ = np.loadtxt(f, dtype=np.str_).tolist()
|
||||
if isinstance(fileids_, str):
|
||||
fileids_ = [fileids_]
|
||||
fileids_ = [
|
||||
@@ -219,7 +219,7 @@ def load_filtered_voc_instances(name: str, root: str, dirname: str, split: str,
|
||||
with PathManager.open(
|
||||
os.path.join(root, dirname, 'ImageSets', 'Main',
|
||||
split + '.txt')) as f:
|
||||
fileids = np.loadtxt(f, dtype=np.str)
|
||||
fileids = np.loadtxt(f, dtype=np.str_)
|
||||
|
||||
for fileid in fileids:
|
||||
anno_file = os.path.join(root, dirname, 'Annotations',
|
||||
|
||||
@@ -8,10 +8,12 @@ if TYPE_CHECKING:
|
||||
from .maskdino_swin import MaskDINOSwin
|
||||
from .model import CascadeMaskRCNNSwinModel
|
||||
from .maskdino_model import MaskDINOSwinModel
|
||||
from .fastinst_model import FastInst
|
||||
from .postprocess_utils import get_img_ins_seg_result, get_maskdino_ins_seg_result
|
||||
else:
|
||||
_import_structure = {
|
||||
'cascade_mask_rcnn_swin': ['CascadeMaskRCNNSwin'],
|
||||
'fastinst_model': ['FastInst'],
|
||||
'maskdino_swin': ['MaskDINOSwin'],
|
||||
'model': ['CascadeMaskRCNNSwinModel'],
|
||||
'maskdino_model': ['MaskDINOSwinModel'],
|
||||
|
||||
@@ -6,10 +6,12 @@ from modelscope.utils.import_utils import LazyImportModule
|
||||
if TYPE_CHECKING:
|
||||
from .swin_transformer import SwinTransformer
|
||||
from .swin_transformer import D2SwinTransformer
|
||||
from .resnet import build_resnet_backbone
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
'swin_transformer': ['SwinTransformer', 'D2SwinTransformer'],
|
||||
'resnet': ['build_resnet_backbone']
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
# Part of the implementation is borrowed and modified from Detectron2, publicly available at
|
||||
# https://github.com/facebookresearch/detectron2/blob/main/projects/DeepLab/deeplab/resnet.py
|
||||
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
from modelscope.models.cv.image_human_parsing.backbone.deeplab_resnet import (
|
||||
BottleneckBlock, DeeplabResNet, get_norm)
|
||||
from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
|
||||
Conv2d
|
||||
|
||||
|
||||
class BasicStem(nn.Module):
|
||||
"""
|
||||
The standard ResNet stem (layers before the first residual block),
|
||||
with a conv, relu and max_pool.
|
||||
"""
|
||||
|
||||
def __init__(self, in_channels=3, out_channels=64, norm='BN'):
|
||||
"""
|
||||
Args:
|
||||
norm (str or callable): norm after the first conv layer.
|
||||
See :func:`layers.get_norm` for supported format.
|
||||
"""
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.stride = 4
|
||||
self.conv1 = Conv2d(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=7,
|
||||
stride=2,
|
||||
padding=3,
|
||||
bias=False,
|
||||
norm=get_norm(norm, out_channels),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = F.relu_(x)
|
||||
x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
|
||||
return x
|
||||
|
||||
|
||||
def build_resnet_backbone(out_features, depth, num_groups, width_per_group,
|
||||
norm, stem_out_channels, res2_out_channels,
|
||||
stride_in_1x1, res4_dilation, res5_dilation,
|
||||
res5_multi_grid, input_shape):
|
||||
stem = BasicStem(
|
||||
in_channels=input_shape['channels'],
|
||||
out_channels=stem_out_channels,
|
||||
norm=norm)
|
||||
bottleneck_channels = num_groups * width_per_group
|
||||
in_channels = stem_out_channels
|
||||
out_channels = res2_out_channels
|
||||
|
||||
assert res4_dilation in {
|
||||
1, 2
|
||||
}, 'res4_dilation cannot be {}.'.format(res4_dilation)
|
||||
assert res5_dilation in {
|
||||
1, 2, 4
|
||||
}, 'res5_dilation cannot be {}.'.format(res5_dilation)
|
||||
if res4_dilation == 2:
|
||||
# Always dilate res5 if res4 is dilated.
|
||||
assert res5_dilation == 4
|
||||
|
||||
num_blocks_per_stage = {
|
||||
50: [3, 4, 6, 3],
|
||||
101: [3, 4, 23, 3],
|
||||
152: [3, 8, 36, 3]
|
||||
}[depth]
|
||||
|
||||
stages = []
|
||||
out_stage_idx = [{
|
||||
'res2': 2,
|
||||
'res3': 3,
|
||||
'res4': 4,
|
||||
'res5': 5
|
||||
}[f] for f in out_features]
|
||||
max_stage_idx = max(out_stage_idx)
|
||||
for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
|
||||
if stage_idx == 4:
|
||||
dilation = res4_dilation
|
||||
elif stage_idx == 5:
|
||||
dilation = res5_dilation
|
||||
else:
|
||||
dilation = 1
|
||||
first_stride = 1 if idx == 0 or dilation > 1 else 2
|
||||
stride_per_block = [first_stride]
|
||||
stride_per_block += [1] * (num_blocks_per_stage[idx] - 1)
|
||||
stage_kargs = {
|
||||
'num_blocks': num_blocks_per_stage[idx],
|
||||
'stride_per_block': stride_per_block,
|
||||
'in_channels': in_channels,
|
||||
'out_channels': out_channels,
|
||||
'norm': norm,
|
||||
'bottleneck_channels': bottleneck_channels,
|
||||
'stride_in_1x1': stride_in_1x1,
|
||||
'dilation': dilation,
|
||||
'num_groups': num_groups,
|
||||
'block_class': BottleneckBlock
|
||||
}
|
||||
if stage_idx == 5:
|
||||
stage_kargs.pop('dilation')
|
||||
stage_kargs['dilation_per_block'] = [
|
||||
dilation * mg for mg in res5_multi_grid
|
||||
]
|
||||
blocks = DeeplabResNet.make_stage(**stage_kargs)
|
||||
in_channels = out_channels
|
||||
out_channels *= 2
|
||||
bottleneck_channels *= 2
|
||||
stages.append(blocks)
|
||||
return DeeplabResNet(stem, stages, out_features=out_features)
|
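Illustrative call only: every argument value below is made up for the sketch; the real values come from the FastInst model configuration, not from this file:

backbone = build_resnet_backbone(
    out_features=['res3', 'res4', 'res5'],   # hypothetical choice of output stages
    depth=50,
    num_groups=1,
    width_per_group=64,
    norm='BN',
    stem_out_channels=64,
    res2_out_channels=256,
    stride_in_1x1=False,
    res4_dilation=1,
    res5_dilation=1,
    res5_multi_grid=[1, 2, 4],
    input_shape={'channels': 3})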
||||
@@ -0,0 +1 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
@@ -0,0 +1,351 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import math
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from modelscope.models.cv.image_colorization.ddcolor.utils.transformer_utils import (
|
||||
MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer)
|
||||
|
||||
|
||||
class QueryProposal(nn.Module):
|
||||
|
||||
def __init__(self, num_features, num_queries, num_classes):
|
||||
super().__init__()
|
||||
self.topk = num_queries
|
||||
self.num_classes = num_classes
|
||||
|
||||
self.conv_proposal_cls_logits = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
num_features, num_features, kernel_size=3, stride=1,
|
||||
padding=1),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Conv2d(
|
||||
num_features,
|
||||
num_classes + 1,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=0),
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def compute_coordinates(self, x):
|
||||
h, w = x.size(2), x.size(3)
|
||||
y_loc = torch.linspace(0, 1, h, device=x.device)
|
||||
x_loc = torch.linspace(0, 1, w, device=x.device)
|
||||
y_loc, x_loc = torch.meshgrid(y_loc, x_loc)
|
||||
locations = torch.stack([x_loc, y_loc], 0).unsqueeze(0)
|
||||
return locations
|
||||
|
||||
def seek_local_maximum(self, x, epsilon=1e-6):
|
||||
"""
|
||||
inputs:
|
||||
x: torch.tensor, shape [b, c, h, w]
|
||||
return:
|
||||
torch.tensor, shape [b, c, h, w]
|
||||
"""
|
||||
x_pad = F.pad(x, (1, 1, 1, 1), 'constant', 0)
|
||||
# top, bottom, left, right, top-left, top-right, bottom-left, bottom-right
|
||||
maximum = (x >= x_pad[:, :, :-2, 1:-1]) & \
|
||||
(x >= x_pad[:, :, 2:, 1:-1]) & \
|
||||
(x >= x_pad[:, :, 1:-1, :-2]) & \
|
||||
(x >= x_pad[:, :, 1:-1, 2:]) & \
|
||||
(x >= x_pad[:, :, :-2, :-2]) & \
|
||||
(x >= x_pad[:, :, :-2, 2:]) & \
|
||||
(x >= x_pad[:, :, 2:, :-2]) & \
|
||||
(x >= x_pad[:, :, 2:, 2:]) & \
|
||||
(x >= epsilon)
|
||||
return maximum.to(x)
|
||||
|
||||
def forward(self, x, pos_embeddings):
|
||||
|
||||
proposal_cls_logits = self.conv_proposal_cls_logits(x) # b, c, h, w
|
||||
proposal_cls_probs = proposal_cls_logits.softmax(dim=1) # b, c, h, w
|
||||
proposal_cls_one_hot = F.one_hot(
|
||||
proposal_cls_probs[:, :-1, :, :].max(1)[1],
|
||||
num_classes=self.num_classes + 1).permute(0, 3, 1, 2) # b, c, h, w
|
||||
proposal_cls_probs = proposal_cls_probs.mul(proposal_cls_one_hot)
|
||||
proposal_local_maximum_map = self.seek_local_maximum(
|
||||
proposal_cls_probs) # b, c, h, w
|
||||
proposal_cls_probs = proposal_cls_probs + proposal_local_maximum_map # b, c, h, w
|
||||
|
||||
# top-k indices
|
||||
topk_indices = torch.topk(
|
||||
proposal_cls_probs[:, :-1, :, :].flatten(2).max(1)[0],
|
||||
self.topk,
|
||||
dim=1)[1] # b, q
|
||||
topk_indices = topk_indices.unsqueeze(1) # b, 1, q
|
||||
|
||||
# topk queries
|
||||
topk_proposals = torch.gather(
|
||||
x.flatten(2), dim=2, index=topk_indices.repeat(1, x.shape[1],
|
||||
1)) # b, c, q
|
||||
pos_embeddings = pos_embeddings.repeat(x.shape[0], 1, 1, 1).flatten(2)
|
||||
topk_pos_embeddings = torch.gather(
|
||||
pos_embeddings,
|
||||
dim=2,
|
||||
index=topk_indices.repeat(1, pos_embeddings.shape[1],
|
||||
1)) # b, c, q
|
||||
if self.training:
|
||||
locations = self.compute_coordinates(x).repeat(x.shape[0], 1, 1, 1)
|
||||
topk_locations = torch.gather(
|
||||
locations.flatten(2),
|
||||
dim=2,
|
||||
index=topk_indices.repeat(1, locations.shape[1], 1))
|
||||
topk_locations = topk_locations.transpose(-1, -2) # b, q, 2
|
||||
else:
|
||||
topk_locations = None
|
||||
return topk_proposals, topk_pos_embeddings, topk_locations, proposal_cls_logits
|
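Toy illustration of seek_local_maximum; the constructor sizes are arbitrary and only the local-maximum mask is exercised:

import torch

qp = QueryProposal(num_features=8, num_queries=4, num_classes=3)
prob = torch.zeros(1, 1, 5, 5)
prob[0, 0, 2, 2] = 0.9
mask = qp.seek_local_maximum(prob)   # 1.0 only where a value is >= its 8 neighbours and >= epsilon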
||||
|
||||
|
||||
class FastInstDecoder(nn.Module):
|
||||
|
||||
def __init__(self, in_channels, *, num_classes: int, hidden_dim: int,
|
||||
num_queries: int, num_aux_queries: int, nheads: int,
|
||||
dim_feedforward: int, dec_layers: int, pre_norm: bool,
|
||||
mask_dim: int):
|
||||
"""
|
||||
Args:
|
||||
in_channels: channels of the input features
|
||||
num_classes: number of classes
|
||||
hidden_dim: Transformer feature dimension
|
||||
num_queries: number of queries
|
||||
num_aux_queries: number of auxiliary queries
|
||||
nheads: number of heads
|
||||
dim_feedforward: feature dimension in feedforward network
|
||||
dec_layers: number of Transformer decoder layers
|
||||
pre_norm: whether to use pre-LayerNorm or not
|
||||
mask_dim: mask feature dimension
|
||||
"""
|
||||
super().__init__()
|
||||
self.num_heads = nheads
|
||||
self.num_layers = dec_layers
|
||||
self.num_queries = num_queries
|
||||
self.num_aux_queries = num_aux_queries
|
||||
self.num_classes = num_classes
|
||||
|
||||
meta_pos_size = int(round(math.sqrt(self.num_queries)))
|
||||
self.meta_pos_embed = nn.Parameter(
|
||||
torch.empty(1, hidden_dim, meta_pos_size, meta_pos_size))
|
||||
if num_aux_queries > 0:
|
||||
self.empty_query_features = nn.Embedding(num_aux_queries,
|
||||
hidden_dim)
|
||||
self.empty_query_pos_embed = nn.Embedding(num_aux_queries,
|
||||
hidden_dim)
|
||||
|
||||
self.query_proposal = QueryProposal(hidden_dim, num_queries,
|
||||
num_classes)
|
||||
|
||||
self.transformer_query_cross_attention_layers = nn.ModuleList()
|
||||
self.transformer_query_self_attention_layers = nn.ModuleList()
|
||||
self.transformer_query_ffn_layers = nn.ModuleList()
|
||||
self.transformer_mask_cross_attention_layers = nn.ModuleList()
|
||||
self.transformer_mask_ffn_layers = nn.ModuleList()
|
||||
for idx in range(self.num_layers):
|
||||
self.transformer_query_cross_attention_layers.append(
|
||||
CrossAttentionLayer(
|
||||
d_model=hidden_dim,
|
||||
nhead=nheads,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_query_self_attention_layers.append(
|
||||
SelfAttentionLayer(
|
||||
d_model=hidden_dim,
|
||||
nhead=nheads,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_query_ffn_layers.append(
|
||||
FFNLayer(
|
||||
d_model=hidden_dim,
|
||||
dim_feedforward=dim_feedforward,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_mask_cross_attention_layers.append(
|
||||
CrossAttentionLayer(
|
||||
d_model=hidden_dim,
|
||||
nhead=nheads,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
self.transformer_mask_ffn_layers.append(
|
||||
FFNLayer(
|
||||
d_model=hidden_dim,
|
||||
dim_feedforward=dim_feedforward,
|
||||
dropout=0.0,
|
||||
normalize_before=pre_norm))
|
||||
|
||||
self.decoder_query_norm_layers = nn.ModuleList()
|
||||
self.class_embed_layers = nn.ModuleList()
|
||||
self.mask_embed_layers = nn.ModuleList()
|
||||
self.mask_features_layers = nn.ModuleList()
|
||||
for idx in range(self.num_layers + 1):
|
||||
self.decoder_query_norm_layers.append(nn.LayerNorm(hidden_dim))
|
||||
self.class_embed_layers.append(
|
||||
MLP(hidden_dim, hidden_dim, num_classes + 1, 3))
|
||||
self.mask_embed_layers.append(
|
||||
MLP(hidden_dim, hidden_dim, mask_dim, 3))
|
||||
self.mask_features_layers.append(nn.Linear(hidden_dim, mask_dim))
|
||||
|
||||
def forward(self, x, mask_features, targets=None):
|
||||
bs = x[0].shape[0]
|
||||
proposal_size = x[1].shape[-2:]
|
||||
pixel_feature_size = x[2].shape[-2:]
|
||||
|
||||
pixel_pos_embeds = F.interpolate(
|
||||
self.meta_pos_embed,
|
||||
size=pixel_feature_size,
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
proposal_pos_embeds = F.interpolate(
|
||||
self.meta_pos_embed,
|
||||
size=proposal_size,
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
|
||||
pixel_features = x[2].flatten(2).permute(2, 0, 1)
|
||||
pixel_pos_embeds = pixel_pos_embeds.flatten(2).permute(2, 0, 1)
|
||||
|
||||
query_features, query_pos_embeds, query_locations, proposal_cls_logits = self.query_proposal(
|
||||
x[1], proposal_pos_embeds)
|
||||
query_features = query_features.permute(2, 0, 1)
|
||||
query_pos_embeds = query_pos_embeds.permute(2, 0, 1)
|
||||
if self.num_aux_queries > 0:
|
||||
aux_query_features = self.empty_query_features.weight.unsqueeze(
|
||||
1).repeat(1, bs, 1)
|
||||
aux_query_pos_embed = self.empty_query_pos_embed.weight.unsqueeze(
|
||||
1).repeat(1, bs, 1)
|
||||
query_features = torch.cat([query_features, aux_query_features],
|
||||
dim=0)
|
||||
query_pos_embeds = torch.cat(
|
||||
[query_pos_embeds, aux_query_pos_embed], dim=0)
|
||||
|
||||
outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
-1,
|
||||
return_attn_mask=True)
|
||||
predictions_class = [outputs_class]
|
||||
predictions_mask = [outputs_mask]
|
||||
predictions_matching_index = [None]
|
||||
query_feature_memory = [query_features]
|
||||
pixel_feature_memory = [pixel_features]
|
||||
|
||||
for i in range(self.num_layers):
|
||||
query_features, pixel_features = self.forward_one_layer(
|
||||
query_features, pixel_features, query_pos_embeds,
|
||||
pixel_pos_embeds, attn_mask, i)
|
||||
if i < self.num_layers - 1:
|
||||
outputs_class, outputs_mask, attn_mask, _, _ = self.forward_prediction_heads(
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
i,
|
||||
return_attn_mask=True,
|
||||
)
|
||||
else:
|
||||
outputs_class, outputs_mask, _, matching_indices, gt_attn_mask = self.forward_prediction_heads(
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
i,
|
||||
)
|
||||
predictions_class.append(outputs_class)
|
||||
predictions_mask.append(outputs_mask)
|
||||
predictions_matching_index.append(None)
|
||||
query_feature_memory.append(query_features)
|
||||
pixel_feature_memory.append(pixel_features)
|
||||
|
||||
out = {
|
||||
'proposal_cls_logits':
|
||||
proposal_cls_logits,
|
||||
'query_locations':
|
||||
query_locations,
|
||||
'pred_logits':
|
||||
predictions_class[-1],
|
||||
'pred_masks':
|
||||
predictions_mask[-1],
|
||||
'pred_indices':
|
||||
predictions_matching_index[-1],
|
||||
'aux_outputs':
|
||||
self._set_aux_loss(predictions_class, predictions_mask,
|
||||
predictions_matching_index, query_locations)
|
||||
}
|
||||
return out
|
||||
|
||||
def forward_one_layer(self, query_features, pixel_features,
|
||||
query_pos_embeds, pixel_pos_embeds, attn_mask, i):
|
||||
pixel_features = self.transformer_mask_cross_attention_layers[i](
|
||||
pixel_features,
|
||||
query_features,
|
||||
query_pos=pixel_pos_embeds,
|
||||
pos=query_pos_embeds)
|
||||
pixel_features = self.transformer_mask_ffn_layers[i](pixel_features)
|
||||
|
||||
query_features = self.transformer_query_cross_attention_layers[i](
|
||||
query_features,
|
||||
pixel_features,
|
||||
memory_mask=attn_mask,
|
||||
query_pos=query_pos_embeds,
|
||||
pos=pixel_pos_embeds)
|
||||
query_features = self.transformer_query_self_attention_layers[i](
|
||||
query_features, query_pos=query_pos_embeds)
|
||||
query_features = self.transformer_query_ffn_layers[i](query_features)
|
||||
return query_features, pixel_features
|
||||
|
||||
def forward_prediction_heads(self,
|
||||
query_features,
|
||||
pixel_features,
|
||||
pixel_feature_size,
|
||||
idx_layer,
|
||||
return_attn_mask=False,
|
||||
return_gt_attn_mask=False,
|
||||
targets=None,
|
||||
query_locations=None):
|
||||
decoder_query_features = self.decoder_query_norm_layers[idx_layer + 1](
|
||||
query_features[:self.num_queries])
|
||||
decoder_query_features = decoder_query_features.transpose(0, 1)
|
||||
if idx_layer + 1 == self.num_layers:
|
||||
outputs_class = self.class_embed_layers[idx_layer + 1](
|
||||
decoder_query_features)
|
||||
else:
|
||||
outputs_class = None
|
||||
outputs_mask_embed = self.mask_embed_layers[idx_layer + 1](
|
||||
decoder_query_features)
|
||||
outputs_mask_features = self.mask_features_layers[idx_layer + 1](
|
||||
pixel_features.transpose(0, 1))
|
||||
|
||||
outputs_mask = torch.einsum('bqc,blc->bql', outputs_mask_embed,
|
||||
outputs_mask_features)
|
||||
outputs_mask = outputs_mask.reshape(-1, self.num_queries,
|
||||
*pixel_feature_size)
|
||||
|
||||
if return_attn_mask:
|
||||
# outputs_mask.shape: b, q, h, w
|
||||
attn_mask = F.pad(outputs_mask,
|
||||
(0, 0, 0, 0, 0, self.num_aux_queries),
|
||||
'constant', 1)
|
||||
attn_mask = (attn_mask < 0.).flatten(2) # b, q, hw
|
||||
invalid_query = attn_mask.all(-1, keepdim=True) # b, q, 1
|
||||
attn_mask = (~invalid_query) & attn_mask # b, q, hw
|
||||
attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1,
|
||||
1).flatten(0, 1)
|
||||
attn_mask = attn_mask.detach()
|
||||
else:
|
||||
attn_mask = None
|
||||
|
||||
matching_indices = None
|
||||
gt_attn_mask = None
|
||||
|
||||
return outputs_class, outputs_mask, attn_mask, matching_indices, gt_attn_mask
|
||||
|
||||
@torch.jit.unused
|
||||
def _set_aux_loss(self, outputs_class, outputs_seg_masks, output_indices,
|
||||
output_query_locations):
|
||||
return [{
|
||||
'query_locations': output_query_locations,
|
||||
'pred_logits': a,
|
||||
'pred_masks': b,
|
||||
'pred_matching_indices': c
|
||||
} for a, b, c in zip(outputs_class[:-1], outputs_seg_masks[:-1],
|
||||
output_indices[:-1])]
|
||||
@@ -0,0 +1,180 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import logging
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \
|
||||
Conv2d
|
||||
|
||||
|
||||
# This is a modified FPN decoder.
|
||||
class BaseFPN(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_shape,
|
||||
*,
|
||||
convs_dim: int,
|
||||
mask_dim: int,
|
||||
norm: Optional[Union[str, Callable]] = None,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
input_shape: shapes (channels and stride) of the input features
|
||||
convs_dim: number of output channels for the intermediate conv layers.
|
||||
mask_dim: number of output channels for the final conv layer.
|
||||
norm (str or callable): normalization for all conv layers
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride'])
|
||||
self.in_features = [k for k, v in input_shape
|
||||
] # starting from "res3" to "res5"
|
||||
feature_channels = [v['channels'] for k, v in input_shape]
|
||||
|
||||
lateral_convs = []
|
||||
output_convs = []
|
||||
|
||||
use_bias = norm == ''
|
||||
for idx, in_channels in enumerate(feature_channels):
|
||||
lateral_norm = nn.GroupNorm(32, convs_dim)
|
||||
output_norm = nn.GroupNorm(32, convs_dim)
|
||||
|
||||
lateral_conv = Conv2d(
|
||||
in_channels,
|
||||
convs_dim,
|
||||
kernel_size=1,
|
||||
bias=use_bias,
|
||||
norm=lateral_norm)
|
||||
output_conv = Conv2d(
|
||||
convs_dim,
|
||||
convs_dim,
|
||||
kernel_size=3,
|
||||
stride=1,
|
||||
padding=1,
|
||||
bias=use_bias,
|
||||
norm=output_norm,
|
||||
activation=F.relu,
|
||||
)
|
||||
self.add_module('adapter_{}'.format(idx + 1), lateral_conv)
|
||||
self.add_module('layer_{}'.format(idx + 1), output_conv)
|
||||
|
||||
lateral_convs.append(lateral_conv)
|
||||
output_convs.append(output_conv)
|
||||
# Place convs into top-down order (from low to high resolution)
|
||||
# to make the top-down computation in forward clearer.
|
||||
self.lateral_convs = lateral_convs[::-1]
|
||||
self.output_convs = output_convs[::-1]
|
||||
|
||||
self.convs_dim = convs_dim
|
||||
self.num_feature_levels = 3 # always use 3 scales
|
||||
|
||||
def forward_features(self, features):
|
||||
multi_scale_features = []
|
||||
num_cur_levels = 0
|
||||
# Reverse feature maps into top-down order (from low to high resolution)
|
||||
for idx, f in enumerate(self.in_features[::-1]):
|
||||
x = features[f]
|
||||
lateral_conv = self.lateral_convs[idx]
|
||||
output_conv = self.output_convs[idx]
|
||||
if idx == 0:
|
||||
y = lateral_conv(x)
|
||||
else:
|
||||
cur_fpn = lateral_conv(x)
|
||||
y = cur_fpn + F.interpolate(
|
||||
y,
|
||||
size=cur_fpn.shape[-2:],
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
y = output_conv(y)
|
||||
|
||||
if num_cur_levels < self.num_feature_levels:
|
||||
multi_scale_features.append(y)
|
||||
num_cur_levels += 1
|
||||
return None, multi_scale_features
|
||||
|
||||
def forward(self, features, targets=None):
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.warning(
|
||||
'Calling forward() may cause unpredicted behavior of PixelDecoder module.'
|
||||
)
|
||||
return self.forward_features(features)
|
||||
|
||||
|
||||
class PyramidPoolingModule(nn.Module):
|
||||
|
||||
def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)):
|
||||
super().__init__()
|
||||
self.stages = []
|
||||
self.stages = nn.ModuleList(
|
||||
[self._make_stage(in_channels, channels, size) for size in sizes])
|
||||
self.bottleneck = Conv2d(in_channels + len(sizes) * channels,
|
||||
in_channels, 1)
|
||||
|
||||
def _make_stage(self, features, out_features, size):
|
||||
prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
|
||||
conv = Conv2d(features, out_features, 1)
|
||||
return nn.Sequential(prior, conv)
|
||||
|
||||
def forward(self, feats):
|
||||
h, w = feats.size(2), feats.size(3)
|
||||
priors = [
|
||||
F.interpolate(
|
||||
input=F.relu_(stage(feats)),
|
||||
size=(h, w),
|
||||
mode='bilinear',
|
||||
align_corners=False) for stage in self.stages
|
||||
] + [feats]
|
||||
out = F.relu_(self.bottleneck(torch.cat(priors, 1)))
|
||||
return out
|
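Shape sketch for the PPM block, assuming the Conv2d wrapper imported above behaves like a plain nn.Conv2d with optional norm/activation:

import torch

ppm = PyramidPoolingModule(in_channels=256, channels=64)
feats = torch.randn(1, 256, 32, 32)
out = ppm(feats)                     # pooled at 1x1/2x2/3x3/6x6, projected, upsampled and fused back to (1, 256, 32, 32)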
||||
|
||||
|
||||
class PyramidPoolingModuleFPN(BaseFPN):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_shape,
|
||||
*,
|
||||
convs_dim: int,
|
||||
mask_dim: int,
|
||||
norm: Optional[Union[str, Callable]] = None,
|
||||
):
|
||||
"""
|
||||
NOTE: this interface is experimental.
|
||||
Args:
|
||||
input_shape: shapes (channels and stride) of the input features
|
||||
convs_dim: number of output channels for the intermediate conv layers.
|
||||
mask_dim: number of output channels for the final conv layer.
|
||||
norm (str or callable): normalization for all conv layers
|
||||
"""
|
||||
super().__init__(
|
||||
input_shape, convs_dim=convs_dim, mask_dim=mask_dim, norm=norm)
|
||||
self.ppm = PyramidPoolingModule(convs_dim, convs_dim // 4)
|
||||
|
||||
def forward_features(self, features):
|
||||
multi_scale_features = []
|
||||
num_cur_levels = 0
|
||||
# Reverse feature maps into top-down order (from low to high resolution)
|
||||
for idx, f in enumerate(self.in_features[::-1]):
|
||||
x = features[f]
|
||||
lateral_conv = self.lateral_convs[idx]
|
||||
output_conv = self.output_convs[idx]
|
||||
if idx == 0:
|
||||
y = self.ppm(lateral_conv(x))
|
||||
else:
|
||||
cur_fpn = lateral_conv(x)
|
||||
y = cur_fpn + F.interpolate(
|
||||
y,
|
||||
size=cur_fpn.shape[-2:],
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
y = output_conv(y)
|
||||
|
||||
if num_cur_levels < self.num_feature_levels:
|
||||
multi_scale_features.append(y)
|
||||
num_cur_levels += 1
|
||||
|
||||
return None, multi_scale_features
|
||||
@@ -0,0 +1,221 @@
|
||||
# Part of the implementation is borrowed and modified from Mask2Former, publicly available at
|
||||
# https://github.com/facebookresearch/Mask2Former.
|
||||
import os
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.image_instance_segmentation.maskdino_swin import \
|
||||
ImageList
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .backbones import build_resnet_backbone
|
||||
from .fastinst.fastinst_decoder import FastInstDecoder
|
||||
from .fastinst.fastinst_encoder import PyramidPoolingModuleFPN
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.image_segmentation, module_name=Models.fastinst)
|
||||
class FastInst(TorchModel):
|
||||
|
||||
def __init__(self,
|
||||
model_dir,
|
||||
backbone=None,
|
||||
encoder=None,
|
||||
decoder=None,
|
||||
pretrained=None,
|
||||
classes=None,
|
||||
**kwargs):
|
||||
"""
|
||||
Deep Learning Technique for Human Parsing: A Survey and Outlook. See https://arxiv.org/abs/2301.00394
|
||||
Args:
|
||||
backbone (dict): backbone config.
|
||||
encoder (dict): encoder config.
|
||||
decoder (dict): decoder config.
|
||||
pretrained (bool): whether to use pretrained model
|
||||
classes (list): class names
|
||||
"""
|
||||
super(FastInst, self).__init__(model_dir, **kwargs)
|
||||
|
||||
self.backbone = build_resnet_backbone(
|
||||
**backbone, input_shape={'channels': 3})
|
||||
in_features = encoder.pop('in_features')
|
||||
input_shape = {
|
||||
k: v
|
||||
for k, v in self.backbone.output_shape().items()
|
||||
if k in in_features
|
||||
}
|
||||
encoder = PyramidPoolingModuleFPN(input_shape=input_shape, **encoder)
|
||||
decoder = FastInstDecoder(in_channels=encoder.convs_dim, **decoder)
|
||||
self.sem_seg_head = FastInstHead(
|
||||
pixel_decoder=encoder, transformer_predictor=decoder)
|
||||
|
||||
self.num_classes = decoder.num_classes
|
||||
self.num_queries = decoder.num_queries
|
||||
self.size_divisibility = 32
|
||||
self.register_buffer(
|
||||
'pixel_mean',
|
||||
torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False)
|
||||
self.register_buffer(
|
||||
'pixel_std',
|
||||
torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False)
|
||||
self.classes = classes
|
||||
self.test_topk_per_image = 100
|
||||
|
||||
if pretrained:
|
||||
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
|
||||
logger.info(f'loading model from {model_path}')
|
||||
weight = torch.load(model_path, map_location='cpu')['model']
|
||||
tgt_weight = self.state_dict()
|
||||
for name in list(weight.keys()):
|
||||
if name in tgt_weight:
|
||||
load_size = weight[name].size()
|
||||
tgt_size = tgt_weight[name].size()
|
||||
mis_match = False
|
||||
if len(load_size) != len(tgt_size):
|
||||
mis_match = True
|
||||
else:
|
||||
for n1, n2 in zip(load_size, tgt_size):
|
||||
if n1 != n2:
|
||||
mis_match = True
|
||||
break
|
||||
if mis_match:
|
||||
logger.info(
|
||||
f'size mismatch for {name} '
|
||||
f'({load_size} -> {tgt_size}), skip loading.')
|
||||
del weight[name]
|
||||
else:
|
||||
logger.info(
|
||||
f'{name} doesn\'t exist in current model, skip loading.'
|
||||
)
|
||||
|
||||
self.load_state_dict(weight, strict=False)
|
||||
logger.info('load model done')
|
||||
|
||||
def forward(self, batched_inputs: List[dict]) -> Dict[str, Any]:
|
||||
images = [x['image'].to(self.device) for x in batched_inputs]
|
||||
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
|
||||
images = ImageList.from_tensors(images, self.size_divisibility)
|
||||
|
||||
features = self.backbone(images.tensor)
|
||||
outputs = self.sem_seg_head(features)
|
||||
|
||||
return dict(
|
||||
outputs=outputs, batched_inputs=batched_inputs, images=images)
|
||||
|
||||
def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
|
||||
outputs = input['outputs']
|
||||
batched_inputs = input['batched_inputs']
|
||||
images = input['images']
|
||||
if self.training:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
mask_cls_results = outputs['pred_logits'] # (B, Q, C+1)
|
||||
mask_pred_results = outputs['pred_masks'] # (B, Q, H, W)
|
||||
# upsample masks
|
||||
mask_pred_results = F.interpolate(
|
||||
mask_pred_results,
|
||||
size=(images.tensor.shape[-2], images.tensor.shape[-1]),
|
||||
mode='bilinear',
|
||||
align_corners=False,
|
||||
)
|
||||
|
||||
del outputs
|
||||
|
||||
processed_results = []
|
||||
for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
|
||||
mask_cls_results, mask_pred_results, batched_inputs,
|
||||
images.image_sizes):
|
||||
height = input_per_image.get('height', image_size[0])
|
||||
width = input_per_image.get('width', image_size[1])
|
||||
processed_results.append({}) # for each image
|
||||
|
||||
mask_pred_result = self.sem_seg_postprocess(
|
||||
mask_pred_result, image_size, height, width)
|
||||
mask_cls_result = mask_cls_result.to(mask_pred_result)
|
||||
|
||||
instance_r = self.instance_inference(mask_cls_result,
|
||||
mask_pred_result)
|
||||
processed_results[-1]['instances'] = instance_r
|
||||
|
||||
return dict(eval_result=processed_results)
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return self.pixel_mean.device
|
||||
|
||||
def sem_seg_postprocess(self, result, img_size, output_height,
|
||||
output_width):
|
||||
result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
|
||||
result = F.interpolate(
|
||||
result,
|
||||
size=(output_height, output_width),
|
||||
mode='bilinear',
|
||||
align_corners=False)[0]
|
||||
return result
|
||||
|
||||
def instance_inference(self, mask_cls, mask_pred):
|
||||
# mask_pred is already processed to have the same shape as original input
|
||||
image_size = mask_pred.shape[-2:]
|
||||
|
||||
# [Q, K]
|
||||
scores = F.softmax(mask_cls, dim=-1)[:, :-1]
|
||||
labels = torch.arange(
|
||||
self.num_classes,
|
||||
device=self.device).unsqueeze(0).repeat(self.num_queries,
|
||||
1).flatten(0, 1)
|
||||
scores_per_image, topk_indices = scores.flatten(0, 1).topk(
|
||||
self.test_topk_per_image, sorted=False)
|
||||
labels_per_image = labels[topk_indices]
|
||||
|
||||
topk_indices = topk_indices // self.num_classes
|
||||
mask_pred = mask_pred[topk_indices]
|
||||
|
||||
result = {'image_size': image_size}
|
||||
# mask (before sigmoid)
|
||||
mask_pred_sigmoid = mask_pred.sigmoid()
|
||||
result['pred_masks'] = (mask_pred_sigmoid > 0.5).float()
|
||||
|
||||
# calculate average mask prob
|
||||
mask_scores_per_image = (mask_pred_sigmoid.flatten(1)
|
||||
* result['pred_masks'].flatten(1)).sum(1) / (
|
||||
result['pred_masks'].flatten(1).sum(1)
|
||||
+ 1e-6)
|
||||
result['scores'] = scores_per_image * mask_scores_per_image
|
||||
result['pred_classes'] = labels_per_image
|
||||
return result
|
||||
|
||||
|
||||
class FastInstHead(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
pixel_decoder: nn.Module,
|
||||
# extra parameters
|
||||
transformer_predictor: nn.Module):
|
||||
"""
|
||||
NOTE: this interface is experimental.
|
||||
Args:
|
||||
pixel_decoder: the pixel decoder module
|
||||
transformer_predictor: the transformer decoder that makes prediction
|
||||
"""
|
||||
super().__init__()
|
||||
self.pixel_decoder = pixel_decoder
|
||||
self.predictor = transformer_predictor
|
||||
|
||||
def forward(self, features, targets=None):
|
||||
return self.layers(features, targets)
|
||||
|
||||
def layers(self, features, targets=None):
|
||||
mask_features, multi_scale_features = self.pixel_decoder.forward_features(
|
||||
features)
|
||||
predictions = self.predictor(multi_scale_features, mask_features,
|
||||
targets)
|
||||
return predictions
|
||||
@@ -108,16 +108,16 @@ def get_img_ins_seg_result(img_seg_result=None,
|
||||
for seg_result in img_seg_result:
|
||||
|
||||
box = [
|
||||
np.int(seg_result[0]),
|
||||
np.int(seg_result[1]),
|
||||
np.int(seg_result[2]),
|
||||
np.int(seg_result[3])
|
||||
int(seg_result[0]),
|
||||
int(seg_result[1]),
|
||||
int(seg_result[2]),
|
||||
int(seg_result[3])
|
||||
]
|
||||
score = np.float(seg_result[4])
|
||||
score = float(seg_result[4])
|
||||
category = seg_result[5]
|
||||
|
||||
mask = np.array(seg_result[6], order='F', dtype='uint8')
|
||||
mask = mask.astype(np.float)
|
||||
mask = mask.astype(float)
|
||||
|
||||
results_dict[OutputKeys.BOXES].append(box)
|
||||
results_dict[OutputKeys.MASKS].append(mask)
|
||||
|
||||
@@ -382,7 +382,7 @@ def processing_single_scene(args):
|
||||
points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1],
|
||||
points3d[p3d_id].xyz[2], 1
|
||||
])
|
||||
zs.append(np.asscalar(transformed[2]))
|
||||
zs.append(transformed[2].item())
|
||||
zs_sorted = sorted(zs)
|
||||
# relaxed depth range
|
||||
max_ratio = 0.1
|
||||
|
||||
@@ -40,7 +40,7 @@ def read_mask(filename):
|
||||
|
||||
# save a binary mask
|
||||
def save_mask(filename, mask):
|
||||
assert mask.dtype == np.bool
|
||||
assert mask.dtype == bool
|
||||
mask = mask.astype(np.uint8) * 255
|
||||
Image.fromarray(mask).save(filename)
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .panseg_model import SwinLPanopticSegmentation
|
||||
from .r50_panseg_model import R50PanopticSegmentation
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from easycv.models.segmentation import Mask2Former
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_segmentation,
|
||||
module_name=Models.r50_panoptic_segmentation)
|
||||
class R50PanopticSegmentation(EasyCVBaseModel, Mask2Former):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
Mask2Former.__init__(self, *args, **kwargs)
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.segmentation import EncoderDecoder
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_segmentation, module_name=Models.segformer)
|
||||
class Segformer(EasyCVBaseModel, EncoderDecoder):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
EncoderDecoder.__init__(self, *args, **kwargs)
|
||||
@@ -60,7 +60,7 @@ class SemanticSegmentation(TorchModel):
|
||||
ids = ids[legal_indices]
|
||||
|
||||
segms = (semantic_result[None] == ids[:, None, None])
|
||||
masks = [it.astype(np.int) for it in segms]
|
||||
masks = [it.astype(int) for it in segms]
|
||||
labels_txt = np.array(self.CLASSES)[ids].tolist()
|
||||
|
||||
results = {
|
||||
|
||||
@@ -458,7 +458,7 @@ class HrnetBackBone(nn.Module):
|
||||
self.stage4, pre_stage_channels = self._make_stage(
|
||||
self.stage4_cfg, num_channels, multi_scale_output=True)
|
||||
|
||||
self.backbone_last_inp_channels = np.int(np.sum(pre_stage_channels))
|
||||
self.backbone_last_inp_channels = int(np.sum(pre_stage_channels))
|
||||
|
||||
def _make_transition_layer(self, num_channels_pre_layer,
|
||||
num_channels_cur_layer):
|
||||
|
||||
@@ -259,7 +259,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
|
||||
num_channels = [64, last_inp_channels]
|
||||
self.stage_super, super_stage_channels = self._make_stage(
|
||||
self.super_dict, num_channels)
|
||||
last_inp_channels = np.int(np.sum(super_stage_channels))
|
||||
last_inp_channels = int(np.sum(super_stage_channels))
|
||||
|
||||
if self.is_contain_aspp:
|
||||
aspp_param = kwargs['aspp']
|
||||
@@ -372,7 +372,7 @@ class HrnetSuperAndOcr(HrnetBackBone):
|
||||
num_channels = [64, ocr_mid_channels]
|
||||
self.stage_super, super_stage_channels = self._make_stage(
|
||||
self.super_dict, num_channels)
|
||||
last_inp_channels = np.int(np.sum(super_stage_channels))
|
||||
last_inp_channels = int(np.sum(super_stage_channels))
|
||||
|
||||
self.cls_head = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
|
||||
@@ -13,7 +13,8 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torchvision.transforms as TF
|
||||
from PIL import Image
|
||||
from shotdetect_scenedetect_lgss import shot_detect
|
||||
from shotdetect_scenedetect_lgss import shot_detector
|
||||
from tqdm import tqdm
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base.base_torch_model import TorchModel
|
||||
@@ -60,6 +61,9 @@ class MovieSceneSegmentationModel(TorchModel):
|
||||
self.head_sbd = nn.Linear(hdim, 2)
|
||||
load_param_with_prefix('head_sbd', self.head_sbd, params)
|
||||
|
||||
self.shot_detector = shot_detector()
|
||||
self.shot_detector.init(**self.cfg.preprocessor.shot_detect)
|
||||
|
||||
self.test_transform = TF.Compose([
|
||||
TF.Resize(size=256, interpolation=Image.BICUBIC),
|
||||
TF.CenterCrop(224),
|
||||
@@ -98,29 +102,45 @@ class MovieSceneSegmentationModel(TorchModel):
|
||||
def inference(self, batch):
|
||||
logger.info('Begin scene detect ......')
|
||||
bs = self.cfg.pipeline.batch_size_per_gpu
|
||||
sids = batch['sid']
|
||||
inputs = batch['shot_feat']
|
||||
device = self.crn.attention_mask.device
|
||||
|
||||
shot_num = len(sids)
|
||||
shot_timecode_lst = batch['shot_timecode_lst']
|
||||
shot_idx_lst = batch['shot_idx_lst']
|
||||
|
||||
shot_num = len(shot_timecode_lst)
|
||||
cnt = math.ceil(shot_num / bs)
|
||||
|
||||
infer_sid, infer_pred = [], []
|
||||
infer_pred = []
|
||||
infer_result = {}
|
||||
for i in range(cnt):
|
||||
self.shot_detector.start()
|
||||
|
||||
for i in tqdm(range(cnt)):
|
||||
start = i * bs
|
||||
end = (i + 1) * bs if (i + 1) * bs < shot_num else shot_num
|
||||
input_ = inputs[start:end]
|
||||
sid_ = sids[start:end]
|
||||
input_ = torch.stack(input_)
|
||||
|
||||
batch_shot_idx_lst = shot_idx_lst[start:end]
|
||||
|
||||
shot_start_idx = batch_shot_idx_lst[0][0]
|
||||
shot_end_idx = batch_shot_idx_lst[-1][-1]
|
||||
batch_timecode_lst = {
|
||||
i: shot_timecode_lst[i]
|
||||
for i in range(shot_start_idx, shot_end_idx + 1)
|
||||
}
|
||||
batch_shot_keyf_lst = self.shot_detector.get_frame_img(
|
||||
batch_timecode_lst, shot_start_idx, shot_num)
|
||||
inputs = self.get_batch_input(batch_shot_keyf_lst, shot_start_idx,
|
||||
batch_shot_idx_lst)
|
||||
|
||||
input_ = torch.stack(inputs).to(device)
|
||||
outputs = self.shared_step(input_) # shape [b,2]
|
||||
prob = F.softmax(outputs, dim=1)
|
||||
infer_sid.extend(sid_.cpu().detach().numpy())
|
||||
infer_pred.extend(prob[:, 1].cpu().detach().numpy())
|
||||
infer_result.update({'pred': np.stack(infer_pred)})
|
||||
infer_result.update({'sid': infer_sid})
|
||||
|
||||
assert len(infer_result['sid']) == len(sids)
|
||||
assert len(infer_result['pred']) == len(inputs)
|
||||
infer_result.update({'pred': np.stack(infer_pred)})
|
||||
infer_result.update({'sid': np.arange(shot_num)})
|
||||
|
||||
assert len(infer_result['pred']) == shot_num
|
||||
self.shot_detector.release()
|
||||
return infer_result
|
||||
|
||||
def shared_step(self, inputs):
|
||||
@@ -162,38 +182,48 @@ class MovieSceneSegmentationModel(TorchModel):
|
||||
logger.info('Generate scene .......')
|
||||
|
||||
pred_dict = inputs['feat']
|
||||
shot2keyf = inputs['shot2keyf']
|
||||
thres = self.cfg.pipeline.save_threshold
|
||||
|
||||
anno_dict = get_pred_boundary(pred_dict, thres)
|
||||
scene_dict_lst, scene_list, shot_num, shot_dict_lst = pred2scene(
|
||||
self.shot2keyf, anno_dict)
|
||||
shot2keyf, anno_dict)
|
||||
if self.cfg.pipeline.save_split_scene:
|
||||
re_dir = scene2video(inputs['input_video_pth'], scene_list, thres)
|
||||
print(f'Split scene video saved to {re_dir}')
|
||||
return len(scene_list), scene_dict_lst, shot_num, shot_dict_lst
|
||||
|
||||
def preprocess(self, inputs):
|
||||
logger.info('Begin shot detect......')
|
||||
shot_keyf_lst, anno, shot2keyf = shot_detect(
|
||||
inputs, **self.cfg.preprocessor.shot_detect)
|
||||
logger.info('Shot detect done!')
|
||||
def get_batch_input(self, shot_keyf_lst, shot_start_idx, shot_idx_lst):
|
||||
|
||||
single_shot_feat, sid = [], []
|
||||
single_shot_feat = []
|
||||
for idx, one_shot in enumerate(shot_keyf_lst):
|
||||
one_shot = [
|
||||
self.test_transform(one_frame) for one_frame in one_shot
|
||||
]
|
||||
one_shot = torch.stack(one_shot, dim=0)
|
||||
single_shot_feat.append(one_shot)
|
||||
sid.append(idx)
|
||||
|
||||
single_shot_feat = torch.stack(single_shot_feat, dim=0)
|
||||
|
||||
shot_feat = []
|
||||
for idx, shot_idx in enumerate(shot_idx_lst):
|
||||
shot_idx_ = shot_idx - shot_start_idx
|
||||
_one_shot = single_shot_feat[shot_idx_]
|
||||
shot_feat.append(_one_shot)
|
||||
|
||||
return shot_feat
|
||||
|
||||
def preprocess(self, inputs):
|
||||
logger.info('Begin shot detect......')
|
||||
shot_timecode_lst, anno, shot2keyf = self.shot_detector.shot_detect(
|
||||
inputs, **self.cfg.preprocessor.shot_detect)
|
||||
logger.info('Shot detect done!')
|
||||
|
||||
shot_idx_lst = []
|
||||
for idx, one_shot in enumerate(anno):
|
||||
shot_idx = int(one_shot['shot_id']) + np.arange(
|
||||
-self.neighbor_size, self.neighbor_size + 1)
|
||||
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'])
|
||||
_one_shot = single_shot_feat[shot_idx]
|
||||
shot_feat.append(_one_shot)
|
||||
self.shot2keyf = shot2keyf
|
||||
self.anno = anno
|
||||
return shot_feat, sid
|
||||
shot_idx = np.clip(shot_idx, 0, one_shot['num_shot'] - 1)
|
||||
shot_idx_lst.append(shot_idx)
|
||||
|
||||
return shot2keyf, anno, shot_timecode_lst, shot_idx_lst
|
||||
|
||||
@@ -10,11 +10,12 @@ from tqdm import tqdm
|
||||
|
||||
|
||||
def get_pred_boundary(pred_dict, threshold=0.5):
|
||||
pred = pred_dict['pred']
|
||||
pred = pred_dict['pred'].cpu().numpy()
|
||||
sid = pred_dict['sid'].cpu().numpy().astype(np.int32)
|
||||
tmp = (pred > threshold).astype(np.int32)
|
||||
anno_dict = {}
|
||||
for idx in range(len(tmp)):
|
||||
anno_dict.update({str(pred_dict['sid'][idx]).zfill(4): int(tmp[idx])})
|
||||
anno_dict.update({str(sid[idx]).zfill(4): int(tmp[idx])})
|
||||
return anno_dict
|
||||
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ class ObjectSegmenter(object):
|
||||
elif img.shape[2] == 4:
|
||||
img = img[:, :, :3]
|
||||
img = img[:, :, ::-1]
|
||||
img = img.astype(np.float)
|
||||
img = img.astype(float)
|
||||
return img
|
||||
|
||||
def run_mask(self, img):
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.detection.detectors import Detection as _Detection
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_object_detection, module_name=Models.dino)
|
||||
class DINO(EasyCVBaseModel, _Detection):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
_Detection.__init__(self, *args, **kwargs)
|
||||
@@ -1,21 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from easycv.models.detection.detectors import YOLOX as _YOLOX
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.cv.easycv_base import EasyCVBaseModel
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_object_detection, module_name=Models.yolox)
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.image_object_detection,
|
||||
module_name=Models.image_object_detection_auto)
|
||||
@MODELS.register_module(
|
||||
group_key=Tasks.domain_specific_object_detection, module_name=Models.yolox)
|
||||
class YOLOX(EasyCVBaseModel, _YOLOX):
|
||||
|
||||
def __init__(self, model_dir=None, *args, **kwargs):
|
||||
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
|
||||
_YOLOX.__init__(self, *args, **kwargs)
|
||||
@@ -30,7 +30,7 @@ def depth2color(depth):
|
||||
if gray == 1:
|
||||
return tuple(colors[-1].tolist())
|
||||
num_rank = len(colors) - 1
|
||||
rank = np.floor(gray * num_rank).astype(np.int)
|
||||
rank = np.floor(gray * num_rank).astype(int)
|
||||
diff = (gray - rank / num_rank) * num_rank
|
||||
tmp = colors[rank + 1] - colors[rank]
|
||||
return tuple((colors[rank] + tmp * diff).tolist())
|
||||
@@ -136,7 +136,7 @@ def plot_result(res_path,
|
||||
l2g = get_lidar2global(infos)
|
||||
corners_lidar = corners_global @ np.linalg.inv(l2g).T
|
||||
corners_lidar = corners_lidar[:, :3]
|
||||
pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=np.bool)
|
||||
pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=bool)
|
||||
scores = [
|
||||
pred_res[rid]['detection_score'] for rid in range(len(pred_res))
|
||||
]
|
||||
@@ -151,7 +151,7 @@ def plot_result(res_path,
|
||||
origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3)
|
||||
corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt],
|
||||
axis=0)
|
||||
gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=np.bool)
|
||||
gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=bool)
|
||||
pred_flag = np.concatenate(
|
||||
[pred_flag, np.logical_not(gt_flag)], axis=0)
|
||||
scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])]
|
||||
@@ -169,7 +169,7 @@ def plot_result(res_path,
|
||||
check_point_in_img(corners_img, img.shape[0], img.shape[1]))
|
||||
valid = valid.reshape(
|
||||
-1, 8) # valid means: d>0 and visible in current view
|
||||
corners_img = corners_img.reshape(-1, 8, 2).astype(np.int)
|
||||
corners_img = corners_img.reshape(-1, 8, 2).astype(int)
|
||||
for aid in range(valid.shape[0]):
|
||||
if scores[aid] < vis_thred and pred_flag[aid]:
|
||||
continue
|
||||
|
||||
@@ -90,8 +90,15 @@ class OCRRecognition(TorchModel):
|
||||
f'recognizer should be either ConvNextViT, CRNN, but got {cfgs.model.recognizer}'
|
||||
)
|
||||
if model_path != '':
|
||||
self.recognizer.load_state_dict(
|
||||
torch.load(model_path, map_location='cpu'))
|
||||
params_pretrained = torch.load(model_path, map_location='cpu')
|
||||
model_dict = self.recognizer.state_dict()
|
||||
# remove prefix for finetuned models
|
||||
check_point = {
|
||||
k.replace('recognizer.', ''): v
|
||||
for k, v in params_pretrained.items()
|
||||
}
|
||||
model_dict.update(check_point)
|
||||
self.recognizer.load_state_dict(model_dict)
|
||||
|
||||
dict_path = os.path.join(model_dir, ModelFile.VOCAB_FILE)
|
||||
self.labelMapping = dict()
|
||||
|
||||
@@ -176,8 +176,7 @@ class OpenVocabularyDetectionViLD(Model):
|
||||
# Filter out invalid rois (nmsed rois)
|
||||
valid_indices = np.where(
|
||||
np.logical_and(
|
||||
np.isin(
|
||||
np.arange(len(roi_scores), dtype=np.int), nmsed_indices),
|
||||
np.isin(np.arange(len(roi_scores), dtype=int), nmsed_indices),
|
||||
np.logical_and(
|
||||
np.logical_not(np.all(roi_boxes == 0., axis=-1)),
|
||||
np.logical_and(roi_scores >= min_rpn_score_thresh,
|
||||
|
||||
@@ -72,7 +72,7 @@ class Cube2Equirec(nn.Module):
|
||||
self.equ_h, 0), 3 * self.equ_w // 8, 1)
|
||||
|
||||
# Prepare ceil mask
|
||||
mask = np.zeros((self.equ_h, self.equ_w // 4), np.bool)
|
||||
mask = np.zeros((self.equ_h, self.equ_w // 4), bool)
|
||||
idx = np.linspace(-np.pi, np.pi, self.equ_w // 4) / 4
|
||||
idx = self.equ_h // 2 - np.round(
|
||||
np.arctan(np.cos(idx)) * self.equ_h / np.pi).astype(int)
|
||||
|
||||
@@ -29,7 +29,7 @@ def load_depth(file):
|
||||
elif file.endswith('png'):
|
||||
depth_png = np.array(load_image(file), dtype=int)
|
||||
assert (np.max(depth_png) > 255), 'Wrong .png depth file'
|
||||
return depth_png.astype(np.float) / 256.
|
||||
return depth_png.astype(float) / 256.
|
||||
else:
|
||||
raise NotImplementedError('Depth extension not supported.')
|
||||
|
||||
|
||||
@@ -85,7 +85,7 @@ def do_scene_detect(F01_tensor, F10_tensor, img0_tensor, img1_tensor):
|
||||
img_diff = ori_img.float() - ref_img.float()
|
||||
img_diff = torch.abs(img_diff)
|
||||
|
||||
kernel = np.ones([8, 8], np.float) / 64
|
||||
kernel = np.ones([8, 8], float) / 64
|
||||
kernel = torch.FloatTensor(kernel).to(device).unsqueeze(0).unsqueeze(0)
|
||||
diff = F.conv2d(img_diff, kernel, padding=4)
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ def linear_assignment(cost_matrix, thresh):
|
||||
|
||||
|
||||
def ious(atlbrs, btlbrs):
|
||||
ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float)
|
||||
ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=float)
|
||||
if ious.size == 0:
|
||||
return ious
|
||||
|
||||
@@ -60,13 +60,13 @@ def embedding_distance(tracks, detections, metric='cosine'):
|
||||
cost_matrix: np.ndarray
|
||||
"""
|
||||
|
||||
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float)
|
||||
cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
|
||||
if cost_matrix.size == 0:
|
||||
return cost_matrix
|
||||
det_features = np.asarray([track.curr_feat for track in detections],
|
||||
dtype=np.float)
|
||||
dtype=float)
|
||||
track_features = np.asarray([track.smooth_feat for track in tracks],
|
||||
dtype=np.float)
|
||||
dtype=float)
|
||||
cost_matrix = np.maximum(0.0, cdist(track_features, det_features, metric))
|
||||
return cost_matrix
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ class STrack(BaseTrack):
|
||||
def __init__(self, tlwh, score, temp_feat, buffer_size=30):
|
||||
|
||||
# wait activate
|
||||
self._tlwh = np.asarray(tlwh, dtype=np.float)
|
||||
self._tlwh = np.asarray(tlwh, dtype=float)
|
||||
self.kalman_filter = None
|
||||
self.mean, self.covariance = None, None
|
||||
self.is_activated = False
|
||||
|
||||
@@ -20,6 +20,8 @@ if TYPE_CHECKING:
|
||||
from .vldoc import VLDocForDocVLEmbedding
|
||||
from .video_synthesis import TextToVideoSynthesis
|
||||
from .efficient_diffusion_tuning import EfficientStableDiffusion
|
||||
from .mplug_owl import MplugOwlForConditionalGeneration
|
||||
from .clip_interrogator import CLIP_Interrogator
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
@@ -37,7 +39,9 @@ else:
|
||||
['MultiStageDiffusionForTextToImageSynthesis'],
|
||||
'vldoc': ['VLDocForDocVLEmbedding'],
|
||||
'video_synthesis': ['TextToVideoSynthesis'],
|
||||
'efficient_diffusion_tuning': ['EfficientStableDiffusion']
|
||||
'efficient_diffusion_tuning': ['EfficientStableDiffusion'],
|
||||
'mplug_owl': ['MplugOwlForConditionalGeneration'],
|
||||
'clip_interrogator': ['CLIP_Interrogator'],
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
from .model import CLIP_Interrogator
|
||||
modelscope/models/multi_modal/clip_interrogator/model.py (new file, 599 lines)
@@ -0,0 +1,599 @@
|
||||
# This implementation is adapted from CLIP-Interrogator, made publicly available under the MIT License at
|
||||
# https://github.com/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator/clip_interrogator.py
|
||||
|
||||
import hashlib
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
import open_clip
|
||||
import requests
|
||||
import torch
|
||||
import torchvision.transforms as transforms
|
||||
from PIL import Image
|
||||
from safetensors.numpy import load_file, save_file
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoProcessor,
|
||||
Blip2ForConditionalGeneration,
|
||||
BlipForConditionalGeneration)
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.preprocessors import LoadImage
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
__all__ = ['CLIP_Interrogator']
|
||||
|
||||
CAPTION_MODELS = {
|
||||
'blip-base': 'blip-image-captioning-base',
|
||||
'blip-large': 'blip-image-captioning-large',
|
||||
'blip2-2.7b': 'blip2-opt-2.7b',
|
||||
'blip2-flan-t5-xl': 'blip2-flan-t5-xl',
|
||||
'git-large-coco': 'git-large-coco',
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class Config:
|
||||
# models can optionally be passed in directly
|
||||
caption_model = None
|
||||
caption_processor = None
|
||||
clip_model = None
|
||||
clip_preprocess = None
|
||||
|
||||
# blip settings
|
||||
caption_max_length: int = 32
|
||||
caption_model_name: Optional[
|
||||
str] = 'blip-large' # use a key from CAPTION_MODELS or None
|
||||
caption_offload: bool = False
|
||||
|
||||
# clip settings
|
||||
clip_model_name: str = 'ViT-L-14/openai'
|
||||
clip_model_path: Optional[str] = None
|
||||
clip_offload: bool = False
|
||||
|
||||
# interrogator settings
|
||||
cache_path: str = 'cache' # path to store cached text embeddings
|
||||
download_cache: bool = False # when true, cached embeds are downloaded from huggingface
|
||||
chunk_size: int = 2048 # batch size for CLIP, use smaller for lower VRAM
|
||||
data_path: str = os.path.join(os.path.dirname(__file__), 'data')
|
||||
device: str = ('cuda' if torch.cuda.is_available() else 'cpu')
|
||||
flavor_intermediate_count: int = 2048
|
||||
quiet: bool = False # when quiet progress bars are not shown
|
||||
|
||||
def apply_low_vram_defaults(self):
|
||||
self.caption_model_name = 'blip-base'
|
||||
self.caption_offload = True
|
||||
self.clip_offload = True
|
||||
self.chunk_size = 1024
|
||||
self.flavor_intermediate_count = 1024
|
||||
|
||||
|
||||
# CLIP-Interrogator utilizes CLIP and BLIP to generate rich captions for images.
# CLIP is a zero-shot image classifier which can be used to generate image and text embeddings.
# BLIP is a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks.
# BLIP effectively utilizes the noisy web data by bootstrapping the captions, where
# a captioner generates synthetic captions and a filter removes the noisy ones.
# Please refer to the papers CLIP: Learning Transferable Visual Models From Natural Language Supervision
# https://arxiv.org/abs/2103.00020
# and BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
# https://arxiv.org/abs/2201.12086
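As a rough sketch of that two-stage flow (illustrative only; it assumes the BLIP and CLIP weights are already available under the configured cache and data paths, and 'example.jpg' is a placeholder):

from PIL import Image

cfg = Config(clip_model_name='ViT-L-14/openai')
cfg.apply_low_vram_defaults()            # optional: smaller BLIP, offloading, smaller CLIP batches
ci = Interrogator(cfg)                   # loads the caption model and the CLIP model
image = Image.open('example.jpg').convert('RGB')
print(ci.interrogate_fast(image))        # BLIP caption + top CLIP-ranked modifiers
print(ci.interrogate_negative(image))    # most dissimilar terms, useful as a negative prompt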
|
||||
|
||||
|
||||
class Interrogator():
|
||||
|
||||
def __init__(self, config: Config):
|
||||
self.config = config
|
||||
self.device = config.device
|
||||
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||
self.caption_offloaded = True
|
||||
self.clip_offloaded = True
|
||||
self.load_caption_model()
|
||||
self.load_clip_model()
|
||||
|
||||
def load_caption_model(self):
|
||||
if self.config.caption_model is None and self.config.caption_model_name:
|
||||
if not self.config.quiet:
|
||||
print(
|
||||
f'Loading caption model {self.config.caption_model_name}...'
|
||||
)
|
||||
|
||||
model_path = CAPTION_MODELS[self.config.caption_model_name]
|
||||
if self.config.caption_model_name.startswith('git-'):
|
||||
caption_model = AutoModelForCausalLM.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path),
|
||||
torch_dtype=torch.float32)
|
||||
elif self.config.caption_model_name.startswith('blip2-'):
|
||||
caption_model = Blip2ForConditionalGeneration.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path),
|
||||
torch_dtype=self.dtype)
|
||||
else:
|
||||
caption_model = BlipForConditionalGeneration.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path),
|
||||
torch_dtype=self.dtype)
|
||||
self.caption_processor = AutoProcessor.from_pretrained(
|
||||
os.path.join(self.config.cache_path, model_path))
|
||||
|
||||
caption_model.eval()
|
||||
if not self.config.caption_offload:
|
||||
caption_model = caption_model.to(self.config.device)
|
||||
self.caption_model = caption_model
|
||||
else:
|
||||
self.caption_model = self.config.caption_model
|
||||
self.caption_processor = self.config.caption_processor
|
||||
|
||||
def load_clip_model(self):
|
||||
start_time = time.time()
|
||||
config = self.config
|
||||
|
||||
clip_model_name, clip_model_pretrained_name = config.clip_model_name.split(
|
||||
'/', 2)
|
||||
|
||||
if config.clip_model is None:
|
||||
if not config.quiet:
|
||||
print(f'Loading CLIP model {config.clip_model_name}...')
|
||||
|
||||
self.clip_model, _, self.clip_preprocess = open_clip.create_model_and_transforms(
|
||||
clip_model_name,
|
||||
pretrained=clip_model_pretrained_name,
|
||||
precision='fp16' if config.device == 'cuda' else 'fp32',
|
||||
device=config.device,
|
||||
jit=False,
|
||||
cache_dir=config.clip_model_path)
|
||||
self.clip_model.eval()
|
||||
else:
|
||||
self.clip_model = config.clip_model
|
||||
self.clip_preprocess = config.clip_preprocess
|
||||
self.tokenize = open_clip.get_tokenizer(clip_model_name)
|
||||
|
||||
sites = [
|
||||
'Artstation', 'behance', 'cg society', 'cgsociety', 'deviantart',
|
||||
'dribbble', 'flickr', 'instagram', 'pexels', 'pinterest',
|
||||
'pixabay', 'pixiv', 'polycount', 'reddit', 'shutterstock',
|
||||
'tumblr', 'unsplash', 'zbrush central'
|
||||
]
|
||||
trending_list = [site for site in sites]
|
||||
trending_list.extend(['trending on ' + site for site in sites])
|
||||
trending_list.extend(['featured on ' + site for site in sites])
|
||||
trending_list.extend([site + ' contest winner' for site in sites])
|
||||
|
||||
raw_artists = load_list(config.data_path, 'artists.txt')
|
||||
artists = [f'by {a}' for a in raw_artists]
|
||||
artists.extend([f'inspired by {a}' for a in raw_artists])
|
||||
|
||||
self._prepare_clip()
|
||||
self.artists = LabelTable(artists, 'artists', self)
|
||||
self.flavors = LabelTable(
|
||||
load_list(config.data_path, 'flavors.txt'), 'flavors', self)
|
||||
self.mediums = LabelTable(
|
||||
load_list(config.data_path, 'mediums.txt'), 'mediums', self)
|
||||
self.movements = LabelTable(
|
||||
load_list(config.data_path, 'movements.txt'), 'movements', self)
|
||||
self.trendings = LabelTable(trending_list, 'trendings', self)
|
||||
self.negative = LabelTable(
|
||||
load_list(config.data_path, 'negative.txt'), 'negative', self)
|
||||
|
||||
end_time = time.time()
|
||||
if not config.quiet:
|
||||
print(
|
||||
f'Loaded CLIP model and data in {end_time-start_time:.2f} seconds.'
|
||||
)
|
||||
|
||||
def chain(self,
|
||||
image_features: torch.Tensor,
|
||||
phrases: List[str],
|
||||
best_prompt: str = '',
|
||||
best_sim: float = 0,
|
||||
min_count: int = 8,
|
||||
max_count: int = 32,
|
||||
desc='Chaining',
|
||||
reverse: bool = False) -> str:
|
||||
self._prepare_clip()
|
||||
|
||||
phrases = set(phrases)
|
||||
if not best_prompt:
|
||||
best_prompt = self.rank_top(
|
||||
image_features, [f for f in phrases], reverse=reverse)
|
||||
best_sim = self.similarity(image_features, best_prompt)
|
||||
phrases.remove(best_prompt)
|
||||
curr_prompt, curr_sim = best_prompt, best_sim
|
||||
|
||||
def check(addition: str, idx: int) -> bool:
|
||||
nonlocal best_prompt, best_sim, curr_prompt, curr_sim
|
||||
prompt = curr_prompt + ', ' + addition
|
||||
sim = self.similarity(image_features, prompt)
|
||||
if reverse:
|
||||
sim = -sim
|
||||
|
||||
if sim > best_sim:
|
||||
best_prompt, best_sim = prompt, sim
|
||||
if sim > curr_sim or idx < min_count:
|
||||
curr_prompt, curr_sim = prompt, sim
|
||||
return True
|
||||
return False
|
||||
|
||||
for idx in tqdm(
|
||||
range(max_count), desc=desc, disable=self.config.quiet):
|
||||
best = self.rank_top(
|
||||
image_features, [f'{curr_prompt}, {f}' for f in phrases],
|
||||
reverse=reverse)
|
||||
flave = best[len(curr_prompt) + 2:]
|
||||
if not check(flave, idx):
|
||||
break
|
||||
if _prompt_at_max_len(curr_prompt, self.tokenize):
|
||||
break
|
||||
phrases.remove(flave)
|
||||
|
||||
return best_prompt
|
||||
|
||||
def generate_caption(self, pil_image: Image) -> str:
|
||||
assert self.caption_model is not None, 'No caption model loaded.'
|
||||
self._prepare_caption()
|
||||
inputs = self.caption_processor(
|
||||
images=pil_image, return_tensors='pt').to(self.device)
|
||||
if not self.config.caption_model_name.startswith('git-'):
|
||||
inputs = inputs.to(self.dtype)
|
||||
tokens = self.caption_model.generate(
|
||||
**inputs, max_new_tokens=self.config.caption_max_length)
|
||||
return self.caption_processor.batch_decode(
|
||||
tokens, skip_special_tokens=True)[0].strip()
|
||||
|
||||
def image_to_features(self, image: Image) -> torch.Tensor:
|
||||
self._prepare_clip()
|
||||
images = self.clip_preprocess(image).unsqueeze(0).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
image_features = self.clip_model.encode_image(images)
|
||||
image_features /= image_features.norm(dim=-1, keepdim=True)
|
||||
return image_features
|
||||
|
||||
def interrogate_classic(self,
|
||||
image: Image,
|
||||
max_flavors: int = 3,
|
||||
caption: Optional[str] = None) -> str:
|
||||
"""Classic mode creates a prompt in a standard format first describing the image,
|
||||
then listing the artist, trending, movement, and flavor text modifiers."""
|
||||
caption = caption or self.generate_caption(image)
|
||||
image_features = self.image_to_features(image)
|
||||
|
||||
medium = self.mediums.rank(image_features, 1)[0]
|
||||
artist = self.artists.rank(image_features, 1)[0]
|
||||
trending = self.trendings.rank(image_features, 1)[0]
|
||||
movement = self.movements.rank(image_features, 1)[0]
|
||||
flaves = ', '.join(self.flavors.rank(image_features, max_flavors))
|
||||
|
||||
if caption.startswith(medium):
|
||||
prompt = f'{caption} {artist}, {trending}, {movement}, {flaves}'
|
||||
else:
|
||||
prompt = f'{caption}, {medium} {artist}, {trending}, {movement}, {flaves}'
|
||||
|
||||
return _truncate_to_fit(prompt, self.tokenize)
|
||||
|
||||
def interrogate_fast(self,
|
||||
image: Image,
|
||||
max_flavors: int = 32,
|
||||
caption: Optional[str] = None) -> str:
|
||||
"""Fast mode simply adds the top ranked terms after a caption. It generally results in
|
||||
better similarity between generated prompt and image than classic mode, but the prompts
|
||||
are less readable."""
|
||||
caption = caption or self.generate_caption(image)
|
||||
image_features = self.image_to_features(image)
|
||||
merged = _merge_tables([
|
||||
self.artists, self.flavors, self.mediums, self.movements,
|
||||
self.trendings
|
||||
], self)
|
||||
tops = merged.rank(image_features, max_flavors)
|
||||
return _truncate_to_fit(caption + ', ' + ', '.join(tops),
|
||||
self.tokenize)
|
||||
|
||||
def interrogate_negative(self, image: Image, max_flavors: int = 32) -> str:
|
||||
"""Negative mode chains together the most dissimilar terms to the image. It can be used
|
||||
to help build a negative prompt to pair with the regular positive prompt and often
|
||||
improve the results of generated images particularly with Stable Diffusion 2."""
|
||||
image_features = self.image_to_features(image)
|
||||
flaves = self.flavors.rank(
|
||||
image_features,
|
||||
self.config.flavor_intermediate_count,
|
||||
reverse=True)
|
||||
flaves = flaves + self.negative.labels
|
||||
return self.chain(
|
||||
image_features,
|
||||
flaves,
|
||||
max_count=max_flavors,
|
||||
reverse=True,
|
||||
desc='Negative chain')
|
||||
|
||||
def interrogate(self,
|
||||
image: Image,
|
||||
min_flavors: int = 8,
|
||||
max_flavors: int = 32,
|
||||
caption: Optional[str] = None) -> str:
|
||||
caption = caption or self.generate_caption(image)
|
||||
image_features = self.image_to_features(image)
|
||||
|
||||
merged = _merge_tables([
|
||||
self.artists, self.flavors, self.mediums, self.movements,
|
||||
self.trendings
|
||||
], self)
|
||||
flaves = merged.rank(image_features,
|
||||
self.config.flavor_intermediate_count)
|
||||
best_prompt, best_sim = caption, self.similarity(
|
||||
image_features, caption)
|
||||
best_prompt = self.chain(
|
||||
image_features,
|
||||
flaves,
|
||||
best_prompt,
|
||||
best_sim,
|
||||
min_count=min_flavors,
|
||||
max_count=max_flavors,
|
||||
desc='Flavor chain')
|
||||
|
||||
fast_prompt = self.interrogate_fast(
|
||||
image, max_flavors, caption=caption)
|
||||
classic_prompt = self.interrogate_classic(
|
||||
image, max_flavors, caption=caption)
|
||||
candidates = [caption, classic_prompt, fast_prompt, best_prompt]
|
||||
return candidates[np.argmax(
|
||||
self.similarities(image_features, candidates))]
|
||||
|
||||
def rank_top(self,
|
||||
image_features: torch.Tensor,
|
||||
text_array: List[str],
|
||||
reverse: bool = False) -> str:
|
||||
self._prepare_clip()
|
||||
text_tokens = self.tokenize([text
|
||||
for text in text_array]).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = self.clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
similarity = text_features @ image_features.T
|
||||
if reverse:
|
||||
similarity = -similarity
|
||||
return text_array[similarity.argmax().item()]
|
||||
|
||||
def similarity(self, image_features: torch.Tensor, text: str) -> float:
|
||||
self._prepare_clip()
|
||||
text_tokens = self.tokenize([text]).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = self.clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
similarity = text_features @ image_features.T
|
||||
return similarity[0][0].item()
|
||||
|
||||
def similarities(self, image_features: torch.Tensor,
|
||||
text_array: List[str]) -> List[float]:
|
||||
self._prepare_clip()
|
||||
text_tokens = self.tokenize([text
|
||||
for text in text_array]).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = self.clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
similarity = text_features @ image_features.T
|
||||
return similarity.T[0].tolist()
|
||||
|
||||
def _prepare_caption(self):
|
||||
if self.config.clip_offload and not self.clip_offloaded:
|
||||
self.clip_model = self.clip_model.to('cpu')
|
||||
self.clip_offloaded = True
|
||||
if self.caption_offloaded:
|
||||
self.caption_model = self.caption_model.to(self.device)
|
||||
self.caption_offloaded = False
|
||||
|
||||
def _prepare_clip(self):
|
||||
if self.config.caption_offload and not self.caption_offloaded:
|
||||
self.caption_model = self.caption_model.to('cpu')
|
||||
self.caption_offloaded = True
|
||||
if self.clip_offloaded:
|
||||
self.clip_model = self.clip_model.to(self.device)
|
||||
self.clip_offloaded = False
|
||||
|
||||
|
||||
class LabelTable():
|
||||
|
||||
def __init__(self, labels: List[str], desc: str, ci: Interrogator):
|
||||
clip_model, config = ci.clip_model, ci.config
|
||||
self.chunk_size = config.chunk_size
|
||||
self.config = config
|
||||
self.device = config.device
|
||||
self.embeds = []
|
||||
self.labels = labels
|
||||
self.tokenize = ci.tokenize
|
||||
|
||||
hash = hashlib.sha256(','.join(labels).encode()).hexdigest()
|
||||
sanitized_name = self.config.clip_model_name.replace('/', '_').replace(
|
||||
'@', '_')
|
||||
self._load_cached(desc, hash, sanitized_name)
|
||||
|
||||
if len(self.labels) != len(self.embeds):
|
||||
self.embeds = []
|
||||
chunks = np.array_split(
|
||||
self.labels, max(1,
|
||||
len(self.labels) / config.chunk_size))
|
||||
for chunk in tqdm(
|
||||
chunks,
|
||||
desc=f'Preprocessing {desc}' if desc else None,
|
||||
disable=self.config.quiet):
|
||||
text_tokens = self.tokenize(chunk).to(self.device)
|
||||
with torch.no_grad(), torch.cuda.amp.autocast():
|
||||
text_features = clip_model.encode_text(text_tokens)
|
||||
text_features /= text_features.norm(dim=-1, keepdim=True)
|
||||
text_features = text_features.half().cpu().numpy()
|
||||
for i in range(text_features.shape[0]):
|
||||
self.embeds.append(text_features[i])
|
||||
|
||||
if desc and self.config.cache_path:
|
||||
os.makedirs(self.config.cache_path, exist_ok=True)
|
||||
cache_filepath = os.path.join(
|
||||
self.config.cache_path,
|
||||
f'{sanitized_name}_{desc}.safetensors')
|
||||
tensors = {
|
||||
'embeds': np.stack(self.embeds),
|
||||
'hash': np.array([ord(c) for c in hash], dtype=np.int8)
|
||||
}
|
||||
save_file(tensors, cache_filepath)
|
||||
|
||||
if self.device == 'cpu' or self.device == torch.device('cpu'):
|
||||
self.embeds = [e.astype(np.float32) for e in self.embeds]
|
||||
|
||||
def _load_cached(self, desc: str, hash: str, sanitized_name: str) -> bool:
|
||||
if self.config.cache_path is None or desc is None:
|
||||
return False
|
||||
|
||||
cached_safetensors = os.path.join(
|
||||
self.config.cache_path, f'{sanitized_name}_{desc}.safetensors')
|
||||
|
||||
if os.path.exists(cached_safetensors):
|
||||
try:
|
||||
tensors = load_file(cached_safetensors)
|
||||
except Exception as e:
|
||||
print(f'Failed to load {cached_safetensors}')
|
||||
print(e)
|
||||
return False
|
||||
if 'hash' in tensors and 'embeds' in tensors:
|
||||
if np.array_equal(
|
||||
tensors['hash'],
|
||||
np.array([ord(c) for c in hash], dtype=np.int8)):
|
||||
self.embeds = tensors['embeds']
|
||||
if len(self.embeds.shape) == 2:
|
||||
self.embeds = [
|
||||
self.embeds[i] for i in range(self.embeds.shape[0])
|
||||
]
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _rank(self,
|
||||
image_features: torch.Tensor,
|
||||
text_embeds: torch.Tensor,
|
||||
top_count: int = 1,
|
||||
reverse: bool = False) -> str:
|
||||
top_count = min(top_count, len(text_embeds))
|
||||
text_embeds = torch.stack([torch.from_numpy(t)
|
||||
for t in text_embeds]).to(self.device)
|
||||
with torch.cuda.amp.autocast():
|
||||
similarity = image_features @ text_embeds.T
|
||||
if reverse:
|
||||
similarity = -similarity
|
||||
_, top_labels = similarity.float().cpu().topk(top_count, dim=-1)
|
||||
return [top_labels[0][i].numpy() for i in range(top_count)]
|
||||
|
||||
def rank(self,
|
||||
image_features: torch.Tensor,
|
||||
top_count: int = 1,
|
||||
reverse: bool = False) -> List[str]:
|
||||
if len(self.labels) <= self.chunk_size:
|
||||
tops = self._rank(
|
||||
image_features,
|
||||
self.embeds,
|
||||
top_count=top_count,
|
||||
reverse=reverse)
|
||||
return [self.labels[i] for i in tops]
|
||||
|
||||
num_chunks = int(math.ceil(len(self.labels) / self.chunk_size))
|
||||
keep_per_chunk = int(self.chunk_size / num_chunks)
|
||||
|
||||
top_labels, top_embeds = [], []
|
||||
for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet):
|
||||
start = chunk_idx * self.chunk_size
|
||||
stop = min(start + self.chunk_size, len(self.embeds))
|
||||
tops = self._rank(
|
||||
image_features,
|
||||
self.embeds[start:stop],
|
||||
top_count=keep_per_chunk,
|
||||
reverse=reverse)
|
||||
top_labels.extend([self.labels[start + i] for i in tops])
|
||||
top_embeds.extend([self.embeds[start + i] for i in tops])
|
||||
|
||||
tops = self._rank(image_features, top_embeds, top_count=top_count)
|
||||
return [top_labels[i] for i in tops]
|
||||
|
||||
|
||||
def _download_file(url: str,
|
||||
filepath: str,
|
||||
chunk_size: int = 4 * 1024 * 1024,
|
||||
quiet: bool = False):
|
||||
r = requests.get(url, stream=True)
|
||||
if r.status_code != 200:
|
||||
return
|
||||
|
||||
file_size = int(r.headers.get('Content-Length', 0))
|
||||
filename = url.split('/')[-1]
|
||||
progress = tqdm(
|
||||
total=file_size,
|
||||
unit='B',
|
||||
unit_scale=True,
|
||||
desc=filename,
|
||||
disable=quiet)
|
||||
with open(filepath, 'wb') as f:
|
||||
for chunk in r.iter_content(chunk_size=chunk_size):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
progress.update(len(chunk))
|
||||
progress.close()
|
||||
|
||||
|
||||
def _merge_tables(tables: List[LabelTable], ci: Interrogator) -> LabelTable:
|
||||
m = LabelTable([], None, ci)
|
||||
for table in tables:
|
||||
m.labels.extend(table.labels)
|
||||
m.embeds.extend(table.embeds)
|
||||
return m
|
||||
|
||||
|
||||
def _prompt_at_max_len(text: str, tokenize) -> bool:
|
||||
tokens = tokenize([text])
|
||||
return tokens[0][-1] != 0
|
||||
|
||||
|
||||
def _truncate_to_fit(text: str, tokenize) -> str:
|
||||
parts = text.split(', ')
|
||||
new_text = parts[0]
|
||||
for part in parts[1:]:
|
||||
if _prompt_at_max_len(new_text + part, tokenize):
|
||||
break
|
||||
new_text += ', ' + part
|
||||
return new_text
|
||||
|
||||
|
||||
def list_caption_models() -> List[str]:
|
||||
return list(CAPTION_MODELS.keys())
|
||||
|
||||
|
||||
def list_clip_models() -> List[str]:
|
||||
return ['/'.join(x) for x in open_clip.list_pretrained()]
|
||||
|
||||
|
||||
def load_list(data_path: str, filename: Optional[str] = None) -> List[str]:
|
||||
"""Load a list of strings from a file."""
|
||||
if filename is not None:
|
||||
data_path = os.path.join(data_path, filename)
|
||||
with open(data_path, 'r', encoding='utf-8', errors='replace') as f:
|
||||
items = [line.strip() for line in f.readlines()]
|
||||
return items
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.image_captioning, module_name=Models.clip_interrogator)
|
||||
class CLIP_Interrogator(TorchModel):
|
||||
|
||||
def __init__(self, model_dir, device='cuda', device_id=0, *args, **kwargs):
|
||||
super().__init__(
|
||||
model_dir=model_dir, device_id=device_id, *args, **kwargs)
|
||||
self.device = device
|
||||
self.dtype = torch.float16 if self.device == 'cuda' else torch.float32
|
||||
cf = Config(clip_model_name='ViT-L-14/openai')
|
||||
cf.data_path = os.path.join(model_dir, 'data')
|
||||
cf.clip_model_path = model_dir
|
||||
cf.cache_path = model_dir
|
||||
self.ci = Interrogator(cf)
|
||||
|
||||
def forward(self, inputs):
|
||||
image = transforms.ToPILImage()(inputs)
|
||||
return {'caption': self.ci.interrogate(image)}
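A minimal usage sketch for the registered wrapper above (assumptions: model_dir points to a local copy of the model assets containing the data/ word lists and cached embeddings; 'example.jpg' and the directory name are placeholders):

from PIL import Image
from torchvision.transforms.functional import to_tensor

model = CLIP_Interrogator(model_dir='path/to/clip_interrogator_assets', device='cuda')
img = to_tensor(Image.open('example.jpg').convert('RGB'))   # C x H x W float tensor
print(model.forward(img)['caption'])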
|
||||
@@ -128,13 +128,13 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
|
||||
local_transform,
|
||||
s=None,
|
||||
e=None):
|
||||
video_mask = np.zeros(self.max_frames, dtype=np.long)
|
||||
video_mask = np.zeros(self.max_frames, dtype=int)
|
||||
max_video_length = 0
|
||||
|
||||
# T x 3 x H x W
|
||||
video = np.zeros((self.max_frames, 3, rawVideoExtractor.size,
|
||||
rawVideoExtractor.size),
|
||||
dtype=np.float)
|
||||
dtype=float)
|
||||
|
||||
if s is None:
|
||||
start_time, end_time = None, None
|
||||
|
||||
modelscope/models/multi_modal/mplug_owl/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Authors.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .configuration_mplug_owl import (MplugOwlConfig, MplugOwlVisionConfig,
|
||||
MplugOwlVisualAbstractorConfig)
|
||||
from .modeling_mplug_owl import MplugOwlForConditionalGeneration
|
||||
@@ -0,0 +1,257 @@
|
||||
# Copyright 2021-2023 The Alibaba DAMO mPLUG Team Authors.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" MPLUG OWL model configuration """
|
||||
import copy
|
||||
import os
|
||||
from typing import Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.models.auto import CONFIG_MAPPING
|
||||
from transformers.utils import logging
|
||||
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
logger = logging.get_logger()
|
||||
|
||||
|
||||
class MplugOwlVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
Args:
|
||||
hidden_size (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
intermediate_size (`int`, *optional*, defaults to 3072):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
image_size (`int`, *optional*, defaults to 224):
|
||||
The size (resolution) of each image.
|
||||
patch_size (`int`, *optional*, defaults to 32):
|
||||
The size (resolution) of each patch.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||
The epsilon used by the layer normalization layers.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
initializer_factor (`float`, *optional*, defaults to 1):
|
||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||
testing).
|
||||
```"""
|
||||
|
||||
model_type = 'mplug_owl_vision_model'
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=1024,
|
||||
intermediate_size=4096,
|
||||
projection_dim=768,
|
||||
num_hidden_layers=24,
|
||||
num_attention_heads=16,
|
||||
num_channels=3,
|
||||
image_size=224,
|
||||
patch_size=14,
|
||||
hidden_act='quick_gelu',
|
||||
layer_norm_eps=1e-6,
|
||||
attention_dropout=0.0,
|
||||
initializer_range=0.02,
|
||||
initializer_factor=1.0,
|
||||
use_flash_attn=False,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.projection_dim = projection_dim
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
self.use_flash_attn = use_flash_attn
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||
os.PathLike],
|
||||
**kwargs) -> 'PretrainedConfig':
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the vision config dict if we are loading from MplugOwlConfig
|
||||
if config_dict.get('model_type') == 'mplug_owl':
|
||||
config_dict = config_dict['vision_config']
|
||||
|
||||
if 'model_type' in config_dict and hasattr(
|
||||
cls,
|
||||
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class MplugOwlVisualAbstractorConfig(PretrainedConfig):
|
||||
|
||||
model_type = 'MPlugOwlVisualAbstractor'
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=1024,
|
||||
num_hidden_layers=6,
|
||||
num_attention_heads=16,
|
||||
intermediate_size=4096,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-6,
|
||||
encoder_hidden_size=1024,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.encoder_hidden_size = encoder_hidden_size
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
|
||||
os.PathLike],
|
||||
**kwargs) -> 'PretrainedConfig':
|
||||
config_dict, kwargs = cls.get_config_dict(
|
||||
pretrained_model_name_or_path, **kwargs)
|
||||
|
||||
# get the qformer config dict if we are loading from MplugOwlConfig
|
||||
if config_dict.get('model_type') == 'mplug_owl':
|
||||
config_dict = config_dict['abstractor_config']
|
||||
|
||||
if 'model_type' in config_dict and hasattr(
|
||||
cls,
|
||||
'model_type') and config_dict['model_type'] != cls.model_type:
|
||||
logger.warning(
|
||||
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
|
||||
f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
|
||||
)
|
||||
|
||||
return cls.from_dict(config_dict, **kwargs)
|
||||
|
||||
|
||||
class MplugOwlConfig(PretrainedConfig):
|
||||
r"""
|
||||
Args:
|
||||
vision_config (`dict`, *optional*):
|
||||
Dictionary of configuration options used to initialize [`MplugOwlVisionConfig`].
|
||||
qformer_config (`dict`, *optional*):
|
||||
Dictionary of configuration options used to initialize [`MplugOwlVisualAbstractorConfig`].
|
||||
text_config (`dict`, *optional*):
|
||||
Dictionary of configuration options used to initialize any [`PretrainedConfig`].
|
||||
num_query_tokens (`int`, *optional*, defaults to 32):
|
||||
The number of query tokens passed through the Transformer.
|
||||
|
||||
kwargs (*optional*):
|
||||
Dictionary of keyword arguments.
|
||||
"""
|
||||
|
||||
model_type = 'mplug_owl'
|
||||
is_composition = True
|
||||
|
||||
def __init__(self,
|
||||
task=Tasks.multimodal_dialogue,
|
||||
vision_config=None,
|
||||
visual_abstractor_config=None,
|
||||
text_config=None,
|
||||
num_query_tokens=64,
|
||||
**kwargs):
|
||||
|
||||
super().__init__(**kwargs)
|
||||
self.task = task
|
||||
if vision_config is None:
|
||||
vision_config = MplugOwlVisionConfig().to_dict()
|
||||
logger.info('vision_config is None.')
|
||||
|
||||
if visual_abstractor_config is None:
|
||||
visual_abstractor_config = {}
|
||||
logger.info('abstractor_config is None. ')
|
||||
|
||||
if text_config is None:
|
||||
# we use LLAMA 7b by default
|
||||
from transformers.models.llama.configuration_llama import \
|
||||
LlamaConfig
|
||||
text_config = LlamaConfig(pad_token_id=2).to_dict()
|
||||
logger.info('text_config is None.')
|
||||
|
||||
self.vision_config = MplugOwlVisionConfig(**vision_config)
|
||||
self.visual_abstractor_config = MplugOwlVisualAbstractorConfig(
|
||||
**visual_abstractor_config)
|
||||
text_model_type = text_config[
|
||||
'model_type'] if 'model_type' in text_config else 'llama'
|
||||
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
|
||||
|
||||
self.tie_word_embeddings = self.text_config.tie_word_embeddings
|
||||
|
||||
self.num_query_tokens = num_query_tokens
|
||||
self.initializer_factor = 1.0
|
||||
self.initializer_range = 0.02
|
||||
|
||||
@classmethod
|
||||
def from_vision_abstractor_text_configs(
|
||||
cls,
|
||||
vision_config: MplugOwlVisionConfig,
|
||||
visual_abstractor_config: MplugOwlVisualAbstractorConfig,
|
||||
text_config: PretrainedConfig,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Returns:
|
||||
[`MplugOwlConfig`]: An instance of a configuration object
|
||||
"""
|
||||
|
||||
return cls(
|
||||
vision_config=vision_config.to_dict(),
|
||||
visual_abstractor_config=visual_abstractor_config.to_dict(),
|
||||
text_config=text_config.to_dict(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
"""
|
||||
Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
|
||||
|
||||
Returns:
|
||||
`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
|
||||
"""
|
||||
output = copy.deepcopy(self.__dict__)
|
||||
output['vision_config'] = self.vision_config.to_dict()
|
||||
tmp = self.visual_abstractor_config.to_dict()
|
||||
output['visual_abstractor_config'] = tmp
|
||||
output['text_config'] = self.text_config.to_dict()
|
||||
output['model_type'] = self.__class__.model_type
|
||||
return output
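A short sketch of composing the full configuration from the three sub-configurations defined above (a hypothetical illustration; pretrained checkpoints would normally be loaded via from_pretrained instead):

from transformers.models.llama.configuration_llama import LlamaConfig

vision_cfg = MplugOwlVisionConfig(image_size=224, patch_size=14)
abstractor_cfg = MplugOwlVisualAbstractorConfig(num_hidden_layers=6)
text_cfg = LlamaConfig(pad_token_id=2)

config = MplugOwlConfig.from_vision_abstractor_text_configs(
    vision_config=vision_cfg,
    visual_abstractor_config=abstractor_cfg,
    text_config=text_cfg,
    num_query_tokens=64)
print(config.to_dict()['model_type'])    # 'mplug_owl'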
|
||||
modelscope/models/multi_modal/mplug_owl/modeling_mplug_owl.py (new file, 1551 lines; diff suppressed because it is too large)
@@ -212,10 +212,10 @@ class ConstructBlockStrategy:
|
||||
block_spans,
|
||||
rng,
|
||||
task='bert'):
|
||||
position_ids = np.arange(len(tokens), dtype=np.long)
|
||||
position_ids = np.arange(len(tokens), dtype=int)
|
||||
targets = copy.deepcopy(tokens)
|
||||
mask_id = self.tokenizer.get_command('MASK').Id
|
||||
mlm_masks = np.zeros(len(tokens), dtype=np.long)
|
||||
mlm_masks = np.zeros(len(tokens), dtype=int)
|
||||
for start, end in block_spans:
|
||||
for idx in range(start, end):
|
||||
tokens[idx] = mask_id
|
||||
@@ -231,7 +231,7 @@ class ConstructBlockStrategy:
|
||||
rng,
|
||||
task='bert'):
|
||||
text_length = len(tokens)
|
||||
position_ids = np.ones(len(tokens), dtype=np.long)
|
||||
position_ids = np.ones(len(tokens), dtype=int)
|
||||
for start, end in block_spans:
|
||||
position_ids[start + 1:end] = 0
|
||||
position_ids = np.cumsum(position_ids) - 1
|
||||
@@ -270,7 +270,7 @@ class ConstructBlockStrategy:
|
||||
(end - start + 1))
|
||||
if self.block_position_encoding:
|
||||
target_block_position_ids.append(
|
||||
np.arange(1, end - start + 2, dtype=np.long))
|
||||
np.arange(1, end - start + 2, dtype=int))
|
||||
else:
|
||||
target_block_position_ids.append([1] * (end - start + 1))
|
||||
block_spans.sort(key=lambda x: x[0])
|
||||
@@ -307,7 +307,7 @@ class ConstructBlockStrategy:
|
||||
target_tokens = target_tokens + [
|
||||
self.tokenizer.get_command('eop').Id
|
||||
]
|
||||
loss_masks = np.ones(len(target_tokens), dtype=np.long)
|
||||
loss_masks = np.ones(len(target_tokens), dtype=int)
|
||||
return source_tokens, target_tokens, loss_masks
|
||||
else:
|
||||
tokens = np.concatenate(source_tokens + target_tokens)
|
||||
@@ -326,12 +326,12 @@ class ConstructBlockStrategy:
|
||||
for pos in mask_pos:
|
||||
tokens[pos] = self.tokenizer.get_command('dBLOCK').Id
|
||||
targets = np.concatenate(source_tokens + targets)
|
||||
loss_masks = np.ones(len(tokens), dtype=np.long)
|
||||
loss_masks = np.ones(len(tokens), dtype=int)
|
||||
loss_masks[:source_length] = 0
|
||||
position_ids = np.concatenate(source_position_ids
|
||||
+ target_position_ids)
|
||||
block_position_ids = np.concatenate(
|
||||
[np.zeros(source_length, dtype=np.long)]
|
||||
[np.zeros(source_length, dtype=int)]
|
||||
+ target_block_position_ids)
|
||||
position_ids = np.stack([position_ids, block_position_ids], axis=0)
|
||||
if attention_mask is not None:
|
||||
@@ -539,22 +539,21 @@ class ConstructBlockStrategy:
|
||||
(source_tokens, [self.generation_mask], target_tokens))
|
||||
loss_masks = np.concatenate(
|
||||
(np.zeros(len(source_tokens) + 1,
|
||||
dtype=np.long), target_masks))
|
||||
dtype=int), target_masks))
|
||||
token_batch.append(tokens)
|
||||
target_batch.append(targets)
|
||||
loss_mask_batch.append(loss_masks)
|
||||
position_ids = np.arange(
|
||||
len(source_tokens) + len(target_tokens) + 1,
|
||||
dtype=np.long)
|
||||
len(source_tokens) + len(target_tokens) + 1, dtype=int)
|
||||
position_ids[len(source_tokens) + 1:] = len(source_tokens)
|
||||
if self.block_position_encoding:
|
||||
block_position_ids = np.concatenate(
|
||||
(np.zeros(len(source_tokens), dtype=np.long),
|
||||
np.arange(len(target_tokens) + 1, dtype=np.long)))
|
||||
(np.zeros(len(source_tokens), dtype=int),
|
||||
np.arange(len(target_tokens) + 1, dtype=int)))
|
||||
else:
|
||||
block_position_ids = np.concatenate(
|
||||
(np.zeros(len(source_tokens) + 1, dtype=np.long),
|
||||
np.ones(len(target_tokens) + 1, dtype=np.long)))
|
||||
(np.zeros(len(source_tokens) + 1, dtype=int),
|
||||
np.ones(len(target_tokens) + 1, dtype=int)))
|
||||
position_id_batch.append(
|
||||
np.stack([position_ids, block_position_ids], axis=0))
|
||||
else:
|
||||
@@ -597,27 +596,25 @@ class ConstructBlockStrategy:
max_length = max(seq_lengths)
token_batch = [
np.concatenate(
(tokens, np.zeros(max_length - len(tokens),
dtype=np.long)))
(tokens, np.zeros(max_length - len(tokens), dtype=int)))
for tokens in token_batch
]
target_batch = [
np.concatenate(
(targets,
np.zeros(max_length - len(targets), dtype=np.long)))
(targets, np.zeros(max_length - len(targets), dtype=int)))
for targets in target_batch
]
loss_mask_batch = [
np.concatenate(
(loss_masks,
np.zeros(max_length - len(loss_masks), dtype=np.long)))
np.zeros(max_length - len(loss_masks), dtype=int)))
for loss_masks in loss_mask_batch
]
position_id_batch = [
np.concatenate((position_ids,
np.concatenate(
(position_ids,
np.zeros(
(2, max_length - position_ids.shape[1]),
dtype=np.long)),
(2, max_length - position_ids.shape[1]), dtype=int)),
axis=1) for position_ids in position_id_batch
]
return token_batch, target_batch, loss_mask_batch, position_id_batch
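The hunk above right-pads every per-sample array (tokens, targets, loss masks, 2D position ids) to the longest sequence in the batch before collation. A hedged, self-contained sketch of the same padding idea (the helper is hypothetical, not the repository's API):

import numpy as np

def pad_batch(batch, pad_value=0):
    # Right-pad 1-D integer arrays to the length of the longest element.
    max_length = max(len(x) for x in batch)
    return np.stack([
        np.concatenate((x, np.full(max_length - len(x), pad_value, dtype=int)))
        for x in batch
    ])

print(pad_batch([np.array([1, 2, 3]), np.array([4, 5])]))
# [[1 2 3]
#  [4 5 0]]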
@@ -583,8 +583,8 @@ class XLDataset(data.Dataset):
def getidx(self, idx):
tokens, targets, loss_masks = [], [], []
attention_mask = np.concatenate(
(np.zeros((self.max_seq_len, self.mem_len), dtype=np.long),
np.ones((self.max_seq_len, self.max_seq_len), dtype=np.long)),
(np.zeros((self.max_seq_len, self.mem_len), dtype=int),
np.ones((self.max_seq_len, self.max_seq_len), dtype=int)),
axis=1)
sample_idx = bisect_right(self.indices, idx * self.max_seq_len)
last_end = 0 if sample_idx == 0 else self.indices[sample_idx - 1]
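The attention_mask built here concatenates a zero block over the cached memory segment with a ones block over the current segment, giving a (max_seq_len, mem_len + max_seq_len) matrix in the Transformer-XL layout. A minimal sketch of the resulting shape (values are illustrative):

import numpy as np

max_seq_len, mem_len = 4, 2
attention_mask = np.concatenate(
    (np.zeros((max_seq_len, mem_len), dtype=int),
     np.ones((max_seq_len, max_seq_len), dtype=int)),
    axis=1)
print(attention_mask.shape)  # (4, 6): mem_len columns of zeros, max_seq_len columns of ones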
@@ -28,7 +28,7 @@ def main():
counts = np.array([0] * 10)
for _ in range(10000):
spans = strategy.sample_span_in_document(
np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=np.long), [1, 1],
np.array([1, 2, 3, 0, 4, 5, 6, 7, 9, 0], dtype=int), [1, 1],
random.Random())
for start, end in spans:
counts[start:end] += 1
@@ -17,7 +17,7 @@ def main():
num_iters=300000,
decay_style='cosine',
decay_ratio=0.1)
steps = np.arange(0, 400000, 10, dtype=np.long)
steps = np.arange(0, 400000, 10, dtype=int)
rates = []
for step in steps:
lr_scheduler.num_iters = step
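This test sweeps the scheduler's step counter over a grid and records the learning rate at each point. A self-contained sketch of the same probing pattern with a plain warmup-plus-cosine schedule (the formula is a generic stand-in, not the repository's scheduler):

import math

def cosine_lr(step, base_lr=1e-4, warmup=3000, total=300000, decay_ratio=0.1):
    # Linear warmup, then cosine decay down to decay_ratio * base_lr.
    if step < warmup:
        return base_lr * step / warmup
    progress = min(1.0, (step - warmup) / (total - warmup))
    floor = base_lr * decay_ratio
    return floor + 0.5 * (base_lr - floor) * (1 + math.cos(math.pi * progress))

rates = [cosine_lr(step) for step in range(0, 400000, 10)]
print(min(rates), max(rates))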
@@ -5,12 +5,12 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .configuration_unite import UniTEConfig
from .modeling_unite import UniTEForTranslationEvaluation
from .configuration import UniTEConfig
from .translation_evaluation import UniTEForTranslationEvaluation
else:
_import_structure = {
'configuration_unite': ['UniTEConfig'],
'modeling_unite': ['UniTEForTranslationEvaluation'],
'configuration': ['UniTEConfig'],
'translation_evaluation': ['UniTEForTranslationEvaluation'],
}

import sys
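This hunk renames the UniTE submodules (configuration_unite to configuration, modeling_unite to translation_evaluation) in both branches of the lazy-import pattern: real imports for type checkers, and a string map handed to LazyImportModule at runtime. A sketch of what such an __init__.py roughly looks like; the exact LazyImportModule argument list is assumed from other modelscope packages and should be checked against the file itself:

import sys
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    # Static type checkers resolve the real symbols.
    from .configuration import UniTEConfig
    from .translation_evaluation import UniTEForTranslationEvaluation
else:
    # At runtime the submodules are imported lazily on first attribute access.
    _import_structure = {
        'configuration': ['UniTEConfig'],
        'translation_evaluation': ['UniTEForTranslationEvaluation'],
    }
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )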
@@ -9,7 +9,7 @@ from modelscope.utils.config import Config
logger = logging.get_logger()


class EvaluationMode(Enum):
class InputFormat(Enum):
SRC = 'src'
REF = 'ref'
SRC_REF = 'src-ref'
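The enum is renamed from EvaluationMode to InputFormat; its members say which fields accompany a hypothesis when it is scored (source only, reference only, or both). A small, hedged usage sketch (the mapping below is illustrative, not the model's code):

from enum import Enum

class InputFormat(Enum):
    SRC = 'src'
    REF = 'ref'
    SRC_REF = 'src-ref'

def required_fields(fmt: InputFormat):
    # Illustrative: which inputs a scoring request needs for each format.
    return {
        InputFormat.SRC: ('hyp', 'src'),
        InputFormat.REF: ('hyp', 'ref'),
        InputFormat.SRC_REF: ('hyp', 'src', 'ref'),
    }[fmt]

print(required_fields(InputFormat.SRC_REF))  # ('hyp', 'src', 'ref')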
@@ -20,6 +20,8 @@ from transformers.activations import ACT2FN
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.nlp.unite.configuration import InputFormat
from modelscope.outputs.nlp_outputs import TranslationEvaluationOutput
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
@@ -71,8 +73,16 @@ class LayerwiseAttention(Module):
mask: torch.Tensor = None,
) -> torch.Tensor:
tensors = torch.cat(list(x.unsqueeze(dim=0) for x in tensors), dim=0)

if self.training and self.dropout:
normed_weights = softmax(
self.scalar_parameters, dim=0).view(-1, 1, 1, 1)
torch.where(self.dropout_mask.uniform_() > self.dropout,
self.scalar_parameters, self.dropout_fill),
dim=-1)
else:
normed_weights = softmax(self.scalar_parameters, dim=-1)

normed_weights = normed_weights.view(-1, 1, 1, 1)

mask_float = mask.float()
weighted_sum = (normed_weights
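The new branch drops out individual layer weights during training (replacing them with a fill value before the softmax), and otherwise softmax-normalizes the learned scalars to mix all encoder hidden states into one representation. A hedged, self-contained sketch of the mixing idea (not the repository's exact module):

import torch
from torch.nn.functional import softmax

num_layers, batch, seq_len, dim = 4, 2, 5, 8
hidden_states = [torch.randn(batch, seq_len, dim) for _ in range(num_layers)]
scalar_parameters = torch.zeros(num_layers, requires_grad=True)

# Stack the per-layer states, softmax the scalars, take a weighted sum over layers.
tensors = torch.stack(hidden_states, dim=0)                  # (L, B, T, D)
normed_weights = softmax(scalar_parameters, dim=-1).view(-1, 1, 1, 1)
mixed = (normed_weights * tensors).sum(dim=0)                # (B, T, D)
print(mixed.shape)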
@@ -266,8 +276,11 @@ class UniTEForTranslationEvaluation(TorchModel):

return

def forward(self, input_sentences: List[torch.Tensor]):
input_ids = self.combine_input_sentences(input_sentences)
def forward(self,
input_ids: torch.Tensor,
input_format: Optional[List[InputFormat]] = None,
score: Optional[torch.Tensor] = None,
**kwargs) -> TranslationEvaluationOutput:
attention_mask = input_ids.ne(self.pad_token_id).long()
outputs = self.encoder(
input_ids=input_ids,
@@ -276,32 +289,48 @@ class UniTEForTranslationEvaluation(TorchModel):
return_dict=True)
mix_states = self.layerwise_attention(outputs['hidden_states'],
attention_mask)
pred = self.estimator(mix_states)
return pred.squeeze(dim=-1)
pred = self.estimator(mix_states).squeeze(dim=-1)
output = TranslationEvaluationOutput(
score=pred.cpu().tolist(), input_format=input_format)

def load_checkpoint(self, path: str, device: torch.device):
if score is not None:
loss = (pred - score).pow(2).mean()
output['loss'] = loss

return output

def load_checkpoint(self, path: str, device: torch.device, plm_only: bool):
if plm_only:
self.encoder = self.encoder.from_pretrained(path).to(device)
self.encoder.pooler = None
else:
state_dict = torch.load(path, map_location=device)
self.load_state_dict(state_dict)
logger.info('Loading checkpoint parameters from %s' % path)
return

def combine_input_sentences(self, input_sent_groups: List[torch.Tensor]):
for input_sent_group in input_sent_groups[1:]:
input_sent_group[:, 0] = self.eos_token_id

if len(input_sent_groups) == 3:
cutted_sents = self.cut_long_sequences3(input_sent_groups)
def combine_input_sentences(all_input_concat: List[List[torch.Tensor]],
maximum_length: int = 512,
pad_idx: int = 1,
eos_idx: int = 2):
for group in all_input_concat[1:]:
group[:, 0] = eos_idx

if len(all_input_concat) == 3:
return cut_long_sequences3(all_input_concat, maximum_length, pad_idx)
else:
cutted_sents = self.cut_long_sequences2(input_sent_groups)
return cutted_sents
return cut_long_sequences2(all_input_concat, maximum_length, pad_idx)

@staticmethod
def cut_long_sequences2(all_input_concat: List[List[torch.Tensor]],

def cut_long_sequences2(all_input_concat: List[List[torch.Tensor]],
maximum_length: int = 512,
pad_idx: int = 1):
all_input_concat = list(zip(*all_input_concat))
collected_tuples = list()
for tensor_tuple in all_input_concat:
tensor_tuple = tuple(
x.masked_select(x.ne(pad_idx)) for x in tensor_tuple)
all_lens = tuple(len(x) for x in tensor_tuple)

if sum(all_lens) > maximum_length:
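With the new signature, forward consumes already-combined input_ids plus an optional gold score and returns a TranslationEvaluationOutput; when score is supplied, a mean-squared-error loss is attached for training. A hedged toy stand-in that mirrors this contract without modelscope (everything below is illustrative, not the repository's class):

import torch

class TinyEvaluator(torch.nn.Module):
    # Toy model mirroring the new forward contract: emit a score list and,
    # when a gold score is passed, an MSE loss alongside it.
    def __init__(self, pad_token_id=1, dim=8):
        super().__init__()
        self.pad_token_id = pad_token_id
        self.embed = torch.nn.Embedding(100, dim, padding_idx=pad_token_id)
        self.estimator = torch.nn.Linear(dim, 1)

    def forward(self, input_ids, score=None):
        mask = input_ids.ne(self.pad_token_id).float().unsqueeze(-1)
        pooled = (self.embed(input_ids) * mask).sum(1) / mask.sum(1)
        pred = self.estimator(pooled).squeeze(dim=-1)
        output = {'score': pred.detach().cpu().tolist()}
        if score is not None:
            output['loss'] = (pred - score).pow(2).mean()
        return output

model = TinyEvaluator()
out = model(torch.tensor([[0, 12, 34, 2, 1, 1]]), score=torch.tensor([0.8]))
print(out['score'], out['loss'].item())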
@@ -315,13 +344,12 @@ class UniTEForTranslationEvaluation(TorchModel):
// 2) and min(all_lens) > offset:
lengths = dict((k, v - offset) for k, v in lengths.items())
else:
lengths[lengths_sorted_idxes[
0]] = maximum_length - lengths[lengths_sorted_idxes[1]]
lengths[lengths_sorted_idxes[0]] = maximum_length - lengths[
lengths_sorted_idxes[1]]

new_lens = list(lengths[k]
for k in range(0, len(tensor_tuple)))
new_tensor_tuple = tuple(
x[:y] for x, y in zip(tensor_tuple, new_lens))
new_lens = list(lengths[k] for k in range(0, len(tensor_tuple)))
new_tensor_tuple = tuple(x[:y]
for x, y in zip(tensor_tuple, new_lens))
for x, y in zip(new_tensor_tuple, tensor_tuple):
x[-1] = y[-1]
collected_tuples.append(new_tensor_tuple)
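cut_long_sequences2/3 strip padding from each hypothesis/source(/reference) tuple and, when the combined length exceeds maximum_length, shrink the longest members while writing each original final token (the EOS) back onto the truncated piece. A simplified, hedged sketch of that truncate-but-keep-EOS policy for a two-segment pair (not the repository's exact length-balancing rules):

import torch

def cut_pair(a: torch.Tensor, b: torch.Tensor, maximum_length: int = 12):
    # Shrink the longer segment one token at a time until the pair fits,
    # restoring that segment's original last token (EOS) after every cut.
    segments = [a.clone(), b.clone()]
    while sum(len(s) for s in segments) > maximum_length:
        longest = max(range(2), key=lambda i: len(segments[i]))
        eos = segments[longest][-1].clone()
        segments[longest] = segments[longest][:-1]
        segments[longest][-1] = eos
    return torch.cat(segments)

print(cut_pair(torch.arange(10), torch.arange(8)))  # 12 tokens, both EOS values kept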
@@ -331,16 +359,17 @@ class UniTEForTranslationEvaluation(TorchModel):
concat_tensor = list(torch.cat(x, dim=0) for x in collected_tuples)
all_input_concat_padded = pad_sequence(
concat_tensor, batch_first=True, padding_value=pad_idx)

return all_input_concat_padded

@staticmethod
def cut_long_sequences3(all_input_concat: List[List[torch.Tensor]],

def cut_long_sequences3(all_input_concat: List[List[torch.Tensor]],
maximum_length: int = 512,
pad_idx: int = 1):
all_input_concat = list(zip(*all_input_concat))
collected_tuples = list()
for tensor_tuple in all_input_concat:
tensor_tuple = tuple(
x.masked_select(x.ne(pad_idx)) for x in tensor_tuple)
all_lens = tuple(len(x) for x in tensor_tuple)

if sum(all_lens) > maximum_length:
@@ -357,9 +386,8 @@ class UniTEForTranslationEvaluation(TorchModel):
while sum(lengths.values()) > maximum_length:
if lengths[lengths_sorted_idxes[0]] > lengths[
lengths_sorted_idxes[1]]:
offset = maximum_length - lengths[
lengths_sorted_idxes[1]] - lengths[
lengths_sorted_idxes[2]]
offset = maximum_length - lengths[lengths_sorted_idxes[
1]] - lengths[lengths_sorted_idxes[2]]
if offset > lengths[lengths_sorted_idxes[1]]:
lengths[lengths_sorted_idxes[0]] = offset
else:
@@ -380,12 +408,11 @@ class UniTEForTranslationEvaluation(TorchModel):
else:
lengths[lengths_sorted_idxes[0]] = lengths[
lengths_sorted_idxes[1]] = lengths[
lengths_sorted_idxes[
2]] = maximum_length // 3
lengths_sorted_idxes[2]] = maximum_length // 3

new_lens = list(lengths[k] for k in range(0, len(lengths)))
new_tensor_tuple = tuple(
x[:y] for x, y in zip(tensor_tuple, new_lens))
new_tensor_tuple = tuple(x[:y]
for x, y in zip(tensor_tuple, new_lens))

for x, y in zip(new_tensor_tuple, tensor_tuple):
x[-1] = y[-1]
@@ -396,5 +423,4 @@ class UniTEForTranslationEvaluation(TorchModel):
concat_tensor = list(torch.cat(x, dim=0) for x in collected_tuples)
all_input_concat_padded = pad_sequence(
concat_tensor, batch_first=True, padding_value=pad_idx)

return all_input_concat_padded
Some files were not shown because too many files have changed in this diff.