Format finetune code and add unit test case

行嗔
2022-10-23 21:29:17 +08:00
parent c573625b05
commit 55fb3b05a9
11 changed files with 215 additions and 94 deletions

View File

@@ -377,7 +377,7 @@ class Metrics(object):
audio_noise_metric = 'audio-noise-metric'
# text gen
bleu = 'bleu'
BLEU = 'bleu'
# metrics for image denoise task
image_denoise_metric = 'image-denoise-metric'
@@ -399,6 +399,8 @@ class Metrics(object):
movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
# metric for inpainting task
image_inpainting_metric = 'image-inpainting-metric'
# metric for ocr
NED = 'ned'
class Optimizers(object):
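Side note on the identifiers above: metric classes register under the string value, not the attribute name, so renaming bleu to BLEU leaves the serialized id 'bleu' untouched, and NED ('ned') is the new id for the OCR metric. A minimal standalone sketch of that string-id registration pattern, with illustrative names only:

REGISTRY = {}

def register_metric(name):
    # Decorator that files a metric class under its string id.
    def deco(cls):
        REGISTRY[name] = cls
        return cls
    return deco

class Metrics:
    BLEU = 'bleu'  # attribute renamed; registered id stays 'bleu'
    NED = 'ned'    # new id for the OCR normalized-edit-distance metric

@register_metric(Metrics.BLEU)
class BleuMetric:
    pass

assert REGISTRY['bleu'] is BleuMetric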

View File

@@ -11,7 +11,7 @@ from .builder import METRICS, MetricKeys
EVAL_BLEU_ORDER = 4
@METRICS.register_module(group_key=default_group, module_name=Metrics.bleu)
@METRICS.register_module(group_key=default_group, module_name=Metrics.BLEU)
class BleuMetric(Metric):
"""The metric computation bleu for text generation classes.

View File

@@ -23,6 +23,7 @@ class MetricKeys(object):
BLEU_4 = 'bleu-4'
ROUGE_1 = 'rouge-1'
ROUGE_L = 'rouge-l'
NED = 'ned' # ocr metric
task_default_metrics = {

View File

@@ -16,6 +16,7 @@ from .base import Preprocessor
from .builder import PREPROCESSORS
from .ofa import * # noqa
from .ofa.utils.collate import collate_fn
from .ofa.utils.constant import OFA_TASK_KEY_MAPPING
__all__ = [
'OfaPreprocessor',
@@ -51,24 +52,13 @@ class OfaPreprocessor(Preprocessor):
Tasks.text_summarization: OfaSummarizationPreprocessor,
Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor
}
input_key_mapping = {
Tasks.ocr_recognition: ['image'],
Tasks.image_captioning: ['image'],
Tasks.image_classification: ['image'],
Tasks.text_summarization: ['text'],
Tasks.text_classification: ['text', 'text2'],
Tasks.visual_grounding: ['image', 'text'],
Tasks.visual_question_answering: ['image', 'text'],
Tasks.visual_entailment: ['image', 'text', 'text2'],
Tasks.text_to_image_synthesis: ['text']
}
model_dir = model_dir if osp.exists(model_dir) else snapshot_download(
model_dir)
self.cfg = Config.from_file(
osp.join(model_dir, ModelFile.CONFIGURATION))
self.preprocess = preprocess_mapping[self.cfg.task](
cfg=self.cfg, model_dir=model_dir, mode=mode)
self.keys = input_key_mapping[self.cfg.task]
self.keys = OFA_TASK_KEY_MAPPING[self.cfg.task]
self.tokenizer = self.preprocess.tokenizer
if kwargs.get('no_collate', None):
self.no_collate = True

View File

@@ -6,9 +6,12 @@ from os import path as osp
import json
import numpy as np
import torch
from PIL import Image
from modelscope.models.multi_modal.ofa import OFATokenizer, OFATokenizerZH
from modelscope.preprocessors.image import load_image
from modelscope.utils.trie import Trie
from .utils.constant import OFA_TASK_KEY_MAPPING
from .utils.random_help import set_torch_seed
@@ -59,6 +62,14 @@ class OfaBasePreprocessor:
self.mean = [0.5, 0.5, 0.5]
self.std = [0.5, 0.5, 0.5]
self.patch_image_size = self.cfg.model.get('patch_image_size', 480)
self.column_map = {
key: key
for key in OFA_TASK_KEY_MAPPING[self.cfg.task]
}
if hasattr(self.cfg,
'dataset') and self.cfg.dataset.column_map is not None:
for k, v in self.cfg.dataset.column_map.items():
self.column_map[k] = v
self.transtab = str.maketrans(
{key: None
for key in string.punctuation})
@@ -147,3 +158,8 @@ class OfaBasePreprocessor:
constraint_prefix_token)
constraint_mask[i][constraint_nodes] = True
sample['constraint_mask'] = constraint_mask
def get_img_pil(self, path_or_url_or_pil):
image = path_or_url_or_pil if isinstance(path_or_url_or_pil, Image.Image) \
else load_image(path_or_url_or_pil)
return image
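A hedged sketch of what the new column_map and get_img_pil give the base preprocessor: every expected input key defaults to itself, cfg.dataset.column_map then redirects it to the dataset's real column name, and get_img_pil accepts either a PIL image or anything load_image can open. Standalone approximation; the 'caption' column and file name are assumptions:

from PIL import Image

task_keys = ['image', 'text']                 # e.g. image captioning
column_map = {key: key for key in task_keys}  # identity mapping by default
column_map.update({'text': 'caption'})        # cfg.dataset.column_map override

sample = {'image': 'cat.jpg', 'caption': 'a cat on a sofa'}
target = sample[column_map['text']]           # -> 'a cat on a sofa'

def get_img_pil(path_or_url_or_pil):
    # Pass PIL images through untouched; this sketch only handles local
    # paths, whereas modelscope's load_image also resolves URLs.
    if isinstance(path_or_url_or_pil, Image.Image):
        return path_or_url_or_pil
    return Image.open(path_or_url_or_pil)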

View File

@@ -1,12 +1,9 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Union
from typing import Any, Dict
import torch
from PIL import Image
from torchvision import transforms
from modelscope.preprocessors.image import load_image
from modelscope.utils.constant import ModeKeys
from .base import OfaBasePreprocessor
@@ -46,7 +43,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor):
def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
sample = self._build_infer_sample(data)
target = data['text']
target = data[self.column_map['text']]
target = target.translate(self.transtab).strip()
target_token_list = target.strip().split()
target = ' '.join(target_token_list[:self.max_tgt_length])
@@ -56,8 +53,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor):
return sample
def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
image = data['image'] if isinstance(
data['image'], Image.Image) else load_image(data['image'])
image = self.get_img_pil(data[self.column_map['image']])
patch_image = self.patch_resize_transform(image)
prompt = self.cfg.model.get('prompt', ' what does the image describe?')
inputs = self.tokenize_text(prompt)
@@ -66,6 +62,6 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor):
'patch_image': patch_image,
'patch_mask': torch.tensor([True])
}
if 'text' in data:
sample['label'] = data['text']
if self.column_map['text'] in data:
sample['label'] = data[self.column_map['text']]
return sample

View File

@@ -1,7 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import random
import unicodedata
from typing import Any, Dict, Union
from typing import Any, Dict
import torch
from PIL import Image

View File

@@ -0,0 +1,13 @@
from modelscope.utils.constant import Tasks
OFA_TASK_KEY_MAPPING = {
Tasks.ocr_recognition: ['image'],
Tasks.image_captioning: ['image'],
Tasks.image_classification: ['image'],
Tasks.text_summarization: ['text'],
Tasks.text_classification: ['text', 'text2'],
Tasks.visual_grounding: ['image', 'text'],
Tasks.visual_question_answering: ['image', 'text'],
Tasks.visual_entailment: ['image', 'text', 'text2'],
Tasks.text_to_image_synthesis: ['text']
}
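For orientation, this new constant centralizes the per-task input keys that OfaPreprocessor previously hard-coded; a lookup by task yields the keys used to assemble the input dict. A small usage sketch, with plain strings standing in for the Tasks constants:

OFA_TASK_KEY_MAPPING = {
    'ocr-recognition': ['image'],
    'visual-entailment': ['image', 'text', 'text2'],
}

def build_input(task, *args, **kwargs):
    # Zip positional args onto the task's expected keys, roughly what
    # OfaPreprocessor does with self.keys before preprocessing.
    data = dict(zip(OFA_TASK_KEY_MAPPING[task], args))
    data.update(kwargs)
    return data

print(build_input('visual-entailment', 'img.jpg', 'a premise', text2='a hypothesis'))
# {'image': 'img.jpg', 'text': 'a premise', 'text2': 'a hypothesis'}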

View File

@@ -2,21 +2,27 @@
import math
import os
import shutil
from functools import partial
from typing import Callable, Dict, Optional, Tuple, Union
from datasets import load_dataset
import torch
from torch import distributed as dist
from torch import nn
from torch.utils.data import Dataset
from modelscope.metainfo import Trainers
from modelscope.models.base import Model
from modelscope.models.base import Model, TorchModel
from modelscope.msdatasets.ms_dataset import MsDataset
from modelscope.preprocessors.base import Preprocessor
from modelscope.preprocessors.multi_modal import OfaPreprocessor
from modelscope.preprocessors.ofa.utils.collate import collate_fn
from modelscope.trainers import EpochBasedTrainer
from modelscope.trainers.builder import TRAINERS
from modelscope.trainers.optimizer.builder import build_optimizer
from modelscope.utils.config import Config
from modelscope.utils.constant import ConfigKeys, ModeKeys, ModelFile
from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys,
ModeKeys)
from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion,
get_schedule)
@@ -24,56 +30,100 @@ from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion,
@TRAINERS.register_module(module_name=Trainers.ofa_tasks)
class OFATrainer(EpochBasedTrainer):
def __init__(self, model: str, *args, **kwargs):
model = Model.from_pretrained(model)
def __init__(
self,
model: Optional[Union[TorchModel, nn.Module, str]] = None,
cfg_file: Optional[str] = None,
arg_parse_fn: Optional[Callable] = None,
data_collator: Optional[Union[Callable, Dict[str,
Callable]]] = None,
train_dataset: Optional[Union[MsDataset, Dataset]] = None,
eval_dataset: Optional[Union[MsDataset, Dataset]] = None,
preprocessor: Optional[Union[Preprocessor,
Dict[str, Preprocessor]]] = None,
optimizers: Tuple[torch.optim.Optimizer,
torch.optim.lr_scheduler._LRScheduler] = (None,
None),
model_revision: Optional[str] = DEFAULT_MODEL_REVISION,
seed: int = 42,
**kwargs):
model = Model.from_pretrained(model, revision=model_revision)
model_dir = model.model_dir
cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
cfg = Config.from_file(cfg_file)
dataset = self._build_dataset_with_config(cfg)
preprocessor = {
ConfigKeys.train:
OfaPreprocessor(
model_dir=model_dir, mode=ModeKeys.TRAIN, no_collate=True),
ConfigKeys.val:
OfaPreprocessor(
model_dir=model_dir, mode=ModeKeys.EVAL, no_collate=True),
if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0:
work_dir = cfg.train.work_dir
else:
work_dir = kwargs['work_dir']
tokenizer_files = {
'zh': [
'tokenizer.json', 'tokenizer_config.json', 'vocab.txt',
'config.json'
],
'en':
['tokenizer.json', 'vocab.json', 'merges.txt', 'config.json'],
}
for filename in tokenizer_files[cfg.model.get('language', 'en')]:
finetune_file = os.path.join(work_dir, filename)
pretrain_file = os.path.join(model_dir, filename)
if os.path.exists(finetune_file):
continue
if os.path.exists(pretrain_file):
shutil.copy(pretrain_file, finetune_file)
if preprocessor is None:
preprocessor = {
ConfigKeys.train:
OfaPreprocessor(
model_dir=work_dir, mode=ModeKeys.TRAIN, no_collate=True),
ConfigKeys.val:
OfaPreprocessor(
model_dir=work_dir, mode=ModeKeys.EVAL, no_collate=True),
}
# use torchrun launch
world_size = int(os.environ.get('WORLD_SIZE', 1))
epoch_steps = math.ceil(
len(dataset['train']) / # noqa
len(train_dataset) / # noqa
(cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa
cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs
cfg.train.criterion.tokenizer = model.tokenizer
self.criterion = AdjustLabelSmoothedCrossEntropyCriterion(
cfg.train.criterion)
optimizer = build_optimizer(model, cfg=cfg.train.optimizer)
scheduler_class, scheduler_args = get_schedule(cfg.train.lr_scheduler)
if scheduler_class is not None:
lr_scheduler = scheduler_class(**{'optimizer': optimizer},
**scheduler_args)
if optimizers[0] is None:
optimizer = build_optimizer(model, cfg=cfg.train.optimizer)
else:
lr_scheduler = None
collator = partial(
collate_fn,
pad_idx=model.tokenizer.pad_token_id,
eos_idx=model.tokenizer.eos_token_id,
)
optimizer = optimizers[0]
if optimizers[1] is None:
scheduler_class, scheduler_args = get_schedule(
cfg.train.lr_scheduler)
if scheduler_class is not None:
lr_scheduler = scheduler_class(**{'optimizer': optimizer},
**scheduler_args)
else:
lr_scheduler = None
else:
lr_scheduler = optimizers[1]
optimizers = (optimizer, lr_scheduler)
if data_collator is None:
data_collator = partial(
collate_fn,
pad_idx=model.tokenizer.pad_token_id,
eos_idx=model.tokenizer.eos_token_id,
)
if 'launcher' not in kwargs and cfg.train.get('launcher', None):
kwargs['launcher'] = cfg.train.launcher
if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False):
kwargs['use_fp16'] = cfg.train.use_fp16
kwargs['to_tensor'] = False
super().__init__(
cfg_file=cfg_file,
model=model,
data_collator=collator,
train_dataset=dataset['train'],
eval_dataset=dataset['valid'],
cfg_file=cfg_file,
arg_parse_fn=arg_parse_fn,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
preprocessor=preprocessor,
optimizers=(optimizer, lr_scheduler),
work_dir=cfg.train.work_dir,
*args,
optimizers=optimizers,
seed=seed,
**kwargs,
)
@@ -102,24 +152,3 @@ class OFATrainer(EpochBasedTrainer):
else:
self.log_buffer.update(train_outputs['log_vars'])
self.train_outputs = train_outputs
def _build_dataset_with_config(self, cfg):
if hasattr(cfg.dataset, 'hf_dataset'):
dataset = load_dataset(
cfg.dataset.script,
data_files=cfg.dataset.hf_dataset,
sep=cfg.dataset.sep,
)
dataset = MsDataset.from_hf_dataset(
dataset.rename_columns(cfg.dataset.column_map))
return dataset
elif hasattr(cfg.dataset, 'ms_dataset'):
dataset_d = dict()
for key in cfg.dataset.ms_dataset.keys():
dataset_d[key] = MsDataset.load(**cfg.dataset.ms_dataset[key])
dataset_d[key] = MsDataset.from_hf_dataset(
dataset_d[key]._hf_ds.rename_columns(
cfg.dataset.column_map))
return dataset_d
else:
raise NotImplementedError
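A hedged usage sketch of the reworked constructor: datasets, preprocessors, optimizers, and the data collator are now injected by the caller, and the in-trainer defaults only apply when an argument is None. The model id, dataset, and work_dir below are placeholders mirroring the unit test further down:

from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer

trainer = build_trainer(
    name=Trainers.ofa_tasks,
    default_args=dict(
        model='damo/ofa_image-caption_coco_large_en',  # placeholder model id
        work_dir='./work/ofa_caption',
        train_dataset=MsDataset.load(
            'coco_2014_caption', namespace='modelscope', split='train[:100]'),
        eval_dataset=MsDataset.load(
            'coco_2014_caption', namespace='modelscope', split='validation[:20]'),
        # optimizers=(optimizer, lr_scheduler) would bypass the built-in builders
    ))
trainer.train()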

View File

@@ -282,6 +282,7 @@ class ConfigKeys(object):
"""Fixed keywords in configuration file"""
train = 'train'
val = 'val'
test = 'test'
class Requirements(object):

View File

@@ -5,27 +5,102 @@ import os.path as osp
import shutil
import unittest
from modelscope.metainfo import Trainers
import json
from modelscope.metainfo import Metrics, Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import ModelFile
from modelscope.utils.test_utils import test_level
class TestOfaTrainer(unittest.TestCase):
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_trainer(self):
os.environ['LOCAL_RANK'] = '0'
model_id = 'damo/ofa_text-classification_mnli_large_en'
default_args = {'model': model_id}
trainer = build_trainer(
name=Trainers.ofa_tasks, default_args=default_args)
os.makedirs(trainer.work_dir, exist_ok=True)
def setUp(self) -> None:
self.finetune_cfg = \
{'framework': 'pytorch',
'task': 'image-captioning',
'model': {'type': 'ofa',
'beam_search': {'beam_size': 5,
'max_len_b': 16,
'min_len': 1,
'no_repeat_ngram_size': 0},
'seed': 7,
'max_src_length': 256,
'language': 'en',
'gen_type': 'generation',
'patch_image_size': 480,
'max_image_size': 480,
'imagenet_default_mean_and_std': False},
'pipeline': {'type': 'image-captioning'},
'dataset': {'column_map': {'text': 'caption'}},
'train': {'work_dir': 'work/ckpts/caption',
# 'launcher': 'pytorch',
'max_epochs': 1,
'use_fp16': True,
'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0},
'lr_scheduler': {'name': 'polynomial_decay',
'warmup_proportion': 0.01,
'lr_end': 1e-07},
'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False},
'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01},
'optimizer_hook': {'type': 'TorchAMPOptimizerHook',
'cumulative_iters': 1,
'grad_clip': {'max_norm': 1.0, 'norm_type': 2},
'loss_keys': 'loss'},
'criterion': {'name': 'AdjustLabelSmoothedCrossEntropyCriterion',
'constraint_range': None,
'drop_worst_after': 0,
'drop_worst_ratio': 0.0,
'ignore_eos': False,
'ignore_prefix_size': 0,
'label_smoothing': 0.0,
'reg_alpha': 1.0,
'report_accuracy': False,
'sample_patch_num': 196,
'sentence_avg': False,
'use_rdrop': False},
'hooks': [{'type': 'BestCkptSaverHook',
'metric_key': 'bleu-4',
'interval': 100},
{'type': 'TextLoggerHook', 'interval': 1},
{'type': 'IterTimerHook'},
{'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]},
'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0},
'metrics': [{'type': 'bleu',
'eval_tokenized_bleu': False,
'ref_name': 'labels',
'hyp_name': 'caption'}]},
'preprocessor': []}
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_trainer_std(self):
WORKSPACE = './workspace/ckpts/caption'
os.makedirs(WORKSPACE, exist_ok=True)
config_file = os.path.join(WORKSPACE, 'configuration.json')
with open(config_file, 'w') as writer:
json.dump(self.finetune_cfg, writer)
pretrained_model = '/apsarapangu/disk2/yichang.zyc/ckpt/MaaS/ofa_image-caption_coco_large_en'
args = dict(
model=pretrained_model,
work_dir=WORKSPACE,
train_dataset=MsDataset.load(
'coco_2014_caption',
namespace='modelscope',
split='train[:100]'),
eval_dataset=MsDataset.load(
'coco_2014_caption',
namespace='modelscope',
split='validation[:20]'),
metrics=[Metrics.BLEU],
cfg_file=config_file)
trainer = build_trainer(name=Trainers.ofa_tasks, default_args=args)
trainer.train()
assert len(
glob.glob(osp.join(trainer.work_dir,
'best_epoch*_accuracy*.pth'))) == 2
if os.path.exists(self.trainer.work_dir):
shutil.rmtree(self.trainer.work_dir)
self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE,
              os.listdir(os.path.join(WORKSPACE, 'output')))
shutil.rmtree(WORKSPACE)
if __name__ == '__main__':