From 55fb3b05a91107dd083d9684ee22906d406338e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=A1=8C=E5=97=94?=
Date: Sun, 23 Oct 2022 21:29:17 +0800
Subject: [PATCH] format finetune code and add unit test case

---
 modelscope/metainfo.py                        |   4 +-
 modelscope/metrics/bleu_metric.py             |   2 +-
 modelscope/metrics/builder.py                 |   1 +
 modelscope/preprocessors/multi_modal.py       |  14 +-
 modelscope/preprocessors/ofa/base.py          |  16 ++
 .../preprocessors/ofa/image_captioning.py     |  14 +-
 .../preprocessors/ofa/ocr_recognition.py      |   4 +-
 .../preprocessors/ofa/utils/constant.py       |  13 ++
 .../trainers/multi_modal/ofa/ofa_trainer.py   | 137 +++++++++++-------
 modelscope/utils/constant.py                  |   1 +
 tests/trainers/test_ofa_trainer.py            | 103 +++++++++++--
 11 files changed, 215 insertions(+), 94 deletions(-)
 create mode 100644 modelscope/preprocessors/ofa/utils/constant.py

diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py
index ac3fb4e2..b559f5c0 100644
--- a/modelscope/metainfo.py
+++ b/modelscope/metainfo.py
@@ -377,7 +377,7 @@ class Metrics(object):
     audio_noise_metric = 'audio-noise-metric'
 
     # text gen
-    bleu = 'bleu'
+    BLEU = 'bleu'
 
     # metrics for image denoise task
     image_denoise_metric = 'image-denoise-metric'
@@ -399,6 +399,8 @@ class Metrics(object):
     movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
     # metric for inpainting task
     image_inpainting_metric = 'image-inpainting-metric'
+    # metric for ocr
+    NED = 'ned'
 
 
 class Optimizers(object):
diff --git a/modelscope/metrics/bleu_metric.py b/modelscope/metrics/bleu_metric.py
index 43d1b105..7c134b6a 100644
--- a/modelscope/metrics/bleu_metric.py
+++ b/modelscope/metrics/bleu_metric.py
@@ -11,7 +11,7 @@ from .builder import METRICS, MetricKeys
 
 EVAL_BLEU_ORDER = 4
 
-@METRICS.register_module(group_key=default_group, module_name=Metrics.bleu)
+@METRICS.register_module(group_key=default_group, module_name=Metrics.BLEU)
 class BleuMetric(Metric):
     """The metric computation bleu for text generation classes.
diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 1c8e16d7..da3b64c7 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -23,6 +23,7 @@ class MetricKeys(object): BLEU_4 = 'bleu-4' ROUGE_1 = 'rouge-1' ROUGE_L = 'rouge-l' + NED = 'ned' # ocr metric task_default_metrics = { diff --git a/modelscope/preprocessors/multi_modal.py b/modelscope/preprocessors/multi_modal.py index 2447c0b5..3c4ac58a 100644 --- a/modelscope/preprocessors/multi_modal.py +++ b/modelscope/preprocessors/multi_modal.py @@ -16,6 +16,7 @@ from .base import Preprocessor from .builder import PREPROCESSORS from .ofa import * # noqa from .ofa.utils.collate import collate_fn +from .ofa.utils.constant import OFA_TASK_KEY_MAPPING __all__ = [ 'OfaPreprocessor', @@ -51,24 +52,13 @@ class OfaPreprocessor(Preprocessor): Tasks.text_summarization: OfaSummarizationPreprocessor, Tasks.text_to_image_synthesis: OfaTextToImageSynthesisPreprocessor } - input_key_mapping = { - Tasks.ocr_recognition: ['image'], - Tasks.image_captioning: ['image'], - Tasks.image_classification: ['image'], - Tasks.text_summarization: ['text'], - Tasks.text_classification: ['text', 'text2'], - Tasks.visual_grounding: ['image', 'text'], - Tasks.visual_question_answering: ['image', 'text'], - Tasks.visual_entailment: ['image', 'text', 'text2'], - Tasks.text_to_image_synthesis: ['text'] - } model_dir = model_dir if osp.exists(model_dir) else snapshot_download( model_dir) self.cfg = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) self.preprocess = preprocess_mapping[self.cfg.task]( cfg=self.cfg, model_dir=model_dir, mode=mode) - self.keys = input_key_mapping[self.cfg.task] + self.keys = OFA_TASK_KEY_MAPPING[self.cfg.task] self.tokenizer = self.preprocess.tokenizer if kwargs.get('no_collate', None): self.no_collate = True diff --git a/modelscope/preprocessors/ofa/base.py b/modelscope/preprocessors/ofa/base.py index 47d70f6d..55b3895d 100644 --- a/modelscope/preprocessors/ofa/base.py +++ b/modelscope/preprocessors/ofa/base.py @@ -6,9 +6,12 @@ from os import path as osp import json import numpy as np import torch +from PIL import Image from modelscope.models.multi_modal.ofa import OFATokenizer, OFATokenizerZH +from modelscope.preprocessors.image import load_image from modelscope.utils.trie import Trie +from .utils.constant import OFA_TASK_KEY_MAPPING from .utils.random_help import set_torch_seed @@ -59,6 +62,14 @@ class OfaBasePreprocessor: self.mean = [0.5, 0.5, 0.5] self.std = [0.5, 0.5, 0.5] self.patch_image_size = self.cfg.model.get('patch_image_size', 480) + self.column_map = { + key: key + for key in OFA_TASK_KEY_MAPPING[self.cfg.task] + } + if hasattr(self.cfg, + 'dataset') and self.cfg.dataset.column_map is not None: + for k, v in self.cfg.dataset.column_map.items(): + self.column_map[k] = v self.transtab = str.maketrans( {key: None for key in string.punctuation}) @@ -147,3 +158,8 @@ class OfaBasePreprocessor: constraint_prefix_token) constraint_mask[i][constraint_nodes] = True sample['constraint_mask'] = constraint_mask + + def get_img_pil(self, path_or_url_or_pil): + image = path_or_url_or_pil if isinstance(path_or_url_or_pil, Image.Image) \ + else load_image(path_or_url_or_pil) + return image diff --git a/modelscope/preprocessors/ofa/image_captioning.py b/modelscope/preprocessors/ofa/image_captioning.py index 6c842aa9..99eda15d 100644 --- a/modelscope/preprocessors/ofa/image_captioning.py +++ b/modelscope/preprocessors/ofa/image_captioning.py @@ -1,12 +1,9 @@ # Copyright (c) Alibaba, 
Inc. and its affiliates. -import os -from typing import Any, Dict, Union +from typing import Any, Dict import torch -from PIL import Image from torchvision import transforms -from modelscope.preprocessors.image import load_image from modelscope.utils.constant import ModeKeys from .base import OfaBasePreprocessor @@ -46,7 +43,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: sample = self._build_infer_sample(data) - target = data['text'] + target = data[self.column_map['text']] target = target.translate(self.transtab).strip() target_token_list = target.strip().split() target = ' '.join(target_token_list[:self.max_tgt_length]) @@ -56,8 +53,7 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): return sample def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: - image = data['image'] if isinstance( - data['image'], Image.Image) else load_image(data['image']) + image = self.get_img_pil(data[self.column_map['image']]) patch_image = self.patch_resize_transform(image) prompt = self.cfg.model.get('prompt', ' what does the image describe?') inputs = self.tokenize_text(prompt) @@ -66,6 +62,6 @@ class OfaImageCaptioningPreprocessor(OfaBasePreprocessor): 'patch_image': patch_image, 'patch_mask': torch.tensor([True]) } - if 'text' in data: - sample['label'] = data['text'] + if self.column_map['text'] in data: + sample['label'] = data[self.column_map['text']] return sample diff --git a/modelscope/preprocessors/ofa/ocr_recognition.py b/modelscope/preprocessors/ofa/ocr_recognition.py index 1d30e572..4c8c245a 100644 --- a/modelscope/preprocessors/ofa/ocr_recognition.py +++ b/modelscope/preprocessors/ofa/ocr_recognition.py @@ -1,7 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import random -import unicodedata -from typing import Any, Dict, Union +from typing import Any, Dict import torch from PIL import Image diff --git a/modelscope/preprocessors/ofa/utils/constant.py b/modelscope/preprocessors/ofa/utils/constant.py new file mode 100644 index 00000000..102d27c0 --- /dev/null +++ b/modelscope/preprocessors/ofa/utils/constant.py @@ -0,0 +1,13 @@ +from modelscope.utils.constant import Tasks + +OFA_TASK_KEY_MAPPING = { + Tasks.ocr_recognition: ['image'], + Tasks.image_captioning: ['image'], + Tasks.image_classification: ['image'], + Tasks.text_summarization: ['text'], + Tasks.text_classification: ['text', 'text2'], + Tasks.visual_grounding: ['image', 'text'], + Tasks.visual_question_answering: ['image', 'text'], + Tasks.visual_entailment: ['image', 'text', 'text2'], + Tasks.text_to_image_synthesis: ['text'] +} diff --git a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py index 3daadf43..c287c182 100644 --- a/modelscope/trainers/multi_modal/ofa/ofa_trainer.py +++ b/modelscope/trainers/multi_modal/ofa/ofa_trainer.py @@ -2,21 +2,27 @@ import math import os +import shutil from functools import partial +from typing import Callable, Dict, Optional, Tuple, Union -from datasets import load_dataset +import torch from torch import distributed as dist +from torch import nn +from torch.utils.data import Dataset from modelscope.metainfo import Trainers -from modelscope.models.base import Model +from modelscope.models.base import Model, TorchModel from modelscope.msdatasets.ms_dataset import MsDataset +from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.multi_modal import OfaPreprocessor from modelscope.preprocessors.ofa.utils.collate import collate_fn from modelscope.trainers import EpochBasedTrainer from modelscope.trainers.builder import TRAINERS from modelscope.trainers.optimizer.builder import build_optimizer from modelscope.utils.config import Config -from modelscope.utils.constant import ConfigKeys, ModeKeys, ModelFile +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigKeys, + ModeKeys) from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, get_schedule) @@ -24,56 +30,100 @@ from .ofa_trainer_utils import (AdjustLabelSmoothedCrossEntropyCriterion, @TRAINERS.register_module(module_name=Trainers.ofa_tasks) class OFATrainer(EpochBasedTrainer): - def __init__(self, model: str, *args, **kwargs): - model = Model.from_pretrained(model) + def __init__( + self, + model: Optional[Union[TorchModel, nn.Module, str]] = None, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + data_collator: Optional[Union[Callable, Dict[str, + Callable]]] = None, + train_dataset: Optional[Union[MsDataset, Dataset]] = None, + eval_dataset: Optional[Union[MsDataset, Dataset]] = None, + preprocessor: Optional[Union[Preprocessor, + Dict[str, Preprocessor]]] = None, + optimizers: Tuple[torch.optim.Optimizer, + torch.optim.lr_scheduler._LRScheduler] = (None, + None), + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + seed: int = 42, + **kwargs): + model = Model.from_pretrained(model, revision=model_revision) model_dir = model.model_dir - cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) cfg = Config.from_file(cfg_file) - dataset = self._build_dataset_with_config(cfg) - preprocessor = { - ConfigKeys.train: - OfaPreprocessor( - model_dir=model_dir, mode=ModeKeys.TRAIN, no_collate=True), - ConfigKeys.val: - OfaPreprocessor( - 
model_dir=model_dir, mode=ModeKeys.EVAL, no_collate=True), + if 'work_dir' not in kwargs or len(kwargs['work_dir']) == 0: + work_dir = cfg.train.work_dir + else: + work_dir = kwargs['work_dir'] + tokenizer_files = { + 'zh': [ + 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt', + 'config.json' + ], + 'en': + ['tokenizer.json', 'vocab.json', 'merges.txt', 'config.json'], } + for filename in tokenizer_files[cfg.model.get('language', 'en')]: + finetune_file = os.path.join(work_dir, filename) + pretrain_file = os.path.join(model_dir, filename) + if os.path.exists(finetune_file): + continue + if os.path.exists(pretrain_file): + shutil.copy(pretrain_file, finetune_file) + + if preprocessor is None: + preprocessor = { + ConfigKeys.train: + OfaPreprocessor( + model_dir=work_dir, mode=ModeKeys.TRAIN, no_collate=True), + ConfigKeys.val: + OfaPreprocessor( + model_dir=work_dir, mode=ModeKeys.EVAL, no_collate=True), + } # use torchrun launch world_size = int(os.environ.get('WORLD_SIZE', 1)) epoch_steps = math.ceil( - len(dataset['train']) / # noqa + len(train_dataset) / # noqa (cfg.train.dataloader.batch_size_per_gpu * world_size)) # noqa cfg.train.lr_scheduler.num_train_steps = epoch_steps * cfg.train.max_epochs cfg.train.criterion.tokenizer = model.tokenizer self.criterion = AdjustLabelSmoothedCrossEntropyCriterion( cfg.train.criterion) - optimizer = build_optimizer(model, cfg=cfg.train.optimizer) - scheduler_class, scheduler_args = get_schedule(cfg.train.lr_scheduler) - if scheduler_class is not None: - lr_scheduler = scheduler_class(**{'optimizer': optimizer}, - **scheduler_args) + if optimizers[0] is None: + optimizer = build_optimizer(model, cfg=cfg.train.optimizer) else: - lr_scheduler = None - collator = partial( - collate_fn, - pad_idx=model.tokenizer.pad_token_id, - eos_idx=model.tokenizer.eos_token_id, - ) + optimizer = optimizers[0] + if optimizers[1] is None: + scheduler_class, scheduler_args = get_schedule( + cfg.train.lr_scheduler) + if scheduler_class is not None: + lr_scheduler = scheduler_class(**{'optimizer': optimizer}, + **scheduler_args) + else: + lr_scheduler = None + else: + lr_scheduler = optimizers[1] + optimizers = (optimizer, lr_scheduler) + if data_collator is None: + data_collator = partial( + collate_fn, + pad_idx=model.tokenizer.pad_token_id, + eos_idx=model.tokenizer.eos_token_id, + ) if 'launcher' not in kwargs and cfg.train.get('launcher', None): kwargs['launcher'] = cfg.train.launcher if 'use_fp16' not in kwargs and cfg.train.get('use_fp16', False): kwargs['use_fp16'] = cfg.train.use_fp16 kwargs['to_tensor'] = False super().__init__( - cfg_file=cfg_file, model=model, - data_collator=collator, - train_dataset=dataset['train'], - eval_dataset=dataset['valid'], + cfg_file=cfg_file, + arg_parse_fn=arg_parse_fn, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, preprocessor=preprocessor, - optimizers=(optimizer, lr_scheduler), - work_dir=cfg.train.work_dir, - *args, + optimizers=optimizers, + seed=seed, **kwargs, ) @@ -102,24 +152,3 @@ class OFATrainer(EpochBasedTrainer): else: self.log_buffer.update(train_outputs['log_vars']) self.train_outputs = train_outputs - - def _build_dataset_with_config(self, cfg): - if hasattr(cfg.dataset, 'hf_dataset'): - dataset = load_dataset( - cfg.dataset.script, - data_files=cfg.dataset.hf_dataset, - sep=cfg.dataset.sep, - ) - dataset = MsDataset.from_hf_dataset( - dataset.rename_columns(cfg.dataset.column_map)) - return dataset - elif hasattr(cfg.dataset, 'ms_dataset'): - dataset_d = dict() - 
for key in cfg.dataset.ms_dataset.keys(): - dataset_d[key] = MsDataset.load(**cfg.dataset.ms_dataset[key]) - dataset_d[key] = MsDataset.from_hf_dataset( - dataset_d[key]._hf_ds.rename_columns( - cfg.dataset.column_map)) - return dataset_d - else: - raise NotImplementedError diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 87a0a417..a3f4a935 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -282,6 +282,7 @@ class ConfigKeys(object): """Fixed keywords in configuration file""" train = 'train' val = 'val' + test = 'test' class Requirements(object): diff --git a/tests/trainers/test_ofa_trainer.py b/tests/trainers/test_ofa_trainer.py index 8aab3544..fe7672df 100644 --- a/tests/trainers/test_ofa_trainer.py +++ b/tests/trainers/test_ofa_trainer.py @@ -5,27 +5,102 @@ import os.path as osp import shutil import unittest -from modelscope.metainfo import Trainers +import json + +from modelscope.metainfo import Metrics, Trainers +from modelscope.msdatasets import MsDataset from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile from modelscope.utils.test_utils import test_level class TestOfaTrainer(unittest.TestCase): - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_trainer(self): - os.environ['LOCAL_RANK'] = '0' - model_id = 'damo/ofa_text-classification_mnli_large_en' - default_args = {'model': model_id} - trainer = build_trainer( - name=Trainers.ofa_tasks, default_args=default_args) - os.makedirs(trainer.work_dir, exist_ok=True) + def setUp(self) -> None: + self.finetune_cfg = \ + {'framework': 'pytorch', + 'task': 'image-captioning', + 'model': {'type': 'ofa', + 'beam_search': {'beam_size': 5, + 'max_len_b': 16, + 'min_len': 1, + 'no_repeat_ngram_size': 0}, + 'seed': 7, + 'max_src_length': 256, + 'language': 'en', + 'gen_type': 'generation', + 'patch_image_size': 480, + 'max_image_size': 480, + 'imagenet_default_mean_and_std': False}, + 'pipeline': {'type': 'image-captioning'}, + 'dataset': {'column_map': {'text': 'caption'}}, + 'train': {'work_dir': 'work/ckpts/caption', + # 'launcher': 'pytorch', + 'max_epochs': 1, + 'use_fp16': True, + 'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + 'lr_scheduler': {'name': 'polynomial_decay', + 'warmup_proportion': 0.01, + 'lr_end': 1e-07}, + 'lr_scheduler_hook': {'type': 'LrSchedulerHook', 'by_epoch': False}, + 'optimizer': {'type': 'AdamW', 'lr': 5e-05, 'weight_decay': 0.01}, + 'optimizer_hook': {'type': 'TorchAMPOptimizerHook', + 'cumulative_iters': 1, + 'grad_clip': {'max_norm': 1.0, 'norm_type': 2}, + 'loss_keys': 'loss'}, + 'criterion': {'name': 'AdjustLabelSmoothedCrossEntropyCriterion', + 'constraint_range': None, + 'drop_worst_after': 0, + 'drop_worst_ratio': 0.0, + 'ignore_eos': False, + 'ignore_prefix_size': 0, + 'label_smoothing': 0.0, + 'reg_alpha': 1.0, + 'report_accuracy': False, + 'sample_patch_num': 196, + 'sentence_avg': False, + 'use_rdrop': False}, + 'hooks': [{'type': 'BestCkptSaverHook', + 'metric_key': 'bleu-4', + 'interval': 100}, + {'type': 'TextLoggerHook', 'interval': 1}, + {'type': 'IterTimerHook'}, + {'type': 'EvaluationHook', 'by_epoch': True, 'interval': 1}]}, + 'evaluation': {'dataloader': {'batch_size_per_gpu': 4, 'workers_per_gpu': 0}, + 'metrics': [{'type': 'bleu', + 'eval_tokenized_bleu': False, + 'ref_name': 'labels', + 'hyp_name': 'caption'}]}, + 'preprocessor': []} + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_std(self): 
+        WORKSPACE = './workspace/ckpts/caption'
+        os.makedirs(WORKSPACE, exist_ok=True)
+        config_file = os.path.join(WORKSPACE, 'configuration.json')
+        with open(config_file, 'w') as writer:
+            json.dump(self.finetune_cfg, writer)
+
+        pretrained_model = 'damo/ofa_image-caption_coco_large_en'
+        args = dict(
+            model=pretrained_model,
+            work_dir=WORKSPACE,
+            train_dataset=MsDataset.load(
+                'coco_2014_caption',
+                namespace='modelscope',
+                split='train[:100]'),
+            eval_dataset=MsDataset.load(
+                'coco_2014_caption',
+                namespace='modelscope',
+                split='validation[:20]'),
+            metrics=[Metrics.BLEU],
+            cfg_file=config_file)
+        trainer = build_trainer(name=Trainers.ofa_tasks, default_args=args)
         trainer.train()
-        assert len(
-            glob.glob(osp.join(trainer.work_dir,
-                               'best_epoch*_accuracy*.pth'))) == 2
-        if os.path.exists(self.trainer.work_dir):
-            shutil.rmtree(self.trainer.work_dir)
+
+        self.assertIn(ModelFile.TORCH_MODEL_BIN_FILE,
+                      os.listdir(os.path.join(WORKSPACE, 'output')))
+        shutil.rmtree(WORKSPACE)
 
 
 if __name__ == '__main__':
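
Usage sketch: with this refactor, OFATrainer is driven from user code; datasets are passed in directly, while preprocessor, data collator and optimizers fall back to defaults built from the configuration when omitted. The sketch below is a minimal finetune run mirroring test_trainer_std above. It assumes a configuration.json like the one built in TestOfaTrainer.setUp already sits in work_dir, and that the damo/ofa_image-caption_coco_large_en checkpoint and the modelscope/coco_2014_caption dataset are published on the ModelScope hub.

    import os

    from modelscope.metainfo import Metrics, Trainers
    from modelscope.msdatasets import MsDataset
    from modelscope.trainers import build_trainer

    # work_dir must already hold a finetune configuration.json; the trainer
    # copies the pretrained model's tokenizer files into it before training.
    work_dir = './workspace/ckpts/caption'
    config_file = os.path.join(work_dir, 'configuration.json')

    trainer = build_trainer(
        name=Trainers.ofa_tasks,
        default_args=dict(
            model='damo/ofa_image-caption_coco_large_en',  # assumed hub id
            work_dir=work_dir,
            cfg_file=config_file,
            train_dataset=MsDataset.load(
                'coco_2014_caption',
                namespace='modelscope',
                split='train[:100]'),
            eval_dataset=MsDataset.load(
                'coco_2014_caption',
                namespace='modelscope',
                split='validation[:20]'),
            metrics=[Metrics.BLEU]))
    trainer.train()

After training, checkpoints land under work_dir; test_trainer_std above asserts that the trained binary (ModelFile.TORCH_MODEL_BIN_FILE) appears under work_dir/output.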