From db0f70bc1c9e1c94448d5a91b757c44c5bb685f5 Mon Sep 17 00:00:00 2001 From: "LingFeng.Chen.Cn" Date: Fri, 7 Jul 2023 22:14:13 +0800 Subject: [PATCH 1/6] text_in is required (#365) --- modelscope/pipelines/audio/timestamp_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/pipelines/audio/timestamp_pipeline.py b/modelscope/pipelines/audio/timestamp_pipeline.py index 17cf9545..98e9eb05 100644 --- a/modelscope/pipelines/audio/timestamp_pipeline.py +++ b/modelscope/pipelines/audio/timestamp_pipeline.py @@ -93,7 +93,7 @@ class TimestampPipeline(Pipeline): def __call__(self, audio_in: Union[str, bytes], - text_in: str = None, + text_in: str, audio_fs: int = None, recog_type: str = None, audio_format: str = None, From fd6e352922f70de5c83783074e5ac8294c729c3f Mon Sep 17 00:00:00 2001 From: Wang Qiang <37444407+XDUWQ@users.noreply.github.com> Date: Tue, 11 Jul 2023 15:36:35 +0800 Subject: [PATCH 2/6] Add pipeline num_inference_steps and guidance_scale parameter for stable diffusion pipeline (#367) * add pipeline num_inference_steps and guidance_scale parameter * precommit --------- Co-authored-by: XDUWQ --- .../diffusers_wrapped/diffusers_pipeline.py | 2 +- .../stable_diffusion_pipeline.py | 69 ++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/diffusers_pipeline.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/diffusers_pipeline.py index ce0455b6..3eed0947 100644 --- a/modelscope/pipelines/multi_modal/diffusers_wrapped/diffusers_pipeline.py +++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/diffusers_pipeline.py @@ -15,7 +15,7 @@ class DiffusersPipeline(Pipeline): """ use `model` to create a diffusers pipeline Args: - model: model id on modelscope hub. + model: model id on modelscope hub or local dir. 
device: str = 'gpu' """ diff --git a/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/stable_diffusion_pipeline.py b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/stable_diffusion_pipeline.py index f09d459d..1b75656e 100644 --- a/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/stable_diffusion_pipeline.py +++ b/modelscope/pipelines/multi_modal/diffusers_wrapped/stable_diffusion/stable_diffusion_pipeline.py @@ -48,6 +48,60 @@ class StableDiffusionPipeline(DiffusersPipeline): def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: + """ + Inputs Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. 
The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + """ if not isinstance(inputs, dict): raise ValueError( f'Expected the input to be a dictionary, but got {type(input)}' @@ -57,7 +111,20 @@ class StableDiffusionPipeline(DiffusersPipeline): raise ValueError('input should contain "text", but not found') images = self.pipeline( - inputs['text'], num_inference_steps=30, guidance_scale=7.5) + prompt=inputs.get('text'), + height=inputs.get('height'), + width=inputs.get('width'), + num_inference_steps=inputs.get('num_inference_steps', 50), + guidance_scale=inputs.get('guidance_scale', 7.5), + negative_prompt=inputs.get('negative_prompt'), + num_images_per_prompt=inputs.get('num_images_per_prompt', 1), + eta=inputs.get('eta', 0.0), + generator=inputs.get('generator'), + latents=inputs.get('latents'), + output_type=inputs.get('output_type', 'pil'), + return_dict=inputs.get('return_dict', True), + callback=inputs.get('callback'), + callback_steps=inputs.get('callback_steps', 1)) return images From 0c43a0e8ea97ec6cf936d4a3f8330aea9db7fa6d Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 11 Jul 2023 16:25:11 +0800 Subject: [PATCH 3/6] fix load meta-csv cathe paths --- .../msdatasets/download/dataset_builder.py | 17 +++++++++++++---- modelscope/version.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/modelscope/msdatasets/download/dataset_builder.py b/modelscope/msdatasets/download/dataset_builder.py index 140503f0..ad5ebbcf 100644 --- a/modelscope/msdatasets/download/dataset_builder.py +++ b/modelscope/msdatasets/download/dataset_builder.py @@ -223,11 +223,20 @@ class CsvDatasetBuilder(csv.Csv): if field_name.endswith(':FILE'): transform_fields.append(field_name) - base_extracted_dir = 
self.split_path_dict.get(split_name, '') + base_extracted_dir: Union[str, list] = self.split_path_dict.get(split_name, '') for field_name in transform_fields: - if base_extracted_dir: - df[field_name] = df[field_name].apply( - lambda x: os.path.join(base_extracted_dir, x)) + if isinstance(base_extracted_dir, list) and len(base_extracted_dir) > 0: + if df.shape[0] != len(base_extracted_dir): + logger.error( + f"Number of lines in meta-csv file for split '{split_name}' ({df.shape[0]}) " + f"does not match number of data-files({len(base_extracted_dir)})!" + ) + else: + df[field_name] = base_extracted_dir + elif isinstance(base_extracted_dir, str) and base_extracted_dir: + df[field_name] = df[field_name].apply(lambda x: os.path.join(base_extracted_dir, x)) + else: + logger.warning(f'Nothing to do for field {field_name}') pa_data = pa.Table.from_pandas(df) return Dataset(arrow_table=pa_data) diff --git a/modelscope/version.py b/modelscope/version.py index e4028ca2..fbb09a54 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. 
-__version__ = '1.7.0' +__version__ = '1.7.1' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12' From d8dd3989de59c34674a3209fdedae9d621abab25 Mon Sep 17 00:00:00 2001 From: Wang Qiang <37444407+XDUWQ@users.noreply.github.com> Date: Tue, 11 Jul 2023 16:28:35 +0800 Subject: [PATCH 4/6] Xformers accelerate memory efficient attention (#362) * xformers accelerate memory efficient attention * xformers * precommit --------- Co-authored-by: XDUWQ --- .../stable_diffusion/stable_diffusion.py | 14 ++++++++++++++ modelscope/utils/error.py | 6 ++++++ modelscope/utils/import_utils.py | 1 + 3 files changed, 21 insertions(+) diff --git a/modelscope/models/multi_modal/stable_diffusion/stable_diffusion.py b/modelscope/models/multi_modal/stable_diffusion/stable_diffusion.py index 88cb4969..6b829485 100644 --- a/modelscope/models/multi_modal/stable_diffusion/stable_diffusion.py +++ b/modelscope/models/multi_modal/stable_diffusion/stable_diffusion.py @@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union import torch import torch.nn.functional as F from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel +from packaging import version from transformers import CLIPTextModel, CLIPTokenizer from modelscope.metainfo import Models @@ -34,6 +35,7 @@ class StableDiffusion(TorchModel): """ super().__init__(model_dir, *args, **kwargs) revision = kwargs.pop('revision', None) + xformers_enable = kwargs.pop('xformers_enable', False) self.lora_tune = kwargs.pop('lora_tune', False) self.dreambooth_tune = kwargs.pop('dreambooth_tune', False) @@ -66,6 +68,18 @@ class StableDiffusion(TorchModel): self.unet.requires_grad_(False) self.unet = self.unet.to(self.device) + # xformers accelerate memory efficient attention + if xformers_enable: + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse('0.0.16'): + 
logger.warn( + 'xFormers 0.0.16 cannot be used for training in some GPUs. ' + 'If you observe problems during training, please update xFormers to at least 0.0.17.' + ) + self.unet.enable_xformers_memory_efficient_attention() + def tokenize_caption(self, captions): """ Convert caption text to token data. diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py index 841662c0..8259c7ce 100644 --- a/modelscope/utils/error.py +++ b/modelscope/utils/error.py @@ -168,3 +168,9 @@ TAMING_IMPORT_ERROR = """ {0} requires the timm library but it was not found in your environment. You can install it with pip: `pip install taming-transformers-rom1504` """ + +# docstyle-ignore +XFORMERS_IMPORT_ERROR = """ +{0} requires the timm library but it was not found in your environment. You can install it with pip: +`pip install xformers>=0.0.17` +""" diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index 3e8be2e1..f2fc7e37 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -306,6 +306,7 @@ REQUIREMENTS_MAAPING = OrderedDict([ ('mpi4py', (is_package_available('mpi4py'), MPI4PY_IMPORT_ERROR)), ('open_clip', (is_package_available('open_clip'), OPENCLIP_IMPORT_ERROR)), ('taming', (is_package_available('taming'), TAMING_IMPORT_ERROR)), + ('xformers', (is_package_available('xformers'), XFORMERS_IMPORT_ERROR)), ]) SYSTEM_PACKAGE = set(['os', 'sys', 'typing']) From d20d033e07fe8a3ace848528b2f2b23bd6873c36 Mon Sep 17 00:00:00 2001 From: Jintao Date: Tue, 11 Jul 2023 17:35:11 +0800 Subject: [PATCH 5/6] add example/llm (#372) * add example/llm * fix lint test --- examples/pytorch/llm/_common.py | 449 ++++++++++++++++++ examples/pytorch/llm/baichuan_infer.py | 62 +++ examples/pytorch/llm/baichuan_sft.py | 199 ++++++++ examples/pytorch/llm/chatglm2_infer.py | 60 +++ examples/pytorch/llm/chatglm2_sft.py | 188 ++++++++ examples/pytorch/llm_agent/_common.py | 9 +- .../pytorch/llm_agent/baichuan_infer.ipynb | 42 +- 
examples/pytorch/llm_agent/baichuan_sft.ipynb | 124 ++--- .../pytorch/llm_agent/chatglm2_infer.ipynb | 42 +- examples/pytorch/llm_agent/chatglm2_sft.ipynb | 127 ++--- setup.cfg | 2 +- 11 files changed, 1124 insertions(+), 180 deletions(-) create mode 100644 examples/pytorch/llm/_common.py create mode 100644 examples/pytorch/llm/baichuan_infer.py create mode 100644 examples/pytorch/llm/baichuan_sft.py create mode 100644 examples/pytorch/llm/chatglm2_infer.py create mode 100644 examples/pytorch/llm/chatglm2_sft.py diff --git a/examples/pytorch/llm/_common.py b/examples/pytorch/llm/_common.py new file mode 100644 index 00000000..79a958ec --- /dev/null +++ b/examples/pytorch/llm/_common.py @@ -0,0 +1,449 @@ +import ast +import datetime as dt +import math +import os +import random +import re +import sys +from functools import partial +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import json +import matplotlib.pyplot as plt +import numpy as np +# +import torch +import torch.nn as nn +import torch.optim as optim +from datasets import Dataset as HFDataset +from datasets import concatenate_datasets +from matplotlib.axes import Axes +from matplotlib.figure import Figure +from numpy import ndarray +from tensorboard.backend.event_processing.event_accumulator import \ + EventAccumulator +from torch import Tensor +from torch import device as Device +from torch import dtype as Dtype +from torch.nn import Module +from torch.nn.parameter import Parameter +from torch.nn.utils.rnn import pad_sequence +from torch.optim import Optimizer +from torch.optim import lr_scheduler as lrs +from torch.optim.lr_scheduler import _LRScheduler as LRScheduler +from torch.utils.data import Dataset +# +from torchmetrics import Accuracy, MeanMetric +# +from tqdm import tqdm + +# +from modelscope import (Model, MsDataset, get_logger, read_config, + snapshot_download) +from modelscope.metrics.base import Metric +from modelscope.metrics.builder import METRICS +from 
modelscope.models.nlp.chatglm2 import ChatGLM2Tokenizer +from modelscope.msdatasets.dataset_cls.custom_datasets import \ + TorchCustomDataset +from modelscope.swift import LoRAConfig, Swift +from modelscope.trainers import EpochBasedTrainer +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.registry import default_group + +# +TEST_SPLIT_P = 0.01 +SPLIT_SEED = 42 +MAX_LENGTH: Optional[int] = 2048 +COLOR, COLOR_S = '#FFE2D9', '#FF7043' + +PROMPT = """### 用户 +{instruction} +### AI助手 +""" + +logger = get_logger() +# + + +def get_model_dir(model_id: str, model_revision: Optional[str] = None) -> str: + model_dir = snapshot_download(model_id, model_revision) + return model_dir + + +def _get_version(work_dir: str) -> int: + if os.path.isdir(work_dir): + fnames = os.listdir(work_dir) + else: + fnames = [] + v_list = [-1] + for fname in fnames: + m = re.match(r'v(\d+)', fname) + if m is None: + continue + v = m.group(1) + v_list.append(int(v)) + return max(v_list) + 1 + + +def get_work_dir(work_dir: str) -> str: + """add version""" + work_dir = os.path.abspath(work_dir) + version = _get_version(work_dir) + time = dt.datetime.now().strftime('%Y%m%d-%H%M%S') + # + work_dir = os.path.join(work_dir, f'v{version}-{time}') + logger.info(f'work_dir: {work_dir}') + return work_dir + + +def select_device(device_ids: List[int]) -> Device: + """Call this function before cuda is initialized. + Return: master device + """ + if torch.cuda.is_initialized(): + logger.warning('CUDA has been initialized! Device selection fails!') + return torch.device('cuda:0') + # + log_s = 'Using device: ' + if len(device_ids) == 0: # cpu + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + device: str = 'cpu' + log_s += device + else: + os.environ['CUDA_VISIBLE_DEVICES'] = ','.join( + [str(d) for d in device_ids]) + assert torch.cuda.is_available( + ) and torch.cuda.device_count() >= len(device_ids) + log_s += f"cuda:{','.join([str(d) for d in device_ids])}" # e.g. 
'cuda:1,7,8' + device = 'cuda:0' + logger.info(log_s) + return torch.device(device) + + +def seed_everything(seed: Optional[int] = None, gpu_dtm: bool = False) -> int: + if seed is None: + seed_max = np.iinfo(np.int32).max + seed = random.randint(0, seed_max) + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + logger.info(f'Global seed set to {seed}') + if gpu_dtm: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + logger.info(f'Setting deterministic: {True}, benchmark: {False}') + return seed + + +def get_T_max(dataset_len: int, batch_size: int, max_epochs: int, + drop_last: bool) -> int: + """Calculate T_max in CosineAnnealingLR""" + if drop_last: + T_max = dataset_len // batch_size + else: + T_max = math.ceil(dataset_len / batch_size) + T_max *= max_epochs + return T_max + + +def tokenize_function(example: Dict[str, str], tokenizer) -> Dict[str, Any]: + """Only applicable to baichuan and chatglm2. 
Other models need to be tested""" + instruction = example['instruction'] + input_: str = example['input'] + if input_ is not None and input_ != '': + # instruction = instruction + '\n' + if input_.startswith('输入:'): + instruction = instruction + input_[3:] + else: + instruction = instruction + input_ + output = example['output'] + src_text = PROMPT.format(instruction=instruction, add_special_tokens=False) + src_input_ids: List[int] = tokenizer( + src_text, return_attention_mask=False, + add_special_tokens=True)['input_ids'] + # tokenizer.bos_token_id: Avoid `tgt_input_ids` being empty + tgt_input_ids = [tokenizer.bos_token_id] + if output is not None: + tgt_input_ids += tokenizer( + output, return_attention_mask=False, + add_special_tokens=False)['input_ids'] + tgt_input_ids += [tokenizer.eos_token_id] + labels = [-100] * len(src_input_ids) + tgt_input_ids + else: + labels = None + input_ids = src_input_ids + tgt_input_ids + # + if MAX_LENGTH is not None: + input_ids = input_ids[-MAX_LENGTH:] + if labels is not None: + labels = labels[-MAX_LENGTH:] + # + return {'input_ids': input_ids, 'labels': labels} + + +def stat_dataset(dataset: HFDataset) -> None: + """Statistical analysis was performed on the data set""" + _token_len = [] + for d in dataset: + _token_len.append(len(d['input_ids'])) + _token_len = np.array(_token_len) + mean = _token_len.mean().item() + std = _token_len.std().item() + min_ = _token_len.min().item() + max_ = _token_len.max().item() + logger.info( + f'Dataset Token Length: {mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={_token_len.shape[0]}' + ) + + +def print_examples(examples: Dict[str, Any], tokenizer) -> None: + input_ids, labels = examples['input_ids'], examples['labels'] + print(f'[INPUT_IDS] {tokenizer.decode(input_ids)}') + print() + print( + f'[LABLES] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}' + ) + + +def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: + input_ids = 
[torch.tensor(b['input_ids']) for b in batch] + labels = [torch.tensor(b['labels']) for b in batch] + attention_mask = [ + torch.ones(len(input_ids[i]), dtype=torch.int64) + for i in range(len(input_ids)) + ] + # + input_ids = pad_sequence( + input_ids, batch_first=True, padding_value=tokenizer.pad_token_id) + attention_mask = pad_sequence( + attention_mask, batch_first=True, padding_value=0) + labels = pad_sequence(labels, batch_first=True, padding_value=-100) + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'labels': labels + } + + +def print_model_info(model: Module, name: Optional[str] = None) -> None: + if name is None: + name = model.__class__.__name__ + # + n_params = sum(p.numel() for p in model.parameters()) + n_grads = sum(p.numel() for p in model.parameters() if p.requires_grad) + n_buffers = sum(p.numel() for p in model.buffers()) + # + n_params /= 1e6 + n_grads /= 1e6 + n_buffers /= 1e6 + s = [ + f'{name}: ', + f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ', + f'{n_buffers:.4f}M Buffers', + ] + s += '.' 
+ logger.info(''.join(s)) + + +def show_freeze_layers(model: Module, max_lines: int = 20) -> None: + named_p = list(model.named_parameters()) + for i, (n, p) in enumerate(named_p): + if i >= max_lines: + logger.info('...') + break + logger.info(f'{n}: requires_grad={p.requires_grad}') + + +@METRICS.register_module(group_key=default_group, module_name='my_metric') +class MyMetric(Metric): + + def __init__(self, vocab_size: int): + self.acc = Accuracy('multiclass', num_classes=vocab_size) + self.loss = MeanMetric() + + def add(self, outputs: Dict[str, Any], inputs: Dict[str, Any]) -> None: + loss: Tensor = outputs.loss + self.loss.update(loss) + # + labels: Tensor = inputs['labels'] + labels = labels[:, 1:] + labels_mask = labels != -100 + logits: Tensor = outputs.logits[:, :-1] + logits = logits[labels_mask].contiguous().view(-1, logits.shape[-1]) + pred = logits.argmax(dim=-1) + labels = labels[labels_mask].to(logits.device) + self.acc.update(pred, labels) + + def evaluate(self): + return { + 'acc': self.acc.compute().item(), + 'loss': self.loss.compute().item() + } + + def merge(self, other: 'MyMetric') -> None: + """This script does not support ddp""" + raise NotImplementedError + + +def get_baichuan7B_model_tokenizer(model_dir: Optional[str] = None, + load_model: bool = True): + if model_dir is None: + model_id = 'baichuan-inc/baichuan-7B' + model_dir = get_model_dir(model_id, None) + # + sys.path.insert(0, model_dir) + from configuration_baichuan import BaiChuanConfig + from tokenization_baichuan import BaiChuanTokenizer + from modeling_baichuan import BaiChuanForCausalLM + model_config = BaiChuanConfig.from_pretrained(model_dir) + model_config.torch_dtype = torch.float16 + logger.info(f'model_config: {model_config}') + tokenizer = BaiChuanTokenizer.from_pretrained(model_dir) + model = None + if load_model: + model = BaiChuanForCausalLM.from_pretrained( + model_dir, + config=model_config, + device_map='auto', + torch_dtype=torch.float16) + # + return model, 
tokenizer + + +def get_baichuan13B_model_tokenizer(model_dir: Optional[str] = None, + load_model: bool = True): + if model_dir is None: + model_id = 'baichuan-inc/Baichuan-13B-Base' + model_dir = get_model_dir(model_id, 'v1.0.1') + # + sys.path.insert(0, model_dir) + from configuration_baichuan import BaichuanConfig + from tokenization_baichuan import BaichuanTokenizer + from modeling_baichuan import BaichuanForCausalLM + model_config = BaichuanConfig.from_pretrained(model_dir) + model_config.torch_dtype = torch.float16 + logger.info(f'model_config: {model_config}') + tokenizer = BaichuanTokenizer.from_pretrained(model_dir) + model = None + if load_model: + model = BaichuanForCausalLM.from_pretrained( + model_dir, + config=model_config, + device_map='auto', + torch_dtype=torch.float16) + # + return model, tokenizer + + +def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None, + load_model: bool = True): + if model_dir is None: + model_id = 'ZhipuAI/chatglm2-6b' + model_dir = snapshot_download(model_id, None) + # + config = read_config(model_dir) + config['model'] = ConfigDict({'type': 'chatglm2-6b'}) + tokenizer = ChatGLM2Tokenizer.from_pretrained(model_dir) + model = None + if load_model: + model = Model.from_pretrained( + model_dir, + cfg_dict=config, + device_map='auto', + torch_dtype=torch.float16) + return model, tokenizer + + +def get_alpaca_en_zh_dataset( + tokenize_function, + only_val: bool = False) -> Tuple[HFDataset, HFDataset]: + """ + split: Literal['train', 'validation', None] + """ + + dataset_en: HFDataset = MsDataset.load( + 'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset() + dataset_zh: HFDataset = MsDataset.load( + 'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset() + dataset_en = dataset_en.remove_columns(['text']) + dataset: HFDataset = concatenate_datasets([dataset_zh, dataset_en]) + # + # dataset = dataset.select(range(1000)) # for debug + dataset = dataset.train_test_split(TEST_SPLIT_P, 
seed=SPLIT_SEED) + if only_val: + dataset = dataset['test'] + if tokenize_function is not None: + dataset = dataset.map(tokenize_function) + dataset = dataset.remove_columns(['instruction', 'input', 'output']) + # + if only_val: + return None, dataset + else: + return dataset['train'], dataset['test'] + + +Item = Dict[str, float] + + +def read_tensorboard_file(fpath: str) -> Dict[str, List[Item]]: + if not os.path.isfile(fpath): + raise FileNotFoundError(f'fpath: {fpath}') + ea = EventAccumulator(fpath) + ea.Reload() + res = {} + tags = ea.Tags()['scalars'] + for tag in tags: + values = ea.Scalars(tag) + r = [] + for v in values: + r.append({'step': v.step, 'value': v.value}) + res[tag] = r + return res + + +def tensorboard_smoothing(values: List[float], + smooth: float = 0.9) -> List[float]: + norm_factor = 1 + x = 0 + res = [] + for i in range(len(values)): + x = x * smooth + values[i] # Exponential decay + res.append(x / norm_factor) + # + norm_factor *= smooth + norm_factor += 1 + return res + + +def plot_image(tb_dir: str, + smooth_key: List[str], + smooth_val: float = 0.9, + figsize: Tuple[int, int] = (8, 5), + dpi: int = 100) -> None: + image_dir = os.path.join(os.path.dirname(tb_dir), 'images') + os.makedirs(image_dir, exist_ok=True) + # + fname = os.listdir(tb_dir)[0] + tb_path = os.path.join(tb_dir, fname) + data = read_tensorboard_file(tb_path) + # + for k in data.keys(): + _data = data[k] + steps = [d['step'] for d in _data] + values = [d['value'] for d in _data] + if len(values) == 0: + continue + _, ax = plt.subplots(1, 1, squeeze=True, figsize=figsize, dpi=dpi) + ax.set_title(k) + if len(values) == 1: + ax.scatter(steps, values, color=COLOR_S) + elif k in smooth_key: + ax.plot(steps, values, color=COLOR) + values_s = tensorboard_smoothing(values, smooth_val) + ax.plot(steps, values_s, color=COLOR_S) + else: + ax.plot(steps, values, color=COLOR_S) + fpath = os.path.join(image_dir, k.replace('/', '_')) + plt.savefig(fpath, dpi=dpi, bbox_inches='tight') 
diff --git a/examples/pytorch/llm/baichuan_infer.py b/examples/pytorch/llm/baichuan_infer.py new file mode 100644 index 00000000..f9a49c09 --- /dev/null +++ b/examples/pytorch/llm/baichuan_infer.py @@ -0,0 +1,62 @@ +# ### Setting up experimental environment. +from _common import * +from transformers import TextStreamer + +device_ids = [0, 1] +logger.info(device_ids) +select_device(device_ids) + +# ### Loading Model and Tokenizer +# Note: You need to set the value of `CKPT_FPATH` +BAICHUAN_TYPE = '13B' # Literal['7B', '13B'] +CKPT_FAPTH = '/path/to/your/xxx.pth' +LORA_TARGET_MODULES = ['W_pack'] + +if BAICHUAN_TYPE == '7B': + model, tokenizer = get_baichuan7B_model_tokenizer() +else: + model, tokenizer = get_baichuan13B_model_tokenizer() +if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id +model.bfloat16() # Consistent with training + +# ### Preparing lora +LORA_RANK = 8 +LORA_ALPHA = 32 +LORA_DROPOUT_P = 0 # Arbitrary value +lora_config = LoRAConfig( + replace_modules=LORA_TARGET_MODULES, + rank=LORA_RANK, + lora_alpha=LORA_ALPHA, + lora_dropout=LORA_DROPOUT_P, + pretrained_weights=CKPT_FAPTH) +logger.info(f'lora_config: {lora_config}') +Swift.prepare_model(model, lora_config) + +# ### Loading Dataset +_, test_dataset = get_alpaca_en_zh_dataset(None, True) + +# ### Inference +streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) +for d in test_dataset[:5]: + output = d['output'] + d['output'] = None + input_ids = tokenize_function(d, tokenizer)['input_ids'] + print(f'[TEST]{tokenizer.decode(input_ids)}', end='') + input_ids = torch.tensor(input_ids)[None].cuda() + attention_mask = torch.ones_like(input_ids) + generate_ids = model.generate( + input_ids=input_ids, + max_new_tokens=512, + attention_mask=attention_mask, + streamer=streamer, + pad_token_id=tokenizer.pad_token_id, + temperature=0.7, + top_k=50, + do_sample=True) + print() + print(f'[LABELS]{output}') + print( + 
'-----------------------------------------------------------------------------------' + ) + # input('next[ENTER]') diff --git a/examples/pytorch/llm/baichuan_sft.py b/examples/pytorch/llm/baichuan_sft.py new file mode 100644 index 00000000..18f71d22 --- /dev/null +++ b/examples/pytorch/llm/baichuan_sft.py @@ -0,0 +1,199 @@ +# ### Setting up experimental environment. +""" +pip install modelscope +pip install numpy pandas matplotlib scikit-learn +pip install transformers datasets +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +pip install tqdm +pip install tensorboard +pip install torchmetrics +pip install sentencepiece +pip install accelerate + +pip install numpy -U # Resolve torchmetrics dependencies and update numpy +""" + +from _common import * + +device_ids = [0, 1, 2, 3] +logger.info(device_ids) +select_device(device_ids) +seed_everything(42) + +# ### Loading Model and Tokenizer +BAICHUAN_TYPE = '13B' # Literal['7B', '13B'] +WORK_DIR = f'runs/baichuan_{BAICHUAN_TYPE}' +LORA_TARGET_MODULES = ['W_pack'] +# +if BAICHUAN_TYPE == '7B': + model_id = 'baichuan-inc/baichuan-7B' + model_dir = get_model_dir(model_id, None) + model, tokenizer = get_baichuan7B_model_tokenizer(model_dir) +else: + model_id = 'baichuan-inc/Baichuan-13B-Base' + model_dir = get_model_dir(model_id, 'v1.0.1') + model, tokenizer = get_baichuan13B_model_tokenizer(model_dir) +# +GRADIENT_CHECKPOINTING = True +if GRADIENT_CHECKPOINTING: + # baichuan13B does not implement the `get_input_embeddings` function + if BAICHUAN_TYPE == '13B': + + def get_input_embeddings(self): + return self.model.embed_tokens + + model.__class__.get_input_embeddings = get_input_embeddings.__get__( + model) + model.gradient_checkpointing_enable() + model.enable_input_require_grads() +if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id +# +logger.info( + f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, ' + 
f'pad_token_id: {tokenizer.pad_token_id}') + +# ### Preparing lora +LORA_RANK = 8 +LORA_ALPHA = 32 +LORA_DROPOUT_P = 0.1 +lora_config = LoRAConfig( + replace_modules=LORA_TARGET_MODULES, + rank=LORA_RANK, + lora_alpha=LORA_ALPHA, + lora_dropout=LORA_DROPOUT_P) +logger.info(f'lora_config: {lora_config}') +Swift.prepare_model(model, lora_config) +# +show_freeze_layers(model) +print_model_info(model) +_p = list(model.parameters())[100] +logger.info(f'device: {_p.device}, dtype: {_p.dtype}') +model.bfloat16() + +# ### Loading Dataset +tokenize_function = partial(tokenize_function, tokenizer=tokenizer) +train_dataset, val_dataset = get_alpaca_en_zh_dataset(tokenize_function) +# Data analysis +stat_dataset(train_dataset) +stat_dataset(val_dataset) +data_collate_fn = partial(data_collate_fn, tokenizer=tokenizer) +print_examples(train_dataset[0], tokenizer) + +# ### Setting Config +cfg_file = os.path.join(model_dir, 'configuration.json') +# +BATCH_SIZE = 1 +MAX_EPOCHS = 1 +T_max = get_T_max(len(train_dataset), BATCH_SIZE, MAX_EPOCHS, True) +WORK_DIR = get_work_dir(WORK_DIR) +EVAL_INTERVAL = 500 +CONFIG = Config({ + 'train': { + 'dataloader': { + 'batch_size_per_gpu': BATCH_SIZE, + 'workers_per_gpu': 1, + 'shuffle': True, + 'drop_last': True, + 'pin_memory': True + }, + 'max_epochs': + MAX_EPOCHS, + 'work_dir': + WORK_DIR, + 'optimizer': { + 'type': 'AdamW', + 'lr': 1e-4, + 'weight_decay': 0.01, + 'options': { + 'cumulative_iters': 16, + 'grad_clip': { + 'norm_type': 2, + 'max_norm': 2.0 + } + } + }, + 'lr_scheduler': { + 'type': 'CosineAnnealingLR', + 'T_max': T_max, + 'eta_min': 1e-5, + 'options': { + 'by_epoch': False, + 'warmup': { + 'type': 'LinearWarmup', + 'warmup_ratio': 0.1, + 'warmup_iters': 200 + } + } + }, + 'hooks': [ + { + 'type': 'CheckpointHook', + 'by_epoch': False, + 'interval': EVAL_INTERVAL, + 'max_checkpoint_num': 1 + }, + { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': EVAL_INTERVAL + }, + { + 'type': 'BestCkptSaverHook', + 
'metric_key': 'acc', + 'save_best': True, + 'rule': 'max', + 'max_checkpoint_num': 1 + }, + { + 'type': 'TextLoggerHook', + 'by_epoch': True, # Whether EpochBasedTrainer is used + 'interval': 5 + }, + { + 'type': 'TensorboardHook', + 'by_epoch': False, + 'interval': 5 + } + ] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': BATCH_SIZE, + 'workers_per_gpu': 1, + 'shuffle': False, + 'drop_last': False, + 'pin_memory': True + }, + 'metrics': [{ + 'type': 'my_metric', + 'vocab_size': tokenizer.vocab_size + }] + } +}) + +# ### Finetuning + + +def cfg_modify_fn(cfg: Config) -> Config: + cfg.update(CONFIG) + return cfg + + +trainer = EpochBasedTrainer( + model=model, + cfg_file=cfg_file, + data_collator=data_collate_fn, + train_dataset=train_dataset, + eval_dataset=val_dataset, + remove_unused_data=True, + seed=42, + device='cpu', # No placement for model, leave the model to `device_map` + cfg_modify_fn=cfg_modify_fn, +) + +trainer.train() + +# ### Visualization +tb_dir = os.path.join(WORK_DIR, 'tensorboard_output') +plot_image(tb_dir, ['loss'], 0.9) diff --git a/examples/pytorch/llm/chatglm2_infer.py b/examples/pytorch/llm/chatglm2_infer.py new file mode 100644 index 00000000..741f9b18 --- /dev/null +++ b/examples/pytorch/llm/chatglm2_infer.py @@ -0,0 +1,60 @@ +# ### Setting up experimental environment. 
+from _common import *
+from transformers import TextStreamer
+
+device_ids = [0, 1]
+logger.info(device_ids)
+select_device(device_ids)
+
+# ### Loading Model and Tokenizer
+# Note: You need to set the value of `CKPT_FPATH`
+CKPT_FPATH = '/path/to/your/xxx.pth'
+LORA_TARGET_MODULES = ['query_key_value']
+
+model, tokenizer = get_chatglm2_model_tokenizer()
+if tokenizer.eos_token_id is None:
+    tokenizer.eos_token_id = tokenizer.pad_token_id
+if tokenizer.bos_token_id is None:
+    tokenizer.bos_token_id = 1
+model.bfloat16()  # Consistent with training
+
+# ### Preparing lora
+LORA_RANK = 8
+LORA_ALPHA = 32
+LORA_DROPOUT_P = 0  # Arbitrary value
+lora_config = LoRAConfig(
+    replace_modules=LORA_TARGET_MODULES,
+    rank=LORA_RANK,
+    lora_alpha=LORA_ALPHA,
+    lora_dropout=LORA_DROPOUT_P,
+    pretrained_weights=CKPT_FPATH)
+logger.info(f'lora_config: {lora_config}')
+Swift.prepare_model(model, lora_config)
+
+# ### Loading Dataset
+_, test_dataset = get_alpaca_en_zh_dataset(None, True)
+
+# ### Inference
+streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+for d in test_dataset[:5]:
+    output = d['output']
+    d['output'] = None
+    input_ids = tokenize_function(d, tokenizer)['input_ids']
+    print(f'[TEST]{tokenizer.decode(input_ids)}', end='')
+    input_ids = torch.tensor(input_ids)[None].cuda()
+    attention_mask = torch.ones_like(input_ids)
+    generate_ids = model.generate(
+        input_ids=input_ids,
+        max_new_tokens=512,
+        attention_mask=attention_mask,
+        streamer=streamer,
+        pad_token_id=tokenizer.pad_token_id,
+        temperature=0.7,
+        top_k=50,
+        do_sample=True)
+    print()
+    print(f'[LABELS]{output}')
+    print(
+        '-----------------------------------------------------------------------------------'
+    )
+    # input('next[ENTER]')
diff --git a/examples/pytorch/llm/chatglm2_sft.py b/examples/pytorch/llm/chatglm2_sft.py
new file mode 100644
index 00000000..ecd497a2
--- /dev/null
+++ b/examples/pytorch/llm/chatglm2_sft.py
@@ -0,0 +1,188 @@
+# ### Setting up experimental
environment. +""" +pip install modelscope +pip install numpy pandas matplotlib scikit-learn +pip install transformers datasets +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +pip install tqdm +pip install tensorboard +pip install torchmetrics +pip install sentencepiece +pip install accelerate + +pip install numpy -U # Resolve torchmetrics dependencies and update numpy +""" + +from _common import * + +device_ids = [0, 1, 2, 3] +logger.info(device_ids) +select_device(device_ids) +seed_everything(42) + +# ### Loading Model and Tokenizer +model_id = 'ZhipuAI/chatglm2-6b' +WORK_DIR = 'runs/chatglm2' +LORA_TARGET_MODULES = ['query_key_value'] +# +model_dir = get_model_dir(model_id, None) +model, tokenizer = get_chatglm2_model_tokenizer(model_dir) +# chatglm2 does not support gradient_checkpointing +GRADIENT_CHECKPOINTING = False +if GRADIENT_CHECKPOINTING: + model.gradient_checkpointing_enable() + model.enable_input_require_grads() +logger.info(tokenizer.special_tokens) +if tokenizer.eos_token_id is None: + tokenizer.eos_token_id = tokenizer.pad_token_id +if tokenizer.bos_token_id is None: + tokenizer.bos_token_id = 1 +# +logger.info( + f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, ' + f'pad_token_id: {tokenizer.pad_token_id}') + +# ### Preparing lora +LORA_RANK = 8 +LORA_ALPHA = 32 +LORA_DROPOUT_P = 0.1 +lora_config = LoRAConfig( + replace_modules=LORA_TARGET_MODULES, + rank=LORA_RANK, + lora_alpha=LORA_ALPHA, + lora_dropout=LORA_DROPOUT_P) +logger.info(f'lora_config: {lora_config}') +Swift.prepare_model(model, lora_config) +# +show_freeze_layers(model) +print_model_info(model) +_p = list(model.parameters())[100] +logger.info(f'device: {_p.device}, dtype: {_p.dtype}') +model.bfloat16() + +# ### Loading Dataset +tokenize_function = partial(tokenize_function, tokenizer=tokenizer) +train_dataset, val_dataset = get_alpaca_en_zh_dataset(tokenize_function) +# Data analysis 
+stat_dataset(train_dataset) +stat_dataset(val_dataset) +data_collate_fn = partial(data_collate_fn, tokenizer=tokenizer) +print_examples(train_dataset[0], tokenizer) + +# ### Setting Config +cfg_file = os.path.join(model_dir, 'configuration.json') +# +BATCH_SIZE = 1 +MAX_EPOCHS = 1 +T_max = get_T_max(len(train_dataset), BATCH_SIZE, MAX_EPOCHS, True) +WORK_DIR = get_work_dir(WORK_DIR) +EVAL_INTERVAL = 500 +CONFIG = Config({ + 'train': { + 'dataloader': { + 'batch_size_per_gpu': BATCH_SIZE, + 'workers_per_gpu': 1, + 'shuffle': True, + 'drop_last': True, + 'pin_memory': True + }, + 'max_epochs': + MAX_EPOCHS, + 'work_dir': + WORK_DIR, + 'optimizer': { + 'type': 'AdamW', + 'lr': 1e-4, + 'weight_decay': 0.01, + 'options': { + 'cumulative_iters': 16, + 'grad_clip': { + 'norm_type': 2, + 'max_norm': 2.0 + } + } + }, + 'lr_scheduler': { + 'type': 'CosineAnnealingLR', + 'T_max': T_max, + 'eta_min': 1e-5, + 'options': { + 'by_epoch': False, + 'warmup': { + 'type': 'LinearWarmup', + 'warmup_ratio': 0.1, + 'warmup_iters': 200 + } + } + }, + 'hooks': [ + { + 'type': 'CheckpointHook', + 'by_epoch': False, + 'interval': EVAL_INTERVAL, + 'max_checkpoint_num': 1 + }, + { + 'type': 'EvaluationHook', + 'by_epoch': False, + 'interval': EVAL_INTERVAL + }, + { + 'type': 'BestCkptSaverHook', + 'metric_key': 'acc', + 'save_best': True, + 'rule': 'max', + 'max_checkpoint_num': 1 + }, + { + 'type': 'TextLoggerHook', + 'by_epoch': True, # Whether EpochBasedTrainer is used + 'interval': 5 + }, + { + 'type': 'TensorboardHook', + 'by_epoch': False, + 'interval': 5 + } + ] + }, + 'evaluation': { + 'dataloader': { + 'batch_size_per_gpu': BATCH_SIZE, + 'workers_per_gpu': 1, + 'shuffle': False, + 'drop_last': False, + 'pin_memory': True + }, + 'metrics': [{ + 'type': 'my_metric', + 'vocab_size': tokenizer.vocab_size + }] + } +}) + +# ### Finetuning + + +def cfg_modify_fn(cfg: Config) -> Config: + cfg.update(CONFIG) + return cfg + + +trainer = EpochBasedTrainer( + model=model, + cfg_file=cfg_file, + 
data_collator=data_collate_fn, + train_dataset=train_dataset, + eval_dataset=val_dataset, + remove_unused_data=True, + seed=42, + device='cpu', # No placement for model, leave the model to `device_map` + cfg_modify_fn=cfg_modify_fn, +) + +trainer.train() + +# ### Visualization +tb_dir = os.path.join(WORK_DIR, 'tensorboard_output') +plot_image(tb_dir, ['loss'], 0.9) diff --git a/examples/pytorch/llm_agent/_common.py b/examples/pytorch/llm_agent/_common.py index dd0cd7d4..04097b50 100644 --- a/examples/pytorch/llm_agent/_common.py +++ b/examples/pytorch/llm_agent/_common.py @@ -111,7 +111,7 @@ def select_device(device_ids: List[int]) -> Device: [str(d) for d in device_ids]) assert torch.cuda.is_available( ) and torch.cuda.device_count() >= len(device_ids) - log_s += f"cuda:{','.join([str(d) for d in device_ids])}" # e.g. "cuda:1,7,8" + log_s += f"cuda:{','.join([str(d) for d in device_ids])}" # e.g. 'cuda:1,7,8' device = 'cuda:0' logger.info(log_s) return torch.device(device) @@ -221,7 +221,7 @@ def print_examples(examples: Dict[str, Any], tokenizer) -> None: print(f'[INPUT_IDS] {tokenizer.decode(input_ids)}') print() print( - f'[LABLES] {tokenizer.decode([l if l != -100 else 0 for l in labels])}' + f'[LABLES] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}' ) @@ -334,8 +334,7 @@ def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None, load_model: bool = True): if model_dir is None: model_id = 'ZhipuAI/chatglm2-6b' - model_revision = 'v1.0.3' - model_dir = snapshot_download(model_id, model_revision) + model_dir = snapshot_download(model_id, None) # config = read_config(model_dir) config['model'] = ConfigDict({'type': 'chatglm2-6b'}) @@ -355,7 +354,7 @@ def make_dataset( Dict[str, Any]] ) -> MyDataset: """ - split: Literal["train", "validation"] + split: Literal['train', 'validation'] """ dataset = MsDataset.load( 'modelscope/ms_hackathon_23_agent_train_dev', split=split) diff --git a/examples/pytorch/llm_agent/baichuan_infer.ipynb 
b/examples/pytorch/llm_agent/baichuan_infer.ipynb index 77719fc1..03f8f46b 100644 --- a/examples/pytorch/llm_agent/baichuan_infer.ipynb +++ b/examples/pytorch/llm_agent/baichuan_infer.ipynb @@ -16,15 +16,6 @@ "### 配置实验环境" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install transformers" - ] - }, { "cell_type": "code", "execution_count": 1, @@ -62,7 +53,7 @@ "source": [ "from _common import *\n", "from transformers import TextStreamer\n", - "device_ids = list(range(min(4, torch.cuda.device_count())))\n", + "device_ids = [0, 1]\n", "logger.info(device_ids)\n", "select_device(device_ids)" ] @@ -152,8 +143,8 @@ } ], "source": [ - "CKPT_FAPTH = \"/home/hackathon/my_git/agent/runs/baichuan/v10-20230702-172449/output_best/pytorch_model.bin\"\n", - "LORA_TARGET_MODULES = [\"W_pack\"]\n", + "CKPT_FAPTH = '/home/hackathon/my_git/agent/runs/baichuan/v10-20230702-172449/output_best/pytorch_model.bin'\n", + "LORA_TARGET_MODULES = ['W_pack']\n", "\n", "model, tokenizer = get_baichuan_model_tokenizer()\n", "if tokenizer.pad_token_id is None:\n", @@ -225,7 +216,7 @@ " lora_alpha=LORA_ALPHA,\n", " lora_dropout=LORA_DROPOUT_P,\n", " pretrained_weights=CKPT_FAPTH)\n", - "logger.info(f\"lora_config: {lora_config}\")\n", + "logger.info(f'lora_config: {lora_config}')\n", "Swift.prepare_model(model, lora_config)" ] }, @@ -289,8 +280,8 @@ } ], "source": [ - "test_dataset = make_dataset(\"validation\", lambda system, user, assistant:\n", - " {\"system\": system, \"user\": user, \"assistant\": assistant})" + "test_dataset = make_dataset('validation', lambda system, user, assistant:\n", + " {'system': system, 'user': user, 'assistant': assistant})" ] }, { @@ -451,20 +442,21 @@ "source": [ "streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "for d in test_dataset[:5]:\n", - " system = d[\"system\"]\n", - " user = d[\"user\"]\n", - " assistant = d[\"assistant\"]\n", - " input_ids = 
tokenize_function(system, user, None, tokenizer)[\"input_ids\"]\n", - " print(f\"[TEST]{tokenizer.decode(input_ids)}\", end=\"\")\n", + " system = d['system']\n", + " user = d['user']\n", + " assistant = d['assistant']\n", + " input_ids = tokenize_function(system, user, None, tokenizer)['input_ids']\n", + " print(f'[TEST]{tokenizer.decode(input_ids)}', end='')\n", " input_ids = torch.tensor(input_ids)[None].cuda()\n", " attention_mask = torch.ones_like(input_ids)\n", " generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n", " attention_mask=attention_mask,\n", - " streamer=streamer, pad_token_id=tokenizer.pad_token_id)\n", + " streamer=streamer, pad_token_id=tokenizer.pad_token_id, \n", + " temperature=0.7, top_k=50, do_sample=True)\n", " print()\n", - " print(f\"[LABELS]{assistant}\")\n", - " print(\"-----------------------------------------------------------------------------------\")\n", - " # input(\"next[ENTER]\")" + " print(f'[LABELS]{assistant}')\n", + " print('-----------------------------------------------------------------------------------')\n", + " # input('next[ENTER]')" ] } ], @@ -484,7 +476,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/pytorch/llm_agent/baichuan_sft.ipynb b/examples/pytorch/llm_agent/baichuan_sft.ipynb index 5e656a24..cb732612 100644 --- a/examples/pytorch/llm_agent/baichuan_sft.ipynb +++ b/examples/pytorch/llm_agent/baichuan_sft.ipynb @@ -36,10 +36,12 @@ "# !pip install modelscope -U\n", "# !pip install numpy pandas matplotlib scikit-learn\n", "# !pip install transformers datasets\n", - "# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n", + "# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n", "# !pip install tqdm\n", "# !pip install tensorboard\n", "# !pip install torchmetrics\n", + "# !pip install 
sentencepiece\n", + "# !pip install accelerate\n", "#\n", "# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy" ] @@ -73,7 +75,7 @@ ], "source": [ "from _common import *\n", - "device_ids = list(range(min(4, torch.cuda.device_count())))\n", + "device_ids = [0, 1, 2, 3]\n", "logger.info(device_ids)\n", "select_device(device_ids)\n", "_ = seed_everything(42)" @@ -130,9 +132,9 @@ } ], "source": [ - "model_id = \"baichuan-inc/baichuan-7B\"\n", - "WORK_DIR = \"runs/baichuan\"\n", - "LORA_TARGET_MODULES = [\"W_pack\"]\n", + "model_id = 'baichuan-inc/baichuan-7B'\n", + "WORK_DIR = 'runs/baichuan'\n", + "LORA_TARGET_MODULES = ['W_pack']\n", "#\n", "model_dir = get_model_dir(model_id, None)\n", "model, tokenizer = get_baichuan_model_tokenizer(model_dir)\n", @@ -144,8 +146,8 @@ "if tokenizer.pad_token_id is None:\n", " tokenizer.pad_token_id = tokenizer.eos_token_id\n", "#\n", - "logger.info(f\"bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, \"\n", - " f\"pad_token_id: {tokenizer.pad_token_id}\")" + "logger.info(f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, '\n", + " f'pad_token_id: {tokenizer.pad_token_id}')" ] }, { @@ -237,13 +239,13 @@ " rank=LORA_RANK,\n", " lora_alpha=LORA_ALPHA,\n", " lora_dropout=LORA_DROPOUT_P)\n", - "logger.info(f\"lora_config: {lora_config}\")\n", + "logger.info(f'lora_config: {lora_config}')\n", "Swift.prepare_model(model, lora_config)\n", "#\n", "show_freeze_layers(model)\n", "print_model_info(model)\n", "_p = list(model.parameters())[100]\n", - "logger.info(f\"device: {_p.device}, dtype: {_p.dtype}\")\n", + "logger.info(f'device: {_p.device}, dtype: {_p.dtype}')\n", "model.bfloat16()" ] }, @@ -308,8 +310,8 @@ ], "source": [ "tokenize_function = partial(tokenize_function, tokenizer=tokenizer)\n", - "train_dataset = make_dataset(\"train\", tokenize_function)\n", - "val_dataset = make_dataset(\"validation\", tokenize_function)\n", + "train_dataset = 
make_dataset('train', tokenize_function)\n", + "val_dataset = make_dataset('validation', tokenize_function)\n", "# Data analysis\n", "stat_dataset(train_dataset)\n", "stat_dataset(val_dataset)\n", @@ -339,7 +341,7 @@ } ], "source": [ - "cfg_file = os.path.join(model_dir, \"configuration.json\")\n", + "cfg_file = os.path.join(model_dir, 'configuration.json')\n", "#\n", "BATCH_SIZE = 1\n", "MAX_EPOCHS = 1\n", @@ -347,62 +349,62 @@ "WORK_DIR = get_work_dir(WORK_DIR)\n", "EVAL_INTERVAL = 200\n", "CONFIG = Config({\n", - " \"train\": {\n", - " \"dataloader\": {\n", - " \"batch_size_per_gpu\": BATCH_SIZE,\n", - " \"workers_per_gpu\": 1,\n", - " \"shuffle\": True,\n", - " \"drop_last\": True,\n", - " \"pin_memory\": True\n", + " 'train': {\n", + " 'dataloader': {\n", + " 'batch_size_per_gpu': BATCH_SIZE,\n", + " 'workers_per_gpu': 1,\n", + " 'shuffle': True,\n", + " 'drop_last': True,\n", + " 'pin_memory': True\n", " },\n", - " \"max_epochs\": MAX_EPOCHS,\n", - " \"work_dir\": WORK_DIR,\n", - " \"optimizer\": {\n", - " \"type\": \"AdamW\",\n", - " \"lr\": 1e-4,\n", - " \"weight_decay\": 0.01,\n", - " \"options\": {\n", - " \"cumulative_iters\": 16, \"grad_clip\": {\n", - " \"norm_type\": 2,\n", - " \"max_norm\": 2.0\n", + " 'max_epochs': MAX_EPOCHS,\n", + " 'work_dir': WORK_DIR,\n", + " 'optimizer': {\n", + " 'type': 'AdamW',\n", + " 'lr': 1e-4,\n", + " 'weight_decay': 0.01,\n", + " 'options': {\n", + " 'cumulative_iters': 16, 'grad_clip': {\n", + " 'norm_type': 2,\n", + " 'max_norm': 2.0\n", " }\n", " }\n", " },\n", - " \"lr_scheduler\": {\n", - " \"type\": \"CosineAnnealingLR\",\n", - " \"T_max\": T_max,\n", - " \"eta_min\": 1e-5,\n", - " \"options\": {\n", - " \"by_epoch\": False,\n", - " \"warmup\": {\n", + " 'lr_scheduler': {\n", + " 'type': 'CosineAnnealingLR',\n", + " 'T_max': T_max,\n", + " 'eta_min': 1e-5,\n", + " 'options': {\n", + " 'by_epoch': False,\n", + " 'warmup': {\n", " 'type': 'LinearWarmup',\n", " 'warmup_ratio': 0.1,\n", - " \"warmup_iters\": 200\n", 
+ " 'warmup_iters': 200\n", " }\n", " }\n", " },\n", - " \"hooks\": [\n", - " {\"type\": \"CheckpointHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL, \"max_checkpoint_num\": 1},\n", - " {\"type\": \"EvaluationHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL},\n", - " {\"type\": \"BestCkptSaverHook\",\n", - " \"metric_key\": \"acc\",\n", - " \"save_best\": True, \"rule\": \"max\", \"max_checkpoint_num\": 1},\n", - " {\"type\": \"TextLoggerHook\",\n", - " \"by_epoch\": True, # Whether EpochBasedTrainer is used\n", - " \"interval\": 5},\n", - " {\"type\": \"TensorboardHook\", \"by_epoch\": False, \"interval\": 5}\n", + " 'hooks': [\n", + " {'type': 'CheckpointHook', 'by_epoch': False, 'interval': EVAL_INTERVAL, 'max_checkpoint_num': 1},\n", + " {'type': 'EvaluationHook', 'by_epoch': False, 'interval': EVAL_INTERVAL},\n", + " {'type': 'BestCkptSaverHook',\n", + " 'metric_key': 'acc',\n", + " 'save_best': True, 'rule': 'max', 'max_checkpoint_num': 1},\n", + " {'type': 'TextLoggerHook',\n", + " 'by_epoch': True, # Whether EpochBasedTrainer is used\n", + " 'interval': 5},\n", + " {'type': 'TensorboardHook', 'by_epoch': False, 'interval': 5}\n", " ]\n", " },\n", - " \"evaluation\": {\n", - " \"dataloader\": {\n", - " \"batch_size_per_gpu\": BATCH_SIZE,\n", - " \"workers_per_gpu\": 1,\n", - " \"shuffle\": False,\n", - " \"drop_last\": False,\n", - " \"pin_memory\": True\n", + " 'evaluation': {\n", + " 'dataloader': {\n", + " 'batch_size_per_gpu': BATCH_SIZE,\n", + " 'workers_per_gpu': 1,\n", + " 'shuffle': False,\n", + " 'drop_last': False,\n", + " 'pin_memory': True\n", " },\n", - " \"metrics\": [\n", - " {\"type\": \"my_metric\", \"vocab_size\": tokenizer.vocab_size}\n", + " 'metrics': [\n", + " {'type': 'my_metric', 'vocab_size': tokenizer.vocab_size}\n", " ]\n", " }\n", "})" @@ -1778,16 +1780,16 @@ } ], "source": [ - "tb_dir = os.path.join(WORK_DIR, \"tensorboard_output\")\n", + "tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')\n", "fname = 
os.listdir(tb_dir)[0]\n", "tb_path = os.path.join(tb_dir, fname)\n", "#\n", "data = read_tensorboard_file(tb_path)\n", "print(data.keys())\n", - "_ = plot_image(data, \"loss\", 0.9)\n", - "_ = plot_image(data, \"lr\", 0)\n", - "_ = plot_image(data, \"evaluation/acc\", 0)\n", - "_ = plot_image(data, \"evaluation/loss\", 0)" + "_ = plot_image(data, 'loss', 0.9)\n", + "_ = plot_image(data, 'lr', 0)\n", + "_ = plot_image(data, 'evaluation/acc', 0)\n", + "_ = plot_image(data, 'evaluation/loss', 0)" ] }, { diff --git a/examples/pytorch/llm_agent/chatglm2_infer.ipynb b/examples/pytorch/llm_agent/chatglm2_infer.ipynb index 29388858..237d27c8 100644 --- a/examples/pytorch/llm_agent/chatglm2_infer.ipynb +++ b/examples/pytorch/llm_agent/chatglm2_infer.ipynb @@ -17,15 +17,6 @@ "The following code is copied from baichuan_infer.ipynb" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install transformers" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -63,7 +54,7 @@ "source": [ "from _common import *\n", "from transformers import TextStreamer\n", - "device_ids = list(range(min(4, torch.cuda.device_count())))\n", + "device_ids = [0, 1]\n", "logger.info(device_ids)\n", "select_device(device_ids)" ] @@ -149,8 +140,8 @@ } ], "source": [ - "CKPT_FAPTH = \"/home/hackathon/my_git/agent/runs/chatglm2/v1-20230702-203505/output_best/pytorch_model.bin\"\n", - "LORA_TARGET_MODULES = [\"query_key_value\"]\n", + "CKPT_FAPTH = '/home/hackathon/my_git/agent/runs/chatglm2/v1-20230702-203505/output_best/pytorch_model.bin'\n", + "LORA_TARGET_MODULES = ['query_key_value']\n", "\n", "model, tokenizer = get_chatglm2_model_tokenizer()\n", "if tokenizer.eos_token_id is None:\n", @@ -230,7 +221,7 @@ " lora_alpha=LORA_ALPHA,\n", " lora_dropout=LORA_DROPOUT_P,\n", " pretrained_weights=CKPT_FAPTH)\n", - "logger.info(f\"lora_config: {lora_config}\")\n", + "logger.info(f'lora_config: {lora_config}')\n", "Swift.prepare_model(model, 
lora_config)" ] }, @@ -295,8 +286,8 @@ } ], "source": [ - "test_dataset = make_dataset(\"validation\", lambda system, user, assistant:\n", - " {\"system\": system, \"user\": user, \"assistant\": assistant})" + "test_dataset = make_dataset('validation', lambda system, user, assistant:\n", + " {'system': system, 'user': user, 'assistant': assistant})" ] }, { @@ -484,20 +475,21 @@ "source": [ "streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "for d in test_dataset[:5]:\n", - " system = d[\"system\"]\n", - " user = d[\"user\"]\n", - " assistant = d[\"assistant\"]\n", - " input_ids = tokenize_function(system, user, None, tokenizer)[\"input_ids\"]\n", - " print(f\"[TEST]{tokenizer.decode(input_ids)}\", end=\"\")\n", + " system = d['system']\n", + " user = d['user']\n", + " assistant = d['assistant']\n", + " input_ids = tokenize_function(system, user, None, tokenizer)['input_ids']\n", + " print(f'[TEST]{tokenizer.decode(input_ids)}', end='')\n", " input_ids = torch.tensor(input_ids)[None].cuda()\n", " attention_mask = torch.ones_like(input_ids)\n", " generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n", " attention_mask=attention_mask,\n", - " streamer=streamer, pad_token_id=tokenizer.pad_token_id)\n", + " streamer=streamer, pad_token_id=tokenizer.pad_token_id, \n", + " temperature=0.7, top_k=50, do_sample=True)\n", " print()\n", - " print(f\"[LABELS]{assistant}\")\n", - " print(\"-----------------------------------------------------------------------------------\")\n", - " # input(\"next[ENTER]\")" + " print(f'[LABELS]{assistant}')\n", + " print('-----------------------------------------------------------------------------------')\n", + " # input('next[ENTER]')" ] } ], @@ -517,7 +509,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.12" }, "orig_nbformat": 4 }, diff --git a/examples/pytorch/llm_agent/chatglm2_sft.ipynb 
b/examples/pytorch/llm_agent/chatglm2_sft.ipynb index 1f9306f1..70d9b8a1 100644 --- a/examples/pytorch/llm_agent/chatglm2_sft.ipynb +++ b/examples/pytorch/llm_agent/chatglm2_sft.ipynb @@ -43,10 +43,12 @@ "# !pip install modelscope -U\n", "# !pip install numpy pandas matplotlib scikit-learn\n", "# !pip install transformers datasets\n", - "# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n", + "# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n", "# !pip install tqdm\n", "# !pip install tensorboard\n", "# !pip install torchmetrics\n", + "# !pip install sentencepiece\n", + "# !pip install accelerate\n", "#\n", "# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy" ] @@ -78,7 +80,7 @@ ], "source": [ "from _common import *\n", - "device_ids = list(range(min(4, torch.cuda.device_count())))\n", + "device_ids = [0, 1, 2, 3]\n", "logger.info(device_ids)\n", "select_device(device_ids)\n", "_ = seed_everything(42)" @@ -134,12 +136,11 @@ } ], "source": [ - "model_id = \"ZhipuAI/chatglm2-6b\"\n", - "model_revision = \"v1.0.3\"\n", - "WORK_DIR = \"runs/chatglm2\"\n", - "LORA_TARGET_MODULES = [\"query_key_value\"]\n", + "model_id = 'ZhipuAI/chatglm2-6b'\n", + "WORK_DIR = 'runs/chatglm2'\n", + "LORA_TARGET_MODULES = ['query_key_value']\n", "#\n", - "model_dir = get_model_dir(model_id, model_revision)\n", + "model_dir = get_model_dir(model_id, None)\n", "model, tokenizer = get_chatglm2_model_tokenizer(model_dir)\n", "# chatglm2 does not support gradient_checkpointing\n", "GRADIENT_CHECKPOINTING = False\n", @@ -152,8 +153,8 @@ "if tokenizer.bos_token_id is None:\n", " tokenizer.bos_token_id = 1\n", "#\n", - "logger.info(f\"bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, \"\n", - " f\"pad_token_id: {tokenizer.pad_token_id}\")" + "logger.info(f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, '\n", + " 
f'pad_token_id: {tokenizer.pad_token_id}')" ] }, { @@ -251,13 +252,13 @@ " rank=LORA_RANK,\n", " lora_alpha=LORA_ALPHA,\n", " lora_dropout=LORA_DROPOUT_P)\n", - "logger.info(f\"lora_config: {lora_config}\")\n", + "logger.info(f'lora_config: {lora_config}')\n", "Swift.prepare_model(model, lora_config)\n", "#\n", "show_freeze_layers(model)\n", "print_model_info(model)\n", "_p = list(model.parameters())[100]\n", - "logger.info(f\"device: {_p.device}, dtype: {_p.dtype}\")\n", + "logger.info(f'device: {_p.device}, dtype: {_p.dtype}')\n", "model.bfloat16()" ] }, @@ -399,8 +400,8 @@ ], "source": [ "tokenize_function = partial(tokenize_function, tokenizer=tokenizer)\n", - "train_dataset = make_dataset(\"train\", tokenize_function)\n", - "val_dataset = make_dataset(\"validation\", tokenize_function)\n", + "train_dataset = make_dataset('train', tokenize_function)\n", + "val_dataset = make_dataset('validation', tokenize_function)\n", "# Data analysis\n", "stat_dataset(train_dataset)\n", "stat_dataset(val_dataset)\n", @@ -431,7 +432,7 @@ } ], "source": [ - "cfg_file = os.path.join(model_dir, \"configuration.json\")\n", + "cfg_file = os.path.join(model_dir, 'configuration.json')\n", "#\n", "BATCH_SIZE = 1\n", "MAX_EPOCHS = 1\n", @@ -439,62 +440,62 @@ "WORK_DIR = get_work_dir(WORK_DIR)\n", "EVAL_INTERVAL = 200\n", "CONFIG = Config({\n", - " \"train\": {\n", - " \"dataloader\": {\n", - " \"batch_size_per_gpu\": BATCH_SIZE,\n", - " \"workers_per_gpu\": 1,\n", - " \"shuffle\": True,\n", - " \"drop_last\": True,\n", - " \"pin_memory\": True\n", + " 'train': {\n", + " 'dataloader': {\n", + " 'batch_size_per_gpu': BATCH_SIZE,\n", + " 'workers_per_gpu': 1,\n", + " 'shuffle': True,\n", + " 'drop_last': True,\n", + " 'pin_memory': True\n", " },\n", - " \"max_epochs\": MAX_EPOCHS,\n", - " \"work_dir\": WORK_DIR,\n", - " \"optimizer\": {\n", - " \"type\": \"AdamW\",\n", - " \"lr\": 1e-4,\n", - " \"weight_decay\": 0.01,\n", - " \"options\": {\n", - " \"cumulative_iters\": 16, \"grad_clip\": 
{\n", - " \"norm_type\": 2,\n", - " \"max_norm\": 2.0\n", + " 'max_epochs': MAX_EPOCHS,\n", + " 'work_dir': WORK_DIR,\n", + " 'optimizer': {\n", + " 'type': 'AdamW',\n", + " 'lr': 1e-4,\n", + " 'weight_decay': 0.01,\n", + " 'options': {\n", + " 'cumulative_iters': 16, 'grad_clip': {\n", + " 'norm_type': 2,\n", + " 'max_norm': 2.0\n", " }\n", " }\n", " },\n", - " \"lr_scheduler\": {\n", - " \"type\": \"CosineAnnealingLR\",\n", - " \"T_max\": T_max,\n", - " \"eta_min\": 1e-5,\n", - " \"options\": {\n", - " \"by_epoch\": False,\n", - " \"warmup\": {\n", + " 'lr_scheduler': {\n", + " 'type': 'CosineAnnealingLR',\n", + " 'T_max': T_max,\n", + " 'eta_min': 1e-5,\n", + " 'options': {\n", + " 'by_epoch': False,\n", + " 'warmup': {\n", " 'type': 'LinearWarmup',\n", " 'warmup_ratio': 0.1,\n", - " \"warmup_iters\": 200\n", + " 'warmup_iters': 200\n", " }\n", " }\n", " },\n", - " \"hooks\": [\n", - " {\"type\": \"CheckpointHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL, \"max_checkpoint_num\": 1},\n", - " {\"type\": \"EvaluationHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL},\n", - " {\"type\": \"BestCkptSaverHook\",\n", - " \"metric_key\": \"acc\",\n", - " \"save_best\": True, \"rule\": \"max\", \"max_checkpoint_num\": 1},\n", - " {\"type\": \"TextLoggerHook\",\n", - " \"by_epoch\": True, # Whether EpochBasedTrainer is used\n", - " \"interval\": 5},\n", - " {\"type\": \"TensorboardHook\", \"by_epoch\": False, \"interval\": 5}\n", + " 'hooks': [\n", + " {'type': 'CheckpointHook', 'by_epoch': False, 'interval': EVAL_INTERVAL, 'max_checkpoint_num': 1},\n", + " {'type': 'EvaluationHook', 'by_epoch': False, 'interval': EVAL_INTERVAL},\n", + " {'type': 'BestCkptSaverHook',\n", + " 'metric_key': 'acc',\n", + " 'save_best': True, 'rule': 'max', 'max_checkpoint_num': 1},\n", + " {'type': 'TextLoggerHook',\n", + " 'by_epoch': True, # Whether EpochBasedTrainer is used\n", + " 'interval': 5},\n", + " {'type': 'TensorboardHook', 'by_epoch': False, 'interval': 5}\n", " 
]\n", " },\n", - " \"evaluation\": {\n", - " \"dataloader\": {\n", - " \"batch_size_per_gpu\": BATCH_SIZE,\n", - " \"workers_per_gpu\": 1,\n", - " \"shuffle\": False,\n", - " \"drop_last\": False,\n", - " \"pin_memory\": True\n", + " 'evaluation': {\n", + " 'dataloader': {\n", + " 'batch_size_per_gpu': BATCH_SIZE,\n", + " 'workers_per_gpu': 1,\n", + " 'shuffle': False,\n", + " 'drop_last': False,\n", + " 'pin_memory': True\n", " },\n", - " \"metrics\": [\n", - " {\"type\": \"my_metric\", \"vocab_size\": tokenizer.vocab_size}\n", + " 'metrics': [\n", + " {'type': 'my_metric', 'vocab_size': tokenizer.vocab_size}\n", " ]\n", " }\n", "})" @@ -1884,16 +1885,16 @@ } ], "source": [ - "tb_dir = os.path.join(WORK_DIR, \"tensorboard_output\")\n", + "tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')\n", "fname = os.listdir(tb_dir)[0]\n", "tb_path = os.path.join(tb_dir, fname)\n", "#\n", "data = read_tensorboard_file(tb_path)\n", "print(data.keys())\n", - "_ = plot_image(data, \"loss\", 0.9)\n", - "_ = plot_image(data, \"lr\", 0)\n", - "_ = plot_image(data, \"evaluation/acc\", 0)\n", - "_ = plot_image(data, \"evaluation/loss\", 0)" + "_ = plot_image(data, 'loss', 0.9)\n", + "_ = plot_image(data, 'lr', 0)\n", + "_ = plot_image(data, 'evaluation/acc', 0)\n", + "_ = plot_image(data, 'evaluation/loss', 0)" ] }, { diff --git a/setup.cfg b/setup.cfg index bfee5eec..80d07c5a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,7 +21,7 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids [flake8] max-line-length = 120 select = B,C,E,F,P,T4,W,B9 -ignore = F401,F405,F821,W503,E251 +ignore = F401,F403,F405,F821,W503,E251 exclude = docs/src,*.pyi,.git [darglint] From de32a8f3e69c0ff88a2ccd43e6758ccffc9ebe02 Mon Sep 17 00:00:00 2001 From: "xingjun.wang" Date: Tue, 11 Jul 2023 17:48:22 +0800 Subject: [PATCH 6/6] pre commit --- modelscope/msdatasets/download/dataset_builder.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git 
a/modelscope/msdatasets/download/dataset_builder.py b/modelscope/msdatasets/download/dataset_builder.py index ad5ebbcf..0c5c4154 100644 --- a/modelscope/msdatasets/download/dataset_builder.py +++ b/modelscope/msdatasets/download/dataset_builder.py @@ -223,18 +223,21 @@ class CsvDatasetBuilder(csv.Csv): if field_name.endswith(':FILE'): transform_fields.append(field_name) - base_extracted_dir: Union[str, list] = self.split_path_dict.get(split_name, '') + base_extracted_dir: Union[str, list] = self.split_path_dict.get( + split_name, '') for field_name in transform_fields: - if isinstance(base_extracted_dir, list) and len(base_extracted_dir) > 0: + if isinstance(base_extracted_dir, + list) and len(base_extracted_dir) > 0: if df.shape[0] != len(base_extracted_dir): logger.error( f"Number of lines in meta-csv file for split '{split_name}' ({df.shape[0]}) " - f"does not match number of data-files({len(base_extracted_dir)})!" + f'does not match number of data-files({len(base_extracted_dir)})!' ) else: df[field_name] = base_extracted_dir elif isinstance(base_extracted_dir, str) and base_extracted_dir: - df[field_name] = df[field_name].apply(lambda x: os.path.join(base_extracted_dir, x)) + df[field_name] = df[field_name].apply( + lambda x: os.path.join(base_extracted_dir, x)) else: logger.warning(f'Nothing to do for field {field_name}')