diff --git a/examples/pytorch/llm/llm_infer.py b/examples/pytorch/llm/llm_infer.py index 614e3d36..d496186f 100644 --- a/examples/pytorch/llm/llm_infer.py +++ b/examples/pytorch/llm/llm_infer.py @@ -17,7 +17,7 @@ class InferArguments: default='lora', metadata={'choices': ['lora', 'full']}) ckpt_path: str = '/path/to/your/iter_xxx.pth' eval_human: bool = False # False: eval test_dataset - ignore_args_error: bool = True # False: notebook compatibility + ignore_args_error: bool = False # True: notebook compatibility dataset: str = field( default='alpaca-en,alpaca-zh', @@ -96,7 +96,7 @@ def llm_infer(args: InferArguments) -> None: inference(input_ids, model, tokenizer, streamer, generation_config) print('-' * 80) else: - dataset = get_dataset(args.dataset) + dataset = get_dataset(args.dataset.split(',')) _, test_dataset = process_dataset(dataset, args.dataset_test_size, args.dataset_sample, args.dataset_seed) diff --git a/examples/pytorch/llm/llm_sft.py b/examples/pytorch/llm/llm_sft.py index a7dabf77..3fad52bb 100644 --- a/examples/pytorch/llm/llm_sft.py +++ b/examples/pytorch/llm/llm_sft.py @@ -30,7 +30,7 @@ class SftArguments: # baichuan-7b: 'lora': 16G; 'full': 80G sft_type: str = field( default='lora', metadata={'choices': ['lora', 'full']}) - ignore_args_error: bool = True # False: notebook compatibility + ignore_args_error: bool = False # True: notebook compatibility dataset: str = field( default='alpaca-en,alpaca-zh', @@ -121,7 +121,7 @@ def llm_sft(args: SftArguments) -> None: logger.info(f'device: {_p.device}, dtype: {_p.dtype}') # ### Loading Dataset - dataset = get_dataset(args.dataset) + dataset = get_dataset(args.dataset.split(',')) train_dataset, val_dataset = process_dataset(dataset, args.dataset_test_size, args.dataset_sample, diff --git a/examples/pytorch/llm/utils/__init__.py b/examples/pytorch/llm/utils/__init__.py index e4772c03..c5051a97 100644 --- a/examples/pytorch/llm/utils/__init__.py +++ b/examples/pytorch/llm/utils/__init__.py @@ -1,5 +1,3 
@@ -from _parser import * - from .dataset import * from .models import * from .utils import * diff --git a/examples/pytorch/llm/utils/dataset.py b/examples/pytorch/llm/utils/dataset.py index 3035ba78..619e3fbc 100644 --- a/examples/pytorch/llm/utils/dataset.py +++ b/examples/pytorch/llm/utils/dataset.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import List, Optional, Tuple import numpy as np from datasets import Dataset as HfDataset @@ -62,8 +62,7 @@ DATASET_MAPPER = { } -def get_dataset(dataset_names: str) -> HfDataset: - dataset_name_list = dataset_names.split(',') +def get_dataset(dataset_name_list: List[str]) -> HfDataset: dataset_list = [] for dataset_name in dataset_name_list: get_function = DATASET_MAPPER[dataset_name] diff --git a/examples/pytorch/llm/utils/models.py b/examples/pytorch/llm/utils/models.py index c95df561..9613581c 100644 --- a/examples/pytorch/llm/utils/models.py +++ b/examples/pytorch/llm/utils/models.py @@ -1,4 +1,6 @@ -from typing import NamedTuple +import os +import sys +from typing import Any, Dict, NamedTuple, Optional import torch from torch import dtype as Dtype @@ -10,22 +12,18 @@ from modelscope.models.nlp.chatglm2 import ChatGLM2Config, ChatGLM2Tokenizer logger = get_logger() -def _add_special_token(tokenizer): - if tokenizer.eos_token_id is None: - tokenizer.eos_token_id = 2 - if tokenizer.bos_token_id is None: - tokenizer.bos_token_id = 1 - if tokenizer.pad_token_id is None: - tokenizer.pad_token_id = 0 - logger.info(f'bos_token_id: {tokenizer.bos_token_id}, ' - f'eos_token_id: {tokenizer.eos_token_id}, ' - f'pad_token_id: {tokenizer.pad_token_id}') +def _add_special_token(tokenizer, special_token_mapper: Dict[str, +Any]) -> None: + for k, v in special_token_mapper.items(): + setattr(tokenizer, k, v) + assert tokenizer.eos_token is not None + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token def get_model_tokenizer_default(model_dir: str, - load_model: bool = True, - add_special_token: 
bool = True, - torch_dtype: Dtype = torch.float16): + torch_dtype: Dtype, + load_model: bool = True): """load from an independent repository""" model_config = AutoConfig.from_pretrained( model_dir, trust_remote_code=True) @@ -41,16 +39,12 @@ def get_model_tokenizer_default(model_dir: str, device_map='auto', torch_dtype=torch_dtype, trust_remote_code=True) - - if add_special_token: - _add_special_token(tokenizer) return model, tokenizer def get_model_tokenizer_chatglm2(model_dir: str, - load_model: bool = True, - add_special_token: bool = True, - torch_dtype: Dtype = torch.float16): + torch_dtype: Dtype, + load_model: bool = True): """load from ms library""" config = read_config(model_dir) logger.info(config) @@ -66,8 +60,6 @@ def get_model_tokenizer_chatglm2(model_dir: str, config=model_config, device_map='auto', torch_dtype=torch_dtype) - if add_special_token: - _add_special_token(tokenizer) return model, tokenizer @@ -79,18 +71,21 @@ class LoRATM(NamedTuple): # Reference: 'https://modelscope.cn/models/{model_id}/summary' +# keys: 'model_id', 'revision', 'torch_dtype', 'get_function', +# 'ignore_file_pattern', 'special_token_mapper', 'lora_TM' MODEL_MAPPER = { 'baichuan-7b': { - 'model_id': 'baichuan-inc/baichuan-7B', + 'model_id': 'baichuan-inc/baichuan-7B', # model id or model dir 'revision': 'v1.0.7', 'lora_TM': LoRATM.baichuan }, 'baichuan-13b': { 'model_id': 'baichuan-inc/Baichuan-13B-Base', 'revision': 'v1.0.3', + 'torch_dtype': torch.bfloat16, 'lora_TM': LoRATM.baichuan }, - 'chatglm2': { + 'chatglm2-6b': { 'model_id': 'ZhipuAI/chatglm2-6b', 'revision': 'v1.0.6', 'get_function': get_model_tokenizer_chatglm2, @@ -116,18 +111,25 @@ MODEL_MAPPER = { def get_model_tokenizer(model_type: str, - load_model: bool = True, - add_special_token: bool = True, - torch_dtype: Dtype = torch.float16): + torch_dtype: Optional[Dtype] = None, + load_model: bool = True): data = MODEL_MAPPER.get(model_type) if data is None: raise ValueError(f'model_type: {model_type}') + 
model_id = data['model_id'] - revision = data.get('revision', 'master') get_function = data.get('get_function', get_model_tokenizer_default) ignore_file_pattern = data.get('ignore_file_pattern', []) - model_dir = snapshot_download( - model_id, revision, ignore_file_pattern=ignore_file_pattern) - model, tokenizer = get_function(model_dir, load_model, add_special_token, - torch_dtype) + special_token_mapper = data.get('special_token_mapper', {}) + if torch_dtype is None: + torch_dtype = data.get('torch_dtype', torch.float16) + + model_dir = model_id + if not os.path.exists(model_id): + revision = data.get('revision', 'master') + model_dir = snapshot_download( + model_id, revision, ignore_file_pattern=ignore_file_pattern) + + model, tokenizer = get_function(model_dir, torch_dtype, load_model) + _add_special_token(tokenizer, special_token_mapper) return model, tokenizer, model_dir diff --git a/examples/pytorch/llm/utils/utils.py b/examples/pytorch/llm/utils/utils.py index 5b8ee163..3542c82a 100644 --- a/examples/pytorch/llm/utils/utils.py +++ b/examples/pytorch/llm/utils/utils.py @@ -7,7 +7,7 @@ import sys from dataclasses import dataclass, field from functools import partial from types import MethodType -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Counter, Dict, List, Optional, Tuple, Union import matplotlib.pyplot as plt import numpy as np @@ -152,10 +152,9 @@ def print_example(example: Dict[str, Any], tokenizer) -> None: print(f'[INPUT_IDS] {input_ids}') print(f'[INPUT] {tokenizer.decode(input_ids)}') print() + n_mask = Counter(labels)[-100] print(f'[LABLES_IDS] {labels}') - print( - f'[LABLES] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}' - ) + print(f'[LABLES] <-100 * {n_mask}>{tokenizer.decode(labels[n_mask:])}') def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]: @@ -198,10 +197,10 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None: 
logger.info(''.join(s)) -def show_freeze_layers(model: Module, max_lines: int = 20) -> None: +def show_freeze_layers(model: Module, max_lines: Optional[int] = 20) -> None: named_p = list(model.named_parameters()) for i, (n, p) in enumerate(named_p): - if i >= max_lines: + if max_lines is not None and i >= max_lines: logger.info('...') break logger.info(f'{n}: requires_grad={p.requires_grad}') diff --git a/modelscope/utils/checkpoint.py b/modelscope/utils/checkpoint.py index 147b80e9..0d27782b 100644 --- a/modelscope/utils/checkpoint.py +++ b/modelscope/utils/checkpoint.py @@ -611,10 +611,6 @@ def save_pretrained(model, raise Exception( 'At least pass in one checkpoint name for saving method') - # Clean the folder from a previous save - if os.path.exists(target_folder): - rmtree(target_folder) - # Single ckpt path, sharded ckpt logic will be added later output_ckpt_path = os.path.join(target_folder, save_checkpoint_name) @@ -629,7 +625,8 @@ def save_pretrained(model, copytree( model.model_dir, target_folder, - ignore=ignore_patterns(*ignore_file_set)) + ignore=ignore_patterns(*ignore_file_set), + dirs_exist_ok=True) # Save the ckpt to the save directory try: diff --git a/modelscope/utils/torch_utils.py b/modelscope/utils/torch_utils.py index 1a673458..fb2d1265 100644 --- a/modelscope/utils/torch_utils.py +++ b/modelscope/utils/torch_utils.py @@ -357,6 +357,5 @@ def all_gather(data, group=None): def is_on_same_device(model: torch.nn.Module) -> bool: - device_set = set(map(lambda p: p.device.type, - model.parameters())) - {'cpu'} + device_set = set(str(p.device) for p in model.parameters()) - {'cpu'} return len(device_set) <= 1