Merge branch 'master-github' into master-merge-github-230728
@@ -17,7 +17,7 @@ class InferArguments:
         default='lora', metadata={'choices': ['lora', 'full']})
     ckpt_path: str = '/path/to/your/iter_xxx.pth'
     eval_human: bool = False  # False: eval test_dataset
-    ignore_args_error: bool = True  # False: notebook compatibility
+    ignore_args_error: bool = False  # True: notebook compatibility

     dataset: str = field(
         default='alpaca-en,alpaca-zh',
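The new default makes strict argument checking the norm; the flag exists because notebook kernels inject extra argv entries (for example Jupyter's -f). A hedged sketch of how such a flag is typically honored (the parsing code itself is not part of this diff, and the names below are illustrative):

    import argparse

    def parse_args(argv=None, ignore_args_error: bool = False):
        parser = argparse.ArgumentParser()
        parser.add_argument('--sft_type', default='lora', choices=['lora', 'full'])
        args, remaining = parser.parse_known_args(argv)
        if remaining and not ignore_args_error:
            # Strict mode (the new default): unknown argv entries are an error.
            raise ValueError(f'unknown arguments: {remaining}')
        return args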
@@ -96,7 +96,7 @@ def llm_infer(args: InferArguments) -> None:
             inference(input_ids, model, tokenizer, streamer, generation_config)
             print('-' * 80)
     else:
-        dataset = get_dataset(args.dataset)
+        dataset = get_dataset(args.dataset.split(','))
         _, test_dataset = process_dataset(dataset, args.dataset_test_size,
                                           args.dataset_sample,
                                           args.dataset_seed)
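The --dataset flag stays a single comma-separated string; only the call site changes, splitting before the lookup. A minimal illustration, using the default value from InferArguments above:

    args_dataset = 'alpaca-en,alpaca-zh'         # InferArguments.dataset default
    dataset_name_list = args_dataset.split(',')  # ['alpaca-en', 'alpaca-zh']
    # get_dataset now receives this list instead of the raw string.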
@@ -30,7 +30,7 @@ class SftArguments:
     # baichuan-7b: 'lora': 16G; 'full': 80G
     sft_type: str = field(
         default='lora', metadata={'choices': ['lora', 'full']})
-    ignore_args_error: bool = True  # False: notebook compatibility
+    ignore_args_error: bool = False  # True: notebook compatibility

     dataset: str = field(
         default='alpaca-en,alpaca-zh',
@@ -121,7 +121,7 @@ def llm_sft(args: SftArguments) -> None:
         logger.info(f'device: {_p.device}, dtype: {_p.dtype}')

     # ### Loading Dataset
-    dataset = get_dataset(args.dataset)
+    dataset = get_dataset(args.dataset.split(','))
     train_dataset, val_dataset = process_dataset(dataset,
                                                  args.dataset_test_size,
                                                  args.dataset_sample,
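process_dataset is outside this diff; judging from the call, it subsamples and then splits into train/validation sets. A minimal sketch of a splitter with this signature, assuming HuggingFace datasets (the function name and behavior below are assumptions, not the project's code):

    from datasets import Dataset as HfDataset

    def process_dataset_sketch(dataset: HfDataset, test_size: float,
                               sample: int, seed: int):
        # Assumed behavior: optionally subsample, then split train/validation.
        if sample >= 0:
            dataset = dataset.shuffle(seed=seed).select(range(sample))
        split = dataset.train_test_split(test_size=test_size, seed=seed)
        return split['train'], split['test']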
@@ -1,5 +1,3 @@
-from _parser import *
-
 from .dataset import *
 from .models import *
 from .utils import *
@@ -1,4 +1,4 @@
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple

 import numpy as np
 from datasets import Dataset as HfDataset
@@ -62,8 +62,7 @@ DATASET_MAPPER = {
 }


-def get_dataset(dataset_names: str) -> HfDataset:
-    dataset_name_list = dataset_names.split(',')
+def get_dataset(dataset_name_list: List[str]) -> HfDataset:
     dataset_list = []
     for dataset_name in dataset_name_list:
         get_function = DATASET_MAPPER[dataset_name]
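Only the signature changes here; the split(',') moves out to the callers shown earlier. The visible loop suggests the rest of the function concatenates one sub-dataset per registered name; a hedged completion with a stub registry (the merge step is assumed, not visible in the hunk):

    from datasets import Dataset as HfDataset, concatenate_datasets

    # Stub standing in for the real DATASET_MAPPER.
    DATASET_MAPPER = {
        'toy': lambda: HfDataset.from_dict({'instruction': ['hi'],
                                            'output': ['hello']}),
    }

    def get_dataset(dataset_name_list):
        dataset_list = []
        for dataset_name in dataset_name_list:
            get_function = DATASET_MAPPER[dataset_name]
            dataset_list.append(get_function())
        return concatenate_datasets(dataset_list)  # assumed merge step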
@@ -1,4 +1,6 @@
-from typing import NamedTuple
+import os
+import sys
+from typing import Any, Dict, NamedTuple, Optional

 import torch
 from torch import dtype as Dtype
@@ -10,22 +12,18 @@ from modelscope.models.nlp.chatglm2 import ChatGLM2Config, ChatGLM2Tokenizer
 logger = get_logger()


-def _add_special_token(tokenizer):
-    if tokenizer.eos_token_id is None:
-        tokenizer.eos_token_id = 2
-    if tokenizer.bos_token_id is None:
-        tokenizer.bos_token_id = 1
-    if tokenizer.pad_token_id is None:
-        tokenizer.pad_token_id = 0
-    logger.info(f'bos_token_id: {tokenizer.bos_token_id}, '
-                f'eos_token_id: {tokenizer.eos_token_id}, '
-                f'pad_token_id: {tokenizer.pad_token_id}')
+def _add_special_token(tokenizer,
+                       special_token_mapper: Dict[str, Any]) -> None:
+    for k, v in special_token_mapper.items():
+        setattr(tokenizer, k, v)
+    assert tokenizer.eos_token is not None
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token


 def get_model_tokenizer_default(model_dir: str,
-                                load_model: bool = True,
-                                add_special_token: bool = True,
-                                torch_dtype: Dtype = torch.float16):
+                                torch_dtype: Dtype,
+                                load_model: bool = True):
     """load from an independent repository"""
     model_config = AutoConfig.from_pretrained(
         model_dir, trust_remote_code=True)
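The rewritten helper trusts the registry: per-model overrides arrive via special_token_mapper, and the only built-in fallback is pad to eos. A minimal sketch of the behavior with a stand-in tokenizer object (not from the commit):

    from types import SimpleNamespace

    tok = SimpleNamespace(eos_token='</s>', pad_token=None)
    for k, v in {'bos_token': '<s>'}.items():  # a special_token_mapper entry
        setattr(tok, k, v)
    assert tok.eos_token is not None
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token          # pad falls back to eos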
@@ -41,16 +39,12 @@ def get_model_tokenizer_default(model_dir: str,
             device_map='auto',
             torch_dtype=torch_dtype,
             trust_remote_code=True)

-    if add_special_token:
-        _add_special_token(tokenizer)
     return model, tokenizer


 def get_model_tokenizer_chatglm2(model_dir: str,
-                                 load_model: bool = True,
-                                 add_special_token: bool = True,
-                                 torch_dtype: Dtype = torch.float16):
+                                 torch_dtype: Dtype,
+                                 load_model: bool = True):
     """load from ms library"""
     config = read_config(model_dir)
     logger.info(config)
@@ -66,8 +60,6 @@ def get_model_tokenizer_chatglm2(model_dir: str,
             config=model_config,
             device_map='auto',
             torch_dtype=torch_dtype)
-    if add_special_token:
-        _add_special_token(tokenizer)
     return model, tokenizer

@@ -79,18 +71,21 @@ class LoRATM(NamedTuple):


 # Reference: 'https://modelscope.cn/models/{model_id}/summary'
+# keys: 'model_id', 'revision', 'torch_dtype', 'get_function',
+#   'ignore_file_pattern', 'special_token_mapper', 'lora_TM'
 MODEL_MAPPER = {
     'baichuan-7b': {
-        'model_id': 'baichuan-inc/baichuan-7B',
+        'model_id': 'baichuan-inc/baichuan-7B',  # model id or model dir
         'revision': 'v1.0.7',
         'lora_TM': LoRATM.baichuan
     },
-    'chatglm2': {
+    'baichuan-13b': {
+        'model_id': 'baichuan-inc/Baichuan-13B-Base',
+        'revision': 'v1.0.3',
+        'torch_dtype': torch.bfloat16,
+        'lora_TM': LoRATM.baichuan
+    },
+    'chatglm2-6b': {
         'model_id': 'ZhipuAI/chatglm2-6b',
         'revision': 'v1.0.6',
         'get_function': get_model_tokenizer_chatglm2,
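With the key list documented, supporting a model becomes a registry entry. A hypothetical entry for illustration (every value below is invented; LoRATM is the named tuple defined above):

    import torch

    MODEL_MAPPER['my-model'] = {
        'model_id': 'my-org/my-model',   # hub model id, or a local model dir
        'revision': 'v1.0.0',            # ignored when model_id is a local path
        'torch_dtype': torch.bfloat16,   # optional per-model default dtype
        'special_token_mapper': {},      # optional tokenizer attribute overrides
        'lora_TM': LoRATM.baichuan,      # LoRA target modules
    }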
@@ -116,18 +111,25 @@ MODEL_MAPPER = {


 def get_model_tokenizer(model_type: str,
-                        load_model: bool = True,
-                        add_special_token: bool = True,
-                        torch_dtype: Dtype = torch.float16):
+                        torch_dtype: Optional[Dtype] = None,
+                        load_model: bool = True):
     data = MODEL_MAPPER.get(model_type)
     if data is None:
         raise ValueError(f'model_type: {model_type}')

     model_id = data['model_id']
-    revision = data.get('revision', 'master')
     get_function = data.get('get_function', get_model_tokenizer_default)
     ignore_file_pattern = data.get('ignore_file_pattern', [])
-    model_dir = snapshot_download(
-        model_id, revision, ignore_file_pattern=ignore_file_pattern)
-    model, tokenizer = get_function(model_dir, load_model, add_special_token,
-                                    torch_dtype)
+    special_token_mapper = data.get('special_token_mapper', {})
+    if torch_dtype is None:
+        torch_dtype = data.get('torch_dtype', torch.float16)
+
+    model_dir = model_id
+    if not os.path.exists(model_id):
+        revision = data.get('revision', 'master')
+        model_dir = snapshot_download(
+            model_id, revision, ignore_file_pattern=ignore_file_pattern)
+
+    model, tokenizer = get_function(model_dir, torch_dtype, load_model)
+    _add_special_token(tokenizer, special_token_mapper)
+    return model, tokenizer, model_dir
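Call sites gain the resolved checkpoint directory as a third return value, and dtype resolution becomes: explicit argument, then the mapper's 'torch_dtype', then torch.float16. A usage sketch under the new signature:

    model, tokenizer, model_dir = get_model_tokenizer('baichuan-7b')
    # torch_dtype=None (the default) defers to MODEL_MAPPER, so
    # 'baichuan-13b' resolves to torch.bfloat16, the others to torch.float16.
    # If 'model_id' points at an existing local path, snapshot_download is
    # skipped and model_dir == model_id.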
@@ -7,7 +7,7 @@ import sys
 from dataclasses import dataclass, field
 from functools import partial
 from types import MethodType
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Counter, Dict, List, Optional, Tuple, Union

 import matplotlib.pyplot as plt
 import numpy as np
@@ -152,10 +152,9 @@ def print_example(example: Dict[str, Any], tokenizer) -> None:
     print(f'[INPUT_IDS] {input_ids}')
     print(f'[INPUT] {tokenizer.decode(input_ids)}')
     print()
+    n_mask = Counter(labels)[-100]
     print(f'[LABELS_IDS] {labels}')
-    print(
-        f'[LABELS] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}'
-    )
+    print(f'[LABELS] <-100 * {n_mask}>{tokenizer.decode(labels[n_mask:])}')


 def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]:
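The one-line replacement assumes the -100 labels form a contiguous prefix (the masked prompt), so decoding can start at labels[n_mask:]; Counter counts -100 anywhere, so a masked token in the middle would shift the slice. A worked example of the intended case:

    from collections import Counter

    labels = [-100, -100, -100, 314, 1101, 257]  # 3 masked prompt positions
    n_mask = Counter(labels)[-100]               # -> 3
    assert labels[n_mask:] == [314, 1101, 257]   # only response tokens decoded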
@@ -198,10 +197,10 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None:
     logger.info(''.join(s))


-def show_freeze_layers(model: Module, max_lines: int = 20) -> None:
+def show_freeze_layers(model: Module, max_lines: Optional[int] = 20) -> None:
     named_p = list(model.named_parameters())
     for i, (n, p) in enumerate(named_p):
-        if i >= max_lines:
+        if max_lines is not None and i >= max_lines:
             logger.info('...')
             break
         logger.info(f'{n}: requires_grad={p.requires_grad}')
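With max_lines now Optional, passing None disables truncation entirely. A minimal usage sketch (toy module; show_freeze_layers as defined above):

    from torch import nn

    model = nn.Linear(4, 4)
    show_freeze_layers(model)                  # at most 20 rows, then '...'
    show_freeze_layers(model, max_lines=None)  # every parameter row, no cutoff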
@@ -611,10 +611,6 @@ def save_pretrained(model,
         raise Exception(
             'At least pass in one checkpoint name for saving method')

-    # Clean the folder from a previous save
-    if os.path.exists(target_folder):
-        rmtree(target_folder)
-
     # Single ckpt path, sharded ckpt logic will be added later
     output_ckpt_path = os.path.join(target_folder, save_checkpoint_name)
@@ -629,7 +625,8 @@ def save_pretrained(model,
         copytree(
             model.model_dir,
             target_folder,
-            ignore=ignore_patterns(*ignore_file_set))
+            ignore=ignore_patterns(*ignore_file_set),
+            dirs_exist_ok=True)

         # Save the ckpt to the save directory
         try:
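These two save_pretrained hunks are one change: instead of rmtree-ing a previous save, copytree now merges into an existing destination. Note that dirs_exist_ok requires Python 3.8+. A standalone demonstration of the shutil behavior:

    import os
    from shutil import copytree, ignore_patterns
    from tempfile import mkdtemp

    src, dst = mkdtemp(), mkdtemp()            # dst already exists
    open(os.path.join(src, 'config.json'), 'w').close()
    copytree(src, dst,
             ignore=ignore_patterns('*.bin'),  # skip ignored files, as above
             dirs_exist_ok=True)               # no FileExistsError for dst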
@@ -357,6 +357,5 @@ def all_gather(data, group=None):


 def is_on_same_device(model: torch.nn.Module) -> bool:
-    device_set = set(map(lambda p: p.device.type,
-                         model.parameters())) - {'cpu'}
+    device_set = set(str(p.device) for p in model.parameters()) - {'cpu'}
     return len(device_set) <= 1
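The old check used p.device.type, which collapses cuda:0 and cuda:1 into a single 'cuda' entry, so a model sharded across GPUs still passed; str(p.device) keeps the index. A small demonstration of the distinction (runs without a GPU):

    import torch

    d0, d1 = torch.device('cuda:0'), torch.device('cuda:1')
    assert {d0.type, d1.type} == {'cuda'}              # old: looks like one device
    assert {str(d0), str(d1)} == {'cuda:0', 'cuda:1'}  # new: two distinct devices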