Merge branch 'master-github' into master-merge-github-230728

This commit is contained in:
suluyan.sly
2023-07-29 10:42:54 +08:00
8 changed files with 48 additions and 54 deletions

View File

@@ -17,7 +17,7 @@ class InferArguments:
default='lora', metadata={'choices': ['lora', 'full']})
ckpt_path: str = '/path/to/your/iter_xxx.pth'
eval_human: bool = False # False: eval test_dataset
ignore_args_error: bool = True # False: notebook compatibility
ignore_args_error: bool = False # True: notebook compatibility
dataset: str = field(
default='alpaca-en,alpaca-zh',
@@ -96,7 +96,7 @@ def llm_infer(args: InferArguments) -> None:
inference(input_ids, model, tokenizer, streamer, generation_config)
print('-' * 80)
else:
dataset = get_dataset(args.dataset)
dataset = get_dataset(args.dataset.split(','))
_, test_dataset = process_dataset(dataset, args.dataset_test_size,
args.dataset_sample,
args.dataset_seed)

View File

@@ -30,7 +30,7 @@ class SftArguments:
# baichuan-7b: 'lora': 16G; 'full': 80G
sft_type: str = field(
default='lora', metadata={'choices': ['lora', 'full']})
ignore_args_error: bool = True # False: notebook compatibility
ignore_args_error: bool = False # True: notebook compatibility
dataset: str = field(
default='alpaca-en,alpaca-zh',
@@ -121,7 +121,7 @@ def llm_sft(args: SftArguments) -> None:
logger.info(f'device: {_p.device}, dtype: {_p.dtype}')
# ### Loading Dataset
dataset = get_dataset(args.dataset)
dataset = get_dataset(args.dataset.split(','))
train_dataset, val_dataset = process_dataset(dataset,
args.dataset_test_size,
args.dataset_sample,

View File

@@ -1,5 +1,3 @@
from _parser import *
from .dataset import *
from .models import *
from .utils import *

View File

@@ -1,4 +1,4 @@
from typing import Optional, Tuple
from typing import List, Optional, Tuple
import numpy as np
from datasets import Dataset as HfDataset
@@ -62,8 +62,7 @@ DATASET_MAPPER = {
}
def get_dataset(dataset_names: str) -> HfDataset:
dataset_name_list = dataset_names.split(',')
def get_dataset(dataset_name_list: List[str]) -> HfDataset:
dataset_list = []
for dataset_name in dataset_name_list:
get_function = DATASET_MAPPER[dataset_name]

View File

@@ -1,4 +1,6 @@
from typing import NamedTuple
import os
import sys
from typing import Any, Dict, NamedTuple, Optional
import torch
from torch import dtype as Dtype
@@ -10,22 +12,18 @@ from modelscope.models.nlp.chatglm2 import ChatGLM2Config, ChatGLM2Tokenizer
logger = get_logger()
def _add_special_token(tokenizer):
if tokenizer.eos_token_id is None:
tokenizer.eos_token_id = 2
if tokenizer.bos_token_id is None:
tokenizer.bos_token_id = 1
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = 0
logger.info(f'bos_token_id: {tokenizer.bos_token_id}, '
f'eos_token_id: {tokenizer.eos_token_id}, '
f'pad_token_id: {tokenizer.pad_token_id}')
def _add_special_token(tokenizer, special_token_mapper: Dict[str,
Any]) -> None:
for k, v in special_token_mapper:
setattr(tokenizer, k, v)
assert tokenizer.eos_token is not None
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
def get_model_tokenizer_default(model_dir: str,
load_model: bool = True,
add_special_token: bool = True,
torch_dtype: Dtype = torch.float16):
torch_dtype: Dtype,
load_model: bool = True):
"""load from an independent repository"""
model_config = AutoConfig.from_pretrained(
model_dir, trust_remote_code=True)
@@ -41,16 +39,12 @@ def get_model_tokenizer_default(model_dir: str,
device_map='auto',
torch_dtype=torch_dtype,
trust_remote_code=True)
if add_special_token:
_add_special_token(tokenizer)
return model, tokenizer
def get_model_tokenizer_chatglm2(model_dir: str,
load_model: bool = True,
add_special_token: bool = True,
torch_dtype: Dtype = torch.float16):
torch_dtype: Dtype,
load_model: bool = True):
"""load from ms library"""
config = read_config(model_dir)
logger.info(config)
@@ -66,8 +60,6 @@ def get_model_tokenizer_chatglm2(model_dir: str,
config=model_config,
device_map='auto',
torch_dtype=torch_dtype)
if add_special_token:
_add_special_token(tokenizer)
return model, tokenizer
@@ -79,18 +71,21 @@ class LoRATM(NamedTuple):
# Reference: 'https://modelscope.cn/models/{model_id}/summary'
# keys: 'model_id', 'revision', 'torch_dtype', 'get_function',
# 'ignore_file_pattern', 'special_token_mapper', 'lora_TM'
MODEL_MAPPER = {
'baichuan-7b': {
'model_id': 'baichuan-inc/baichuan-7B',
'model_id': 'baichuan-inc/baichuan-7B', # model id or model dir
'revision': 'v1.0.7',
'lora_TM': LoRATM.baichuan
},
'baichuan-13b': {
'model_id': 'baichuan-inc/Baichuan-13B-Base',
'revision': 'v1.0.3',
'torch_dtype': torch.bfloat16,
'lora_TM': LoRATM.baichuan
},
'chatglm2': {
'chatglm2-6b': {
'model_id': 'ZhipuAI/chatglm2-6b',
'revision': 'v1.0.6',
'get_function': get_model_tokenizer_chatglm2,
@@ -116,18 +111,25 @@ MODEL_MAPPER = {
def get_model_tokenizer(model_type: str,
load_model: bool = True,
add_special_token: bool = True,
torch_dtype: Dtype = torch.float16):
torch_dtype: Optional[Dtype] = None,
load_model: bool = True):
data = MODEL_MAPPER.get(model_type)
if data is None:
raise ValueError(f'model_type: {model_type}')
model_id = data['model_id']
revision = data.get('revision', 'master')
get_function = data.get('get_function', get_model_tokenizer_default)
ignore_file_pattern = data.get('ignore_file_pattern', [])
model_dir = snapshot_download(
model_id, revision, ignore_file_pattern=ignore_file_pattern)
model, tokenizer = get_function(model_dir, load_model, add_special_token,
torch_dtype)
special_token_mapper = data.get('special_token_mapper', {})
if torch_dtype is None:
torch_dtype = data.get('torch_dtype', torch.float16)
model_dir = model_id
if not os.path.exists(model_id):
revision = data.get('revision', 'master')
model_dir = snapshot_download(
model_id, revision, ignore_file_pattern=ignore_file_pattern)
model, tokenizer = get_function(model_dir, torch_dtype, load_model)
_add_special_token(tokenizer, special_token_mapper)
return model, tokenizer, model_dir

View File

@@ -7,7 +7,7 @@ import sys
from dataclasses import dataclass, field
from functools import partial
from types import MethodType
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Counter, Dict, List, Optional, Tuple, Union
import matplotlib.pyplot as plt
import numpy as np
@@ -152,10 +152,9 @@ def print_example(example: Dict[str, Any], tokenizer) -> None:
print(f'[INPUT_IDS] {input_ids}')
print(f'[INPUT] {tokenizer.decode(input_ids)}')
print()
n_mask = Counter(labels)[-100]
print(f'[LABLES_IDS] {labels}')
print(
f'[LABLES] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}'
)
print(f'[LABLES] <-100 * {n_mask}>{tokenizer.decode(labels[n_mask:])}')
def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]:
@@ -198,10 +197,10 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None:
logger.info(''.join(s))
def show_freeze_layers(model: Module, max_lines: int = 20) -> None:
def show_freeze_layers(model: Module, max_lines: Optional[int] = 20) -> None:
named_p = list(model.named_parameters())
for i, (n, p) in enumerate(named_p):
if i >= max_lines:
if max_lines is not None and i >= max_lines:
logger.info('...')
break
logger.info(f'{n}: requires_grad={p.requires_grad}')

View File

@@ -611,10 +611,6 @@ def save_pretrained(model,
raise Exception(
'At least pass in one checkpoint name for saving method')
# Clean the folder from a previous save
if os.path.exists(target_folder):
rmtree(target_folder)
# Single ckpt path, sharded ckpt logic will be added later
output_ckpt_path = os.path.join(target_folder, save_checkpoint_name)
@@ -629,7 +625,8 @@ def save_pretrained(model,
copytree(
model.model_dir,
target_folder,
ignore=ignore_patterns(*ignore_file_set))
ignore=ignore_patterns(*ignore_file_set),
dirs_exist_ok=True)
# Save the ckpt to the save directory
try:

View File

@@ -357,6 +357,5 @@ def all_gather(data, group=None):
def is_on_same_device(model: torch.nn.Module) -> bool:
device_set = set(map(lambda p: p.device.type,
model.parameters())) - {'cpu'}
device_set = set(str(p.device) for p in model.parameters()) - {'cpu'}
return len(device_set) <= 1