modelscope/examples/pytorch/llm/llm_sft.py

# ### Setting up experimental environment.
"""
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
pip install sentencepiece charset_normalizer cpm_kernels tiktoken -U
pip install transformers datasets scikit-learn -U
pip install matplotlib tqdm tensorboard torchmetrics -U
pip install accelerate transformers_stream_generator -U
# Install the latest version of modelscope from source
git clone https://github.com/modelscope/modelscope.git
cd modelscope
pip install -r requirements.txt
pip install .
"""
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import warnings
from dataclasses import dataclass, field
from functools import partial
from typing import List, Optional

import torch
from swift import LoRAConfig, Swift
from torch import Tensor
from utils import (DATASET_MAPPING, DEFAULT_PROMPT, MODEL_MAPPING,
                   data_collate_fn, get_dataset, get_model_tokenizer,
                   get_T_max, get_work_dir, parse_args, plot_images,
                   print_example, print_model_info, process_dataset,
                   seed_everything, show_freeze_layers, stat_dataset,
                   tokenize_function)

from modelscope import get_logger
from modelscope.trainers import EpochBasedTrainer
from modelscope.utils.config import Config

warnings.warn(
    'This directory has been migrated to '
    'https://github.com/modelscope/swift/tree/main/examples/pytorch/llm, '
    'and the files in this directory are no longer maintained.',
    DeprecationWarning)

logger = get_logger()


@dataclass
class SftArguments:
    seed: int = 42
    model_type: str = field(
        default='qwen-7b', metadata={'choices': list(MODEL_MAPPING.keys())})
    # baichuan-7b: 'lora': 16G; 'full': 80G
    sft_type: str = field(
        default='lora', metadata={'choices': ['lora', 'full']})
    output_dir: Optional[str] = None
    ignore_args_error: bool = False  # True: notebook compatibility
    dataset: str = field(
        default='alpaca-en,alpaca-zh',
        metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
    dataset_seed: int = 42
    dataset_sample: int = 20000  # -1: all dataset
    dataset_test_size: float = 0.01
    prompt: str = DEFAULT_PROMPT
    max_length: Optional[int] = 2048
    lora_target_modules: Optional[List[str]] = None
    lora_rank: int = 8
    lora_alpha: int = 32
    lora_dropout_p: float = 0.1
    gradient_checkpoint: bool = True
    batch_size: int = 1
    max_epochs: int = 1
    learning_rate: Optional[float] = None
    weight_decay: float = 0.01
    n_accumulate_grad: int = 16
    grad_clip_norm: float = 1.
    warmup_iters: int = 200
    save_trainer_state: Optional[bool] = None
    eval_interval: int = 500
    last_save_interval: Optional[int] = None
    last_max_checkpoint_num: int = 1
    best_max_checkpoint_num: int = 1
    logging_interval: int = 5
    tb_interval: int = 5
    # other
    use_flash_attn: Optional[bool] = field(
        default=None,
        metadata={
            'help': "This parameter is used only when model_type='qwen-7b'"
        })

    def __post_init__(self):
        if self.sft_type == 'lora':
            if self.learning_rate is None:
                self.learning_rate = 1e-4
            if self.save_trainer_state is None:
                self.save_trainer_state = True
            if self.last_save_interval is None:
                self.last_save_interval = self.eval_interval
        elif self.sft_type == 'full':
            if self.learning_rate is None:
                self.learning_rate = 1e-5
            if self.save_trainer_state is None:
                self.save_trainer_state = False  # save disk space
            if self.last_save_interval is None:
                # Saving the model takes a long time
                self.last_save_interval = self.eval_interval * 4
        else:
            raise ValueError(f'sft_type: {self.sft_type}')

        if self.output_dir is None:
            self.output_dir = 'runs'
        self.output_dir = os.path.join(self.output_dir, self.model_type)

        if self.lora_target_modules is None:
            self.lora_target_modules = MODEL_MAPPING[
                self.model_type]['lora_TM']
        if self.use_flash_attn is None:
            self.use_flash_attn = 'auto'
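
# Example (hypothetical values): SftArguments can also be constructed directly,
# e.g. in a notebook, instead of through the CLI parsing in __main__ below:
#   args = SftArguments(model_type='qwen-7b', sft_type='lora',
#                       dataset='alpaca-en,alpaca-zh')
#   llm_sft(args)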


def llm_sft(args: SftArguments) -> None:
    seed_everything(args.seed)

    # ### Loading Model and Tokenizer
    support_bf16 = torch.cuda.is_bf16_supported()
    if not support_bf16:
        logger.warning(f'support_bf16: {support_bf16}')

    kwargs = {'low_cpu_mem_usage': True, 'device_map': 'auto'}
    if args.model_type == 'qwen-7b':
        kwargs['use_flash_attn'] = args.use_flash_attn
    model, tokenizer, model_dir = get_model_tokenizer(
        args.model_type, torch_dtype=torch.bfloat16, **kwargs)

    if args.gradient_checkpoint:
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()
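
    # Gradient checkpointing trades compute for memory; enable_input_require_grads()
    # keeps gradients flowing into the checkpointed blocks even when the base
    # weights are frozen, as they are with LoRA below.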

    # ### Preparing lora
    if args.sft_type == 'lora':
        lora_config = LoRAConfig(
            target_modules=args.lora_target_modules,
            r=args.lora_rank,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout_p)
        logger.info(f'lora_config: {lora_config}')
        model = Swift.prepare_model(model, lora_config)

    show_freeze_layers(model)
    print_model_info(model)
    # check the device and dtype of the model
    _p: Tensor = list(model.parameters())[-1]
    logger.info(f'device: {_p.device}, dtype: {_p.dtype}')
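    # With device_map='auto' the weights may be sharded across the visible GPUs;
    # the check above only reports where the last parameter was placed.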

    # ### Loading Dataset
    dataset = get_dataset(args.dataset.split(','))
    train_dataset, val_dataset = process_dataset(dataset,
                                                 args.dataset_test_size,
                                                 args.dataset_sample,
                                                 args.dataset_seed)
    tokenize_func = partial(
        tokenize_function,
        tokenizer=tokenizer,
        prompt=args.prompt,
        max_length=args.max_length)
    train_dataset = train_dataset.map(tokenize_func)
    val_dataset = val_dataset.map(tokenize_func)
    del dataset

    # Data analysis
    stat_dataset(train_dataset)
    stat_dataset(val_dataset)
    data_collator = partial(data_collate_fn, tokenizer=tokenizer)
    print_example(train_dataset[0], tokenizer)
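    # stat_dataset and print_example come from the local utils module; they are
    # assumed to report token-length statistics and print one decoded example
    # as a sanity check of the prompt formatting.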

    # ### Setting Config
    cfg_file = os.path.join(model_dir, 'configuration.json')
    T_max = get_T_max(
        len(train_dataset), args.batch_size, args.max_epochs, True)
    work_dir = get_work_dir(args.output_dir)
    config = Config({
        'train': {
            'dataloader': {
                'batch_size_per_gpu': args.batch_size,
                'workers_per_gpu': 1,
                'shuffle': True,
                'drop_last': True,
                'pin_memory': True
            },
            'max_epochs': args.max_epochs,
            'work_dir': work_dir,
            'optimizer': {
                'type': 'AdamW',
                'lr': args.learning_rate,
                'weight_decay': args.weight_decay,
                'options': {
                    'cumulative_iters': args.n_accumulate_grad,
                    'grad_clip': {
                        'norm_type': 2,
                        'max_norm': args.grad_clip_norm
                    }
                }
            },
            'lr_scheduler': {
                'type': 'CosineAnnealingLR',
                'T_max': T_max,
                'eta_min': args.learning_rate * 0.1,
                'options': {
                    'by_epoch': False,
                    'warmup': {
                        'type': 'LinearWarmup',
                        'warmup_ratio': 0.1,
                        'warmup_iters': args.warmup_iters
                    }
                }
            },
            'hooks': [
                {
                    'type': 'CheckpointHook',
                    'by_epoch': False,
                    'interval': args.last_save_interval,
                    'max_checkpoint_num': args.last_max_checkpoint_num,
                    'save_trainer_state': args.save_trainer_state
                },
                {
                    'type': 'EvaluationHook',
                    'by_epoch': False,
                    'interval': args.eval_interval
                },
                {
                    'type': 'BestCkptSaverHook',
                    'metric_key': 'loss',
                    'save_best': True,
                    'rule': 'min',
                    'max_checkpoint_num': args.best_max_checkpoint_num,
                    'save_trainer_state': args.save_trainer_state
                },
                {
                    'type': 'TextLoggerHook',
                    'by_epoch': True,  # Whether EpochBasedTrainer is used
                    'interval': args.logging_interval
                },
                {
                    'type': 'TensorboardHook',
                    'by_epoch': False,
                    'interval': args.tb_interval
                }
            ]
        },
        'evaluation': {
            'dataloader': {
                'batch_size_per_gpu': args.batch_size,
                'workers_per_gpu': 1,
                'shuffle': False,
                'drop_last': False,
                'pin_memory': True
            },
            'metrics': [{
                'type': 'my_metric',
                'vocab_size': tokenizer.vocab_size
            }]
        }
    })
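    # cfg_modify_fn below merges the Config above into the model's default
    # configuration.json (cfg_file), so these training settings take precedence.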

    # ### Finetuning
    def cfg_modify_fn(cfg: Config) -> Config:
        cfg.update(config)
        return cfg

    trainer = EpochBasedTrainer(
        model=model,
        cfg_file=cfg_file,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        remove_unused_data=True,
        seed=42,
        cfg_modify_fn=cfg_modify_fn,
    )
    trainer.train()

    # ### Visualization
    tb_dir = os.path.join(work_dir, 'tensorboard_output')
    plot_images(tb_dir, ['loss'], 0.9)
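    # plot_images (from utils) is assumed to read the tensorboard event files in
    # tb_dir and plot the 'loss' curve, with 0.9 as a smoothing factor.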


if __name__ == '__main__':
    args, remaining_argv = parse_args(SftArguments)
    if len(remaining_argv) > 0:
        if args.ignore_args_error:
            logger.warning(f'remaining_argv: {remaining_argv}')
        else:
            raise ValueError(f'remaining_argv: {remaining_argv}')
    llm_sft(args)
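
# Example invocation (assuming parse_args exposes the SftArguments fields as
# command-line flags; the values below are illustrative):
#   python llm_sft.py --model_type qwen-7b --sft_type lora --dataset alpaca-en,alpaca-zh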