# modelscope/examples/pytorch/llm/llm_sft.py
# ### Setting up experimental environment.
"""
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y
pip install sentencepiece charset_normalizer cpm_kernels tiktoken -U
pip install transformers datasets scikit-learn -U
pip install matplotlib tqdm tensorboard torchmetrics -U
pip install accelerate transformers_stream_generator -U
# Install the latest version of modelscope from source
git clone https://github.com/modelscope/modelscope.git
cd modelscope
pip install -r requirements.txt
pip install .
"""
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import warnings
from dataclasses import dataclass, field
from functools import partial
from typing import List, Optional
import torch
from swift import LoRAConfig, Swift
from torch import Tensor
from utils import (DATASET_MAPPING, DEFAULT_PROMPT, MODEL_MAPPING,
                   data_collate_fn, get_dataset, get_model_tokenizer,
                   get_T_max, get_work_dir, parse_args, plot_images,
                   print_example, print_model_info, process_dataset,
                   seed_everything, show_freeze_layers, stat_dataset,
                   tokenize_function)
from modelscope import get_logger
from modelscope.trainers import EpochBasedTrainer
from modelscope.utils.config import Config
warnings.warn(
    'This directory has been migrated to '
    'https://github.com/modelscope/swift/tree/main/examples/pytorch/llm, '
    'and the files in this directory are no longer maintained.',
    DeprecationWarning)
logger = get_logger()
@dataclass
class SftArguments:
    seed: int = 42
    model_type: str = field(
        default='qwen-7b', metadata={'choices': list(MODEL_MAPPING.keys())})
    # baichuan-7b: 'lora': 16G; 'full': 80G
    sft_type: str = field(
        default='lora', metadata={'choices': ['lora', 'full']})
    output_dir: Optional[str] = None
    ignore_args_error: bool = False  # True: notebook compatibility
    dataset: str = field(
        default='alpaca-en,alpaca-zh',
        metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
    dataset_seed: int = 42
    dataset_sample: int = 20000  # -1: all dataset
    dataset_test_size: float = 0.01
    prompt: str = DEFAULT_PROMPT
    max_length: Optional[int] = 2048

    lora_target_modules: Optional[List[str]] = None
    lora_rank: int = 8
    lora_alpha: int = 32
    lora_dropout_p: float = 0.1

    gradient_checkpoint: bool = True
    batch_size: int = 1
    max_epochs: int = 1
    learning_rate: Optional[float] = None
    weight_decay: float = 0.01
    n_accumulate_grad: int = 16
    grad_clip_norm: float = 1.
    warmup_iters: int = 200

    save_trainer_state: Optional[bool] = None
    eval_interval: int = 500
    last_save_interval: Optional[int] = None
    last_max_checkpoint_num: int = 1
    best_max_checkpoint_num: int = 1
    logging_interval: int = 5
    tb_interval: int = 5

    # other
    use_flash_attn: Optional[bool] = field(
        default=None,
        metadata={
            'help': "This parameter is used only when model_type='qwen-7b'"
        })
    def __post_init__(self):
        if self.sft_type == 'lora':
            if self.learning_rate is None:
                self.learning_rate = 1e-4
            if self.save_trainer_state is None:
                self.save_trainer_state = True
            if self.last_save_interval is None:
                self.last_save_interval = self.eval_interval
        elif self.sft_type == 'full':
            if self.learning_rate is None:
                self.learning_rate = 1e-5
            if self.save_trainer_state is None:
                self.save_trainer_state = False  # save disk space
            if self.last_save_interval is None:
                # Saving the model takes a long time
                self.last_save_interval = self.eval_interval * 4
        else:
            raise ValueError(f'sft_type: {self.sft_type}')

        if self.output_dir is None:
            self.output_dir = 'runs'
        self.output_dir = os.path.join(self.output_dir, self.model_type)

        if self.lora_target_modules is None:
            self.lora_target_modules = MODEL_MAPPING[
                self.model_type]['lora_TM']
        if self.use_flash_attn is None:
            self.use_flash_attn = 'auto'
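# Note: with the defaults above, the effective batch size per optimizer step
# is batch_size * n_accumulate_grad = 1 * 16 = 16.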
def llm_sft(args: SftArguments) -> None:
    seed_everything(args.seed)

    # ### Loading Model and Tokenizer
    support_bf16 = torch.cuda.is_bf16_supported()
    if not support_bf16:
        logger.warning(f'support_bf16: {support_bf16}')

    kwargs = {'low_cpu_mem_usage': True, 'device_map': 'auto'}
    if args.model_type == 'qwen-7b':
        kwargs['use_flash_attn'] = args.use_flash_attn
    model, tokenizer, model_dir = get_model_tokenizer(
        args.model_type, torch_dtype=torch.bfloat16, **kwargs)

    if args.gradient_checkpoint:
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()
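        # enable_input_require_grads() above makes the embedding outputs
        # require grad, so gradient checkpointing still produces gradients
        # when the base model's weights are frozen (e.g. LoRA-only training).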
    # ### Preparing LoRA
    if args.sft_type == 'lora':
        lora_config = LoRAConfig(
            target_modules=args.lora_target_modules,
            r=args.lora_rank,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout_p)
        logger.info(f'lora_config: {lora_config}')
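        # LoRA freezes the base weights and trains low-rank adapter matrices;
        # with the defaults above (r=8, lora_alpha=32) the adapter update is
        # typically scaled by lora_alpha / r = 4.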
        model = Swift.prepare_model(model, lora_config)

    show_freeze_layers(model)
    print_model_info(model)
    # check the device and dtype of the model
    _p: Tensor = list(model.parameters())[-1]
    logger.info(f'device: {_p.device}, dtype: {_p.dtype}')

    # ### Loading Dataset
    dataset = get_dataset(args.dataset.split(','))
    train_dataset, val_dataset = process_dataset(
        dataset, args.dataset_test_size, args.dataset_sample,
        args.dataset_seed)
    tokenize_func = partial(
        tokenize_function,
        tokenizer=tokenizer,
        prompt=args.prompt,
        max_length=args.max_length)
    train_dataset = train_dataset.map(tokenize_func)
    val_dataset = val_dataset.map(tokenize_func)
    del dataset

    # Data analysis
    stat_dataset(train_dataset)
    stat_dataset(val_dataset)
    data_collator = partial(data_collate_fn, tokenizer=tokenizer)
    print_example(train_dataset[0], tokenizer)
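    # After the .map() calls above, both datasets should contain the token ids
    # and labels produced by tokenize_function (see utils.py); stat_dataset and
    # print_example are sanity checks on sequence lengths and the rendered prompt.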
    # ### Setting Config
    cfg_file = os.path.join(model_dir, 'configuration.json')
    T_max = get_T_max(
        len(train_dataset), args.batch_size, args.max_epochs, True)
    work_dir = get_work_dir(args.output_dir)
    config = Config({
        'train': {
            'dataloader': {
                'batch_size_per_gpu': args.batch_size,
                'workers_per_gpu': 1,
                'shuffle': True,
                'drop_last': True,
                'pin_memory': True
            },
            'max_epochs': args.max_epochs,
            'work_dir': work_dir,
            'optimizer': {
                'type': 'AdamW',
                'lr': args.learning_rate,
                'weight_decay': args.weight_decay,
                'options': {
                    'cumulative_iters': args.n_accumulate_grad,
                    'grad_clip': {
                        'norm_type': 2,
                        'max_norm': args.grad_clip_norm
                    }
                }
            },
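            # `cumulative_iters` above implements gradient accumulation: the
            # optimizer only steps once every `n_accumulate_grad` batches.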
            'lr_scheduler': {
                'type': 'CosineAnnealingLR',
                'T_max': T_max,
                'eta_min': args.learning_rate * 0.1,
                'options': {
                    'by_epoch': False,
                    'warmup': {
                        'type': 'LinearWarmup',
                        'warmup_ratio': 0.1,
                        'warmup_iters': args.warmup_iters
                    }
                }
            },
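            # Cosine decay from `learning_rate` down to `eta_min` (10% of it),
            # preceded by a linear warmup over the first `warmup_iters` steps.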
            'hooks': [
                {
                    'type': 'CheckpointHook',
                    'by_epoch': False,
                    'interval': args.last_save_interval,
                    'max_checkpoint_num': args.last_max_checkpoint_num,
                    'save_trainer_state': args.save_trainer_state
                },
                {
                    'type': 'EvaluationHook',
                    'by_epoch': False,
                    'interval': args.eval_interval
                },
                {
                    'type': 'BestCkptSaverHook',
                    'metric_key': 'loss',
                    'save_best': True,
                    'rule': 'min',
                    'max_checkpoint_num': args.best_max_checkpoint_num,
                    'save_trainer_state': args.save_trainer_state
                },
                {
                    'type': 'TextLoggerHook',
                    'by_epoch': True,  # Whether EpochBasedTrainer is used
                    'interval': args.logging_interval
                },
                {
                    'type': 'TensorboardHook',
                    'by_epoch': False,
                    'interval': args.tb_interval
                }
            ]
        },
        'evaluation': {
            'dataloader': {
                'batch_size_per_gpu': args.batch_size,
                'workers_per_gpu': 1,
                'shuffle': False,
                'drop_last': False,
                'pin_memory': True
            },
            'metrics': [{
                'type': 'my_metric',
                'vocab_size': tokenizer.vocab_size
            }]
        }
    })
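    # Taken together, the hooks above keep the most recent and the best
    # (lowest eval loss) checkpoints, run evaluation every `eval_interval`
    # iterations, and log metrics to the console and to TensorBoard.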
    # ### Finetuning
    def cfg_modify_fn(cfg: Config) -> Config:
        cfg.update(config)
        return cfg

    trainer = EpochBasedTrainer(
        model=model,
        cfg_file=cfg_file,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        remove_unused_data=True,
        seed=42,
        cfg_modify_fn=cfg_modify_fn,
    )
    trainer.train()

    # ### Visualization
    tb_dir = os.path.join(work_dir, 'tensorboard_output')
    plot_images(tb_dir, ['loss'], 0.9)
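    # Checkpoints, trainer logs and the TensorBoard event files read by
    # plot_images are all written under `work_dir`, the run directory created
    # by get_work_dir(args.output_dir) above.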
if __name__ == '__main__':
    args, remaining_argv = parse_args(SftArguments)
    if len(remaining_argv) > 0:
        if args.ignore_args_error:
            logger.warning(f'remaining_argv: {remaining_argv}')
        else:
            raise ValueError(f'remaining_argv: {remaining_argv}')
    llm_sft(args)
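# Multi-GPU note: `device_map='auto'` (passed to get_model_tokenizer above)
# lets the model be sharded across all visible GPUs; restrict them via the
# commented CUDA_VISIBLE_DEVICES line near the top of this file.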