Mirror of https://github.com/modelscope/modelscope.git
Support LoRA for LLaMA
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13080086

* Support LoRA for LLaMA
* Update Baichuan
* Remove work_dir
* Fix bugs:
  1. Change ConfigDict to a plain list when the 'hooks' key is not in the config
  2. Ignore all bin files when preparing the output folder
* Support device_map; remove the cast to float when using LoRA
* Add inference file
* Add comments
* Support device_map
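The pattern the diffs below add is small: download and load the base model (optionally sharded with device_map instead of being moved by the trainer), attach a LoRAConfig via Swift.prepare_model, and keep the weights in bf16 rather than casting back to float. A minimal sketch, assembled only from names that appear in the diffs below (the model id comes from examples/pytorch/llama/run_train_lora.sh); it is not a drop-in replacement for the full scripts:

    # Sketch of the LoRA setup added by this commit; module names and
    # hyperparameters are copied from the llama fine-tuning diff below.
    from modelscope import snapshot_download
    from modelscope.models.nlp.llama import LlamaForTextGeneration
    from modelscope.swift import Swift
    from modelscope.swift.lora import LoRAConfig

    model_path = snapshot_download('skyline2006/llama-7b')
    # device_map='auto' shards/offloads the model; the trainer is later given
    # device='cpu' so it does not move the model a second time.
    model = LlamaForTextGeneration.from_pretrained(model_path, device_map='auto')

    lora_config = LoRAConfig(
        replace_modules=['q_proj', 'k_proj', 'v_proj'],  # attention projections
        rank=32,
        lora_alpha=32,
        lora_dropout=0.05)
    model = model.bfloat16()  # train in bf16; no cast back to float when using LoRA
    Swift.prepare_model(model, lora_config)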
examples/pytorch/baichuan/default_offload_opt_param.json (new file, 44 lines)
@@ -0,0 +1,44 @@
{
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "total_num_steps": "auto",
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 10000000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
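This ZeRO stage-2 configuration offloads optimizer and parameter state to CPU and leaves batch size, learning rate, and bucket sizes as "auto" so the DeepSpeed integration can resolve them from the trainer at run time. The scripts below consume it through a DeepspeedHook; roughly, a sketch that mirrors their cfg_modify_fn (the default file path argument here is illustrative):

    # Sketch mirroring cfg_modify_fn in the finetune scripts below: register a
    # DeepspeedHook that points at this JSON file.
    def cfg_modify_fn(cfg, deepspeed_json='default_offload_opt_param.json'):
        if 'hooks' not in cfg.train:
            cfg.train['hooks'] = []  # plain list, per the ConfigDict fix in this commit
        cfg.train.hooks.append({
            'type': 'DeepspeedHook',
            'config': deepspeed_json,
            'save_zero_checkpoint': True,
            'with_mpu': False,
        })
        return cfg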
examples/pytorch/baichuan/finetune_baichuan.py (new file, 235 lines)
@@ -0,0 +1,235 @@
import os
import sys
from dataclasses import dataclass, field

from transformers import AutoModelForCausalLM, AutoTokenizer

from modelscope import (EpochBasedTrainer, MsDataset, TrainingArgs,
                        build_dataset_from_file, snapshot_download)
from modelscope.metainfo import Trainers
from modelscope.preprocessors import TextGenerationTransformersPreprocessor
from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig
from modelscope.trainers import build_trainer

DEFAULT_PAD_TOKEN = '[PAD]'
DEFAULT_EOS_TOKEN = '</s>'
DEFAULT_BOS_TOKEN = '<s>'
DEFAULT_UNK_TOKEN = '<unk>'


@dataclass(init=False)
class TextGenerationArguments(TrainingArgs):

    trainer: str = field(
        default=Trainers.default, metadata={
            'help': 'The trainer used',
        })

    src_txt: str = field(
        default=None,
        metadata={
            'help': 'The source text key of preprocessor',
            'cfg_node': 'preprocessor.src_txt'
        })

    tgt_txt: str = field(
        default=None,
        metadata={
            'help': 'The target text key of preprocessor',
            'cfg_node': 'preprocessor.tgt_txt'
        })

    preprocessor: str = field(
        default=None,
        metadata={
            'help': 'The preprocessor type',
            'cfg_node': 'preprocessor.type'
        })

    lr_scheduler: str = field(
        default=None,
        metadata={
            'help': 'The lr scheduler type',
            'cfg_node': 'train.lr_scheduler.type'
        })

    world_size: int = field(
        default=None,
        metadata={
            'help': 'The parallel world size',
            'cfg_node': 'megatron.world_size'
        })

    tensor_model_parallel_size: int = field(
        default=None,
        metadata={
            'help': 'The tensor model parallel size',
            'cfg_node': 'megatron.tensor_model_parallel_size'
        })

    use_megatron: bool = field(
        default=None, metadata={
            'help': 'Whether to use MegatronHook',
        })

    bf16: bool = field(
        default=False,
        metadata={
            'help': 'Whether to use bf16',
            'cfg_node': 'train.bf16'
        })

    deepspeed: str = field(
        default=None,
        metadata={
            'help': 'The location of DeepSpeed json config file.',
        })

    T_max: int = field(
        default=None,
        metadata={
            'help': 'The T_max for CosineAnnealingLR',
            'cfg_node': 'train.lr_scheduler.T_max'
        })

    use_lora: int = field(
        default=0,
        metadata={'help': 'Whether to use lora to train the model.'},
    )

    lora_rank: int = field(
        default=32,
        metadata={'help': 'The lora rank'},
    )

    lora_alpha: int = field(
        default=32,
        metadata={'help': 'The lora alpha'},
    )

    lora_dropout: float = field(
        default=0.05,
        metadata={'help': 'The lora dropout'},
    )

    device_map: str = field(
        default=None,
        metadata={
            'help': 'A map that specifies where each submodule should go.'
        })


def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer,
                                         model):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        input_embeddings = model.get_input_embeddings().weight.data
        output_embeddings = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
            dim=0, keepdim=True)

        input_embeddings[-num_new_tokens:] = input_embeddings_avg
        output_embeddings[-num_new_tokens:] = output_embeddings_avg


config, args = TextGenerationArguments().parse_cli().to_config()
print(config, args)


def cfg_modify_fn(cfg):
    if args.use_model_config:
        cfg.merge_from_dict(config)
    else:
        cfg = config
    if 'hooks' not in cfg.train:
        cfg.train['hooks'] = []
    if args.use_megatron:
        cfg.train.hooks.append({'type': 'MegatronHook'})
    if args.deepspeed:
        cfg.train.hooks.append({
            'type': 'DeepspeedHook',
            'config': args.deepspeed,
            'save_zero_checkpoint': True,
            'with_mpu': False,
        })

    return cfg


if args.dataset_json_file is None:
    train_dataset = MsDataset.load(
        args.train_dataset_name,
        subset_name=args.train_subset_name,
        split=args.train_split,
        namespace=args.train_dataset_namespace)
    validation_dataset = MsDataset.load(
        args.val_dataset_name,
        subset_name=args.val_subset_name,
        split=args.val_split,
        namespace=args.val_dataset_namespace)
else:
    train_dataset, validation_dataset = build_dataset_from_file(
        args.dataset_json_file)

model_dir = snapshot_download(args.model)
sys.path.append(model_dir)
model = AutoModelForCausalLM.from_pretrained(
    model_dir, trust_remote_code=True, device_map=args.device_map)
cfg_file = os.path.join(model_dir, 'configuration.json')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

special_tokens_dict = dict()
if tokenizer.pad_token is None or tokenizer.pad_token == '':
    special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None or tokenizer.eos_token == '':
    special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None or tokenizer.bos_token == '':
    special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
if tokenizer.unk_token is None or tokenizer.unk_token == '':
    special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN

smart_tokenizer_and_embedding_resize(
    special_tokens_dict=special_tokens_dict,
    tokenizer=tokenizer,
    model=model,
)

preprocessor = TextGenerationTransformersPreprocessor(
    model_dir,
    tokenizer=tokenizer,
    src_txt=config.preprocessor.src_txt,
    tgt_txt=config.preprocessor.tgt_txt)

if args.use_lora != 0:
    lora_config = LoRAConfig(
        replace_modules=['pack'],
        rank=args.lora_rank,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout)
    model = model.bfloat16()
    Swift.prepare_model(model, lora_config)

kwargs = dict(
    model=model,
    cfg_file=cfg_file,
    preprocessor=preprocessor,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    seed=args.seed,
    cfg_modify_fn=cfg_modify_fn,
    # No placement for model, leave the model to `device_map`
    device='cpu')

trainer: EpochBasedTrainer = build_trainer(
    name=args.trainer, default_args=kwargs)
trainer.train()
examples/pytorch/baichuan/run_train_lora.sh (new file, 21 lines)
@@ -0,0 +1,21 @@
export PYTHONPATH=$PYTHONPATH:./
torchrun examples/pytorch/baichuan/finetune_baichuan.py \
    --trainer 'text-generation-trainer' \
    --work_dir './tmp' \
    --model 'baichuan-inc/baichuan-7B' \
    --train_dataset_name 'chinese-poetry-collection' \
    --val_dataset_name 'chinese-poetry-collection' \
    --train_split 'train' \
    --val_split 'test' \
    --src_txt 'text1' \
    --tgt_txt 'text2' \
    --max_epochs 1 \
    --per_device_train_batch_size 8 \
    --lr 2e-5 \
    --lr_scheduler 'CosineAnnealingLR' \
    --eval_strategy 'no' \
    --bf16 1 \
    --use_lora 1 \
    --eval_metrics 'text-gen-metric' \
    --T_max 1 \
    --device_map 'auto' \
@@ -192,7 +192,8 @@ if config['model']['type'] == 'chatglm6b':
    model_config['model']['prefix_projection'] = args.prefix_projection

tokenizer = ChatGLMTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = Model.from_pretrained(model_dir, cfg_dict=model_config)
model = Model.from_pretrained(
    model_dir, cfg_dict=model_config, device_map='auto')

if args.ptuning_checkpoint is not None:
    # Evaluation

@@ -213,7 +214,7 @@ if args.pre_seq_len is not None:
    # P-tuning v2
    model = model.half()
    model.transformer.prefix_encoder.float()
else:
elif not args.use_lora:
    # Finetune
    model = model.float()

@@ -367,6 +368,8 @@ model.gradient_checkpointing_enable()
if config['model']['type'] == 'chatglm6b':
    model.enable_input_require_grads()

# import torch
# model = torch.nn.DataParallel(model).cuda()
trainer = Seq2SeqTrainer(
    model=model,
    cfg_file='./configuration.json',

@@ -375,6 +378,8 @@ trainer = Seq2SeqTrainer(
    seed=args.seed,
    data_collator=data_collator,
    remove_unused_data=True,
    # No placement for model, leave the model to `device_map`
    device='cpu',
    cfg_modify_fn=cfg_modify_fn)
trainer.tokenizer = tokenizer
trainer.train()
@@ -11,17 +11,17 @@ lora_config = LoRAConfig(
    lora_dropout=0.05,
    pretrained_weights='./lora_dureader_target/iter_600.pth')

model_dir = 'ZhipuAI/chatglm2-6b'
model_dir = 'ZhipuAI/ChatGLM-6B'
model_config = read_config(model_dir)
model_config['model'] = ConfigDict({
    'type': Models.chatglm2_6b,
    'type': Models.chatglm_6b,
})

model = Model.from_pretrained(model_dir, cfg_dict=model_config)
model = model.bfloat16()
Swift.prepare_model(model, lora_config)

pipe = pipeline('chat', model, pipeline_name='chatglm2_6b-text-generation')
pipe = pipeline('chat', model, pipeline_name='chatglm6b-text-generation')

print(
    pipe({
examples/pytorch/chatglm6b/lora_inference_v2.py (new file, 31 lines)
@@ -0,0 +1,31 @@
from modelscope import Model, pipeline, read_config
from modelscope.metainfo import Models
from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig
from modelscope.utils.config import ConfigDict

lora_config = LoRAConfig(
    replace_modules=['attention.query_key_value'],
    rank=32,
    lora_alpha=32,
    lora_dropout=0.05,
    pretrained_weights='./lora_dureader_target/iter_600.pth')

model_dir = 'ZhipuAI/chatglm2-6b'
model_config = read_config(model_dir)
model_config['model'] = ConfigDict({
    'type': Models.chatglm2_6b,
})

model = Model.from_pretrained(model_dir, cfg_dict=model_config)
model = model.bfloat16()
Swift.prepare_model(model, lora_config)

pipe = pipeline('chat', model, pipeline_name='chatglm2_6b-text-generation')

print(
    pipe({
        'text':
        '纵使进入21世纪后,我国教育水平有了明显进步,高考的难度却依旧不容小觑,高考被中国学生和家长定义为改变命运、改写人生脑重要考试,为了这场考试,学生和家长都付出了很多。',
        'history': []
    }))
@@ -4,14 +4,10 @@
import copy
import logging
import os
import shutil
import tempfile
import unittest
from dataclasses import dataclass, field

import json
import torch
import utils

from modelscope import TrainingArgs
from modelscope.hub.snapshot_download import snapshot_download

@@ -19,6 +15,8 @@ from modelscope.metainfo import Trainers
from modelscope.models.nlp.llama import LlamaForTextGeneration, LlamaTokenizer
from modelscope.msdatasets.dataset_cls.custom_datasets.torch_custom_dataset import \
    TorchCustomDataset
from modelscope.swift import Swift
from modelscope.swift.lora import LoRAConfig
from modelscope.trainers import build_trainer

IGNORE_INDEX = -100

@@ -54,11 +52,35 @@ class TextGenerationArguments(TrainingArgs):
            'help': 'The location of DeepSpeed json config file.',
        })

    work_dir: str = field(
        default=None, metadata={
            'help': 'The location of work dir',
    use_lora: int = field(
        default=0,
        metadata={'help': 'Whether to use lora to train the model.'},
    )

    lora_rank: int = field(
        default=32,
        metadata={'help': 'The lora rank'},
    )

    lora_alpha: int = field(
        default=32,
        metadata={'help': 'The lora alpha'},
    )

    lora_dropout: float = field(
        default=0.05,
        metadata={'help': 'The lora dropout'},
    )

    device_map: str = field(
        default=None,
        metadata={
            'help': 'A map that specifies where each submodule should go.'
        })

    zero_stage: int = field(
        default=None, metadata={'help': 'The stage of zero_optimization'})


def _tokenize_fn(strings, tokenizer):
    """Tokenize a list of strings."""

@@ -211,12 +233,15 @@ if __name__ == '__main__':
        cfg.train.dataloader = {'batch_size_per_gpu': 4, 'workers_per_gpu': 1}
        if 'hooks' not in cfg.train:
            cfg.train['hooks'] = []
        if args.deepspeed is not None:
            cfg.train.hooks.append({
                'type': 'DeepspeedHook',
                'config': args.deepspeed,
                'save_zero_checkpoint': True,
                'with_mpu': False,
            })
        if args.zero_stage is not None:
            cfg.train.hooks[-1]['zero_stage'] = args.zero_stage

        cfg.preprocessor.sequence_length = 512
        return cfg

@@ -225,7 +250,17 @@ if __name__ == '__main__':
        args.model) else snapshot_download(args.model)
    data_path = args.src_txt if args.src_txt else os.path.join(
        model_path, 'alpaca_data.json')
    model = LlamaForTextGeneration.from_pretrained(model_path)
    model = LlamaForTextGeneration.from_pretrained(
        model_path, device_map=args.device_map)

    if args.use_lora != 0:
        lora_config = LoRAConfig(
            replace_modules=['q_proj', 'k_proj', 'v_proj'],
            rank=args.lora_rank,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout)
        model = model.bfloat16()
        Swift.prepare_model(model, lora_config)

    tokenizer = LlamaTokenizer.from_pretrained(
        model_path,

@@ -234,9 +269,13 @@ if __name__ == '__main__':
    )

    special_tokens_dict = dict()
    if tokenizer.pad_token is None or tokenizer.pad_token == '':
        special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
    if tokenizer.eos_token is None or tokenizer.eos_token == '':
        special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
    if tokenizer.bos_token is None or tokenizer.bos_token == '':
        special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN
    if tokenizer.unk_token is None or tokenizer.unk_token == '':
        special_tokens_dict['unk_token'] = DEFAULT_UNK_TOKEN

    smart_tokenizer_and_embedding_resize(

@@ -263,7 +302,7 @@ if __name__ == '__main__':
    trainer.train()

    # prepare for inference
    if int(os.environ.get('LOCAL_RANK', 0)) == 0:
    if args.deepspeed and int(os.environ.get('LOCAL_RANK', 0)) == 0:
        tokenizer.save_pretrained(os.path.join(args.work_dir, 'output'))
        os.system(f'rm {args.work_dir}/output/pytorch_model*')
        os.system(
examples/pytorch/llama/run_train_lora.sh (new file, 11 lines)
@@ -0,0 +1,11 @@
DATA_PARALLEL_SIZE=1


export PYTHONPATH=$PYTHONPATH:./
torchrun --nproc_per_node $DATA_PARALLEL_SIZE examples/pytorch/llama/finetune_llama.py \
    --work_dir './tmp' \
    --model 'skyline2006/llama-7b' \
    --deepspeed 'default_offload_opt_param.json' \
    --eval_interval 100 \
    --use_lora 1 \
    --zero_stage 2 \
@@ -1267,6 +1267,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))
@@ -1003,6 +1003,7 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1))
@@ -139,8 +139,15 @@ class TextGenerationTransformersPreprocessor(TextGenerationPreprocessorBase):
        model_type = None
        if model_dir is not None:
            model_type = get_model_type(model_dir)
        if tokenizer is not None:
            self.nlp_tokenizer = NLPTokenizer(tokenize_kwargs=kwargs)
            self.nlp_tokenizer._tokenizer = tokenizer
        else:
            self.nlp_tokenizer = NLPTokenizerForRoberta(
                model_dir, model_type, use_fast=use_fast, tokenize_kwargs=kwargs)
                model_dir,
                model_type,
                use_fast=use_fast,
                tokenize_kwargs=kwargs)

    def decode(self, tokens, **kwargs):
        """Decode the tokens to real text.
@@ -187,6 +187,7 @@ class CheckpointHook(Hook):
            strategy=self.upload_strategy,
            done=True)
        wait_for_done(self.PUSH_TO_HUB_QUEUE_NAME)
        if self.push_to_hub:
            self.logger.info('Uploading models done.')

    def _push_to_hub(self, trainer, prefix, output_dir, delete_dir=False):
@@ -26,7 +26,6 @@ class CheckpointProcessor:
            trainer: The trainer instance.
            output_dir: The target folder used in inference.
        """
        model = trainer.unwrap_module(trainer.model)
        config = trainer.cfg

        # override pipeline by tasks name after finetune done,

@@ -38,8 +37,7 @@ class CheckpointProcessor:
        # TODO a temp fix to avoid pipeline_name and task mismatch
        config['pipeline'] = {'type': config['task']}

        self.copy_files_and_dump_config(trainer, output_dir, config,
                                        self._bin_file(model))
        self.copy_files_and_dump_config(trainer, output_dir, config, '*.bin')

    @staticmethod
    def copy_files_and_dump_config(trainer, output_dir, config, bin_file):
@@ -144,7 +144,10 @@ class DeepspeedProcessor(CheckpointProcessor, LrSchedulerProcessor,
        except (ImportError, AssertionError):
            return ''

    def get_bin_file(self):
    def get_bin_filename(self, with_mpu=True):
        if not with_mpu:
            return 'pytorch_model.bin'
        else:
            mp_rank = mpu.get_tensor_model_parallel_rank()
            rank = '{:02d}'.format(mp_rank)
            return f'mp_rank_{rank}_model_states.pt'

@@ -163,13 +166,21 @@ class DeepspeedProcessor(CheckpointProcessor, LrSchedulerProcessor,

        save_dir = os.path.dirname(checkpoint_path_prefix)
        prefix = os.path.basename(checkpoint_path_prefix)
        trainer.model.save_checkpoint(save_dir, prefix)

        if not self.stage3_gather_16bit_weights_on_model_save:
            return
        bin_file = self.get_bin_file()
        with_mpu = not mpu.is_unitialized()
        bin_file = self.get_bin_filename(with_mpu)
        src_file = os.path.join(checkpoint_path_prefix, bin_file)
        if self.zero_stage == 3 or with_mpu:
            trainer.model.save_checkpoint(save_dir, prefix)
        else:
            save_checkpoint(
                model, src_file, None, None, meta=None, with_meta=False)

        if self.zero_stage == 3:
            return
        if with_mpu:
            dest_file = os.path.join(output_dir, self._BIN_FILE_DIR, bin_file)
        else:
            dest_file = os.path.join(output_dir, bin_file)
        if os.path.isfile(dest_file):
            os.unlink(dest_file)

@@ -214,7 +225,7 @@ class DeepspeedProcessor(CheckpointProcessor, LrSchedulerProcessor,
        else:
            # in eval or prediction
            save_dir = checkpoint_path_prefix
            bin_file = self.get_bin_file()
            bin_file = self.get_bin_filename()
            model_file = os.path.join(save_dir, bin_file)
            checkpoint = torch.load(
                model_file, map_location=lambda storage, loc: storage)
@@ -273,11 +284,16 @@ class DeepspeedHook(Hook):
                 config=None,
                 deepspeed_activation_checkpointing=True,
                 save_zero_checkpoint=False,
                 with_mpu=True):
                 with_mpu=True,
                 zero_stage=None):
        self.save_zero_checkpoint = save_zero_checkpoint
        self.deepspeed_activation_checkpointing = deepspeed_activation_checkpointing
        self.with_mpu = with_mpu
        self.deepspeed_config = config
        if zero_stage is not None:
            assert zero_stage in (0, 1, 2,
                                  3), 'zero_stage must in (0, 1, 2, 3)!'
            self.zero_stage = zero_stage

    def register_processor(self, trainer):
        processor = DeepspeedProcessor()

@@ -376,9 +392,9 @@ class DeepspeedHook(Hook):
        optimizer, lr_scheduler = deepspeed_optim_sched(
            trainer, ds_config, max_steps)
        config = ds_config.config
        self.processor.stage3_gather_16bit_weights_on_model_save = config[
            'zero_optimization'].get(
                'stage3_gather_16bit_weights_on_model_save', True)
        if self.zero_stage is not None:
            config['zero_optimization']['stage'] = self.zero_stage
        self.processor.zero_stage = config['zero_optimization'].get('stage', 0)

        trainer.model, trainer.optimizer, _, trainer.lr_scheduler = deepspeed.initialize(
            model=trainer.model,
@@ -35,7 +35,7 @@ class MpuProcessor(CheckpointProcessor):
        except (ImportError, AssertionError):
            return ''

    def get_bin_file(self):
    def get_bin_filename(self):
        mp_rank = mpu.get_tensor_model_parallel_rank()
        rank = '{:02d}'.format(mp_rank)
        return f'mp_rank_{rank}_model_states.pt'

@@ -72,7 +72,7 @@ class MpuProcessor(CheckpointProcessor):

        save_dir = os.path.dirname(checkpoint_path_prefix)
        prefix = os.path.basename(checkpoint_path_prefix)
        bin_file = self.get_bin_file()
        bin_file = self.get_bin_filename()
        prefix_bin_file = os.path.join(save_dir, prefix + '_' + bin_file)
        save_checkpoint(model, prefix_bin_file, with_meta=False)

@@ -98,7 +98,7 @@ class MpuProcessor(CheckpointProcessor):

        save_dir = os.path.dirname(checkpoint_path_prefix)
        prefix = os.path.basename(checkpoint_path_prefix)
        bin_file = self.get_bin_file()
        bin_file = self.get_bin_filename()
        absolute_file = os.path.join(save_dir, prefix + '_' + bin_file)
        if os.path.isfile(absolute_file):
            os.remove(absolute_file)

@@ -108,7 +108,7 @@ class MpuProcessor(CheckpointProcessor):
        model = trainer.unwrap_module(trainer.model)
        if os.path.isdir(checkpoint_path_prefix):
            save_dir = checkpoint_path_prefix
            bin_file = self.get_bin_file()
            bin_file = self.get_bin_filename()
            model_file = os.path.join(save_dir, bin_file)
            load_checkpoint(model_file, model, None, None)
        else:

@@ -119,7 +119,7 @@ class MpuProcessor(CheckpointProcessor):

            save_dir = os.path.dirname(checkpoint_path_prefix)
            prefix = os.path.basename(checkpoint_path_prefix)
            bin_file = self.get_bin_file()
            bin_file = self.get_bin_filename()

            model_file = os.path.join(save_dir, prefix + '_' + bin_file)
            load_checkpoint(model_file, model, None, None)
@@ -232,7 +232,7 @@ class EpochBasedTrainer(BaseTrainer):
            # A logic to fit the current code
            # Put a DDPHook in if launcher is provided.
            if 'hooks' not in self.cfg.train:
                self.cfg.train['hooks'] = ConfigDict([])
                self.cfg.train['hooks'] = []
            self.cfg.train['hooks'].append({
                'type': 'DDPHook',
                'launcher': self.launcher