Add lora_inference for baichuan. (#352)

* add lora_inference.py for baichuan

* fix lint test

* fix lint test

---------

Co-authored-by: hemu <hemu.zp@alibaba-inc.com>
Firmament-cyou committed 2023-07-04 18:39:36 +08:00 (committed by GitHub)
parent 08c71f1f3d
commit 423e2ce940
2 changed files with 171 additions and 103 deletions
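The hunks below show the shared utility module being reformatted (double quotes to single quotes, yapf line wrapping); the new lora_inference.py itself is not part of this excerpt. Its flow presumably mirrors the training setup visible in these utilities: load the base model and tokenizer, wrap the model with the same LoRA configuration, restore the tuned weights, then generate. A minimal sketch, assuming the modelscope.swift API from the imports below; the target modules, hyperparameters, and checkpoint path are illustrative, not taken from this commit:

# Hedged sketch of a LoRA inference flow; paths and hyperparameters are assumptions.
model, tokenizer = get_baichuan_model_tokenizer()
lora_config = LoRAConfig(
    replace_modules=['W_pack'],  # assumed attention projection for baichuan-7B
    rank=8,
    lora_alpha=32,
    lora_dropout=0.1)
model = Swift.prepare_model(model, lora_config)
state_dict = torch.load('runs/v0-xxx/output_best/pytorch_model.bin')  # hypothetical path
model.load_state_dict(state_dict, strict=False)
model.eval()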


@@ -1,46 +1,52 @@
import ast
import datetime as dt
import json
import math
import os
import random
import re
import sys
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
#
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from numpy import ndarray
from tensorboard.backend.event_processing.event_accumulator import \
    EventAccumulator
from torch import Tensor
from torch import device as Device
from torch import dtype as Dtype
from torch.nn import Module
from torch.nn.parameter import Parameter
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Optimizer
from torch.optim import lr_scheduler as lrs
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import Dataset
from torchmetrics import Accuracy, MeanMetric
from tqdm import tqdm
#
from modelscope import (Model, MsDataset, get_logger, read_config,
                        snapshot_download)
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS
from modelscope.models.nlp.chatglm2 import ChatGLM2Tokenizer
from modelscope.msdatasets.dataset_cls.custom_datasets import \
    TorchCustomDataset
from modelscope.swift import LoRAConfig, Swift
from modelscope.trainers import EpochBasedTrainer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.registry import default_group
#
SYSTEM_TEXT = """{system}"""
@@ -51,7 +57,7 @@ ASSISTANT_PROMPT = """\n\n### 助手
MAX_LENGTH = 2048
TEST_MAX_LENGTH = MAX_LENGTH
COLOR, COLOR_S = "#FFE2D9", "#FF7043"
COLOR, COLOR_S = '#FFE2D9', '#FF7043'
logger = get_logger()
#
@@ -68,7 +74,7 @@ def _get_version(work_dir: str) -> int:
    fnames = []
    v_list = [-1]
    for fname in fnames:
        m = re.match(r'v(\d+)', fname)
        if m is None:
            continue
        v = m.group(1)
@@ -80,10 +86,10 @@ def get_work_dir(work_dir: str) -> str:
"""add version"""
work_dir = os.path.abspath(work_dir)
version = _get_version(work_dir)
time = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
time = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
#
work_dir = os.path.join(work_dir, f"v{version}-{time}")
logger.info(f"work_dir: {work_dir}")
work_dir = os.path.join(work_dir, f'v{version}-{time}')
logger.info(f'work_dir: {work_dir}')
return work_dir
@@ -92,19 +98,21 @@ def select_device(device_ids: List[int]) -> Device:
    Return: master device
    """
    if torch.cuda.is_initialized():
        logger.warning('CUDA has been initialized! Device selection fails!')
        return torch.device('cuda:0')
    #
    log_s = 'Using device: '
    if len(device_ids) == 0:  # cpu
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        device: str = 'cpu'
        log_s += device
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
            [str(d) for d in device_ids])
        assert torch.cuda.is_available(
        ) and torch.cuda.device_count() >= len(device_ids)
        log_s += f"cuda:{','.join([str(d) for d in device_ids])}"  # e.g. "cuda:1,7,8"
        device = 'cuda:0'
    logger.info(log_s)
    return torch.device(device)
@@ -118,15 +126,16 @@ def seed_everything(seed: Optional[int] = None, gpu_dtm: bool = False) -> int:
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    logger.info(f'Global seed set to {seed}')
    if gpu_dtm:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        logger.info(f'Setting deterministic: {True}, benchmark: {False}')
    return seed


def get_T_max(dataset_len: int, batch_size: int, max_epochs: int,
              drop_last: bool) -> int:
    """Calculate T_max in CosineAnnealingLR"""
    if drop_last:
        T_max = dataset_len // batch_size
@@ -136,25 +145,32 @@ def get_T_max(dataset_len: int, batch_size: int, max_epochs: int, drop_last: boo
    return T_max
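A quick numeric check of get_T_max, assuming the elided else-branch rounds up with math.ceil and that the result is then multiplied by max_epochs (the usual total-step count for CosineAnnealingLR):

# Illustrative numbers; `optimizer` is assumed to exist.
# dataset_len=1000, batch_size=16, max_epochs=2
# drop_last=True  -> (1000 // 16) * 2    = 124
# drop_last=False -> ceil(1000 / 16) * 2 = 126
T_max = get_T_max(1000, 16, 2, drop_last=False)
lr_scheduler = lrs.CosineAnnealingLR(optimizer, T_max=T_max)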


def tokenize_function(system: str, user: str, assistant: Optional[str],
                      tokenizer) -> Dict[str, Any]:
    """Only applicable to baichuan and chatglm2. Other models need to be tested"""
    system_text = SYSTEM_TEXT.format(system=system)
    user_text = USER_TEXT.format(user=user)
    system_text_ids: List[int] = tokenizer(
        system_text, return_attention_mask=False,
        add_special_tokens=True)['input_ids']
    user_text_ids: List[int] = tokenizer(
        user_text, return_attention_mask=False,
        add_special_tokens=False)['input_ids']
    assistant_p_input_ids: List[int] = tokenizer(
        ASSISTANT_PROMPT,
        return_attention_mask=False,
        add_special_tokens=False)['input_ids']
    # tokenizer.bos_token_id: Avoid `assistant` being empty
    assistant_input_ids: List[int] = [tokenizer.bos_token_id]
    if assistant is not None:
        assistant_input_ids += tokenizer(
            assistant, return_attention_mask=False,
            add_special_tokens=False)['input_ids']
    assistant_input_ids += [tokenizer.eos_token_id]
    #
    input_ids = system_text_ids + user_text_ids + assistant_p_input_ids + assistant_input_ids
    if assistant is not None:  # train, val
        if len(input_ids) > MAX_LENGTH:
            return {}
        len_mask = len(input_ids) - len(assistant_input_ids)
@@ -164,12 +180,13 @@ def tokenize_function(system: str, user: str, assistant: Optional[str], tokenize
        labels = None
    #
    return {'input_ids': input_ids, 'labels': labels}
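The masking step between the two halves of this hunk is elided; a self-contained sketch of what it presumably does (mask everything before the response with -100 so the loss covers only assistant tokens):

# Made-up token ids; standard causal-LM label masking (assumed, not shown in the diff).
input_ids = [101, 5, 6, 7, 102, 8, 9, 103]  # system + user + prompt + bos/resp/eos
assistant_input_ids = [102, 8, 9, 103]      # bos + response + eos
len_mask = len(input_ids) - len(assistant_input_ids)
labels = [-100] * len_mask + input_ids[len_mask:]
assert labels == [-100, -100, -100, -100, 102, 8, 9, 103]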


class MyDataset(TorchCustomDataset):

    def __init__(self, system: List[str], user: List[str],
                 assistant: List[str], tokenize_function) -> None:
        self._data = []
        for i in tqdm(range(len(system))):
            _d = tokenize_function(system[i], user[i], assistant[i])
@@ -184,36 +201,48 @@ class MyDataset(TorchCustomDataset):
        return len(self._data)


def stat_dataset(dataset: 'MyDataset') -> None:
    """Statistical analysis of the dataset"""
    _token_len = []
    for d in dataset:
        _token_len.append(len(d['input_ids']))
    _token_len = np.array(_token_len)
    mean = _token_len.mean().item()
    std = _token_len.std().item()
    min_ = _token_len.min().item()
    max_ = _token_len.max().item()
    logger.info(
        f'Dataset Token Length: {mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={_token_len.shape[0]}'
    )


def print_examples(examples: Dict[str, Any], tokenizer) -> None:
    input_ids, labels = examples['input_ids'], examples['labels']
    print(f'[INPUT_IDS] {tokenizer.decode(input_ids)}')
    print()
    print(
        f'[LABELS] {tokenizer.decode([l if l != -100 else 0 for l in labels])}'
    )


def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]:
    input_ids = [torch.tensor(b['input_ids']) for b in batch]
    labels = [torch.tensor(b['labels']) for b in batch]
    attention_mask = [
        torch.ones(len(input_ids[i]), dtype=torch.int64)
        for i in range(len(input_ids))
    ]
    #
    input_ids = pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(
        attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }
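A small check of the collator's padding behavior (illustrative values; the pad id of 0 is an assumption, not baichuan's real pad token):

# Stand-in tokenizer; only pad_token_id is needed by data_collate_fn.
class _FakeTokenizer:
    pad_token_id = 0

batch = [{'input_ids': [1, 2, 3], 'labels': [-100, 2, 3]},
         {'input_ids': [4, 5], 'labels': [-100, 5]}]
out = data_collate_fn(batch, _FakeTokenizer())
# out['input_ids']      -> [[1, 2, 3], [4, 5, 0]]
# out['attention_mask'] -> [[1, 1, 1], [1, 1, 0]]
# out['labels']         -> [[-100, 2, 3], [-100, 5, -100]]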


def print_model_info(model: Module, name: Optional[str] = None) -> None:
@@ -228,34 +257,35 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None:
    n_grads /= 1e6
    n_buffers /= 1e6
    s = [
        f'{name}: ',
        f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ',
        f'{n_buffers:.4f}M Buffers',
    ]
    s += '.'
    logger.info(''.join(s))


def show_freeze_layers(model: Module, max_lines: int = 20) -> None:
    named_p = list(model.named_parameters())
    for i, (n, p) in enumerate(named_p):
        if i >= max_lines:
            logger.info('...')
            break
        logger.info(f'{n}: requires_grad={p.requires_grad}')


@METRICS.register_module(group_key=default_group, module_name='my_metric')
class MyMetric(Metric):

    def __init__(self, vocab_size: int):
        self.acc = Accuracy('multiclass', num_classes=vocab_size)
        self.loss = MeanMetric()

    def add(self, outputs: Dict[str, Any], inputs: Dict[str, Any]) -> None:
        loss: Tensor = outputs.loss
        self.loss.update(loss)
        #
        labels: Tensor = inputs['labels']
        labels = labels[:, 1:]
        labels_mask = labels != -100
        logits: Tensor = outputs.logits[:, :-1]
@@ -266,18 +296,19 @@ class MyMetric(Metric):
    def evaluate(self):
        return {
            'acc': self.acc.compute().item(),
            'loss': self.loss.compute().item()
        }

    def merge(self, other: 'MyMetric') -> None:
        """This script does not support ddp"""
        raise NotImplementedError


def get_baichuan_model_tokenizer(model_dir: Optional[str] = None,
                                 load_model: bool = True):
    if model_dir is None:
        model_id = 'baichuan-inc/baichuan-7B'
        model_dir = get_model_dir(model_id, None)
    #
    sys.path.insert(0, model_dir)
@@ -286,51 +317,59 @@ def get_baichuan_model_tokenizer(model_dir: Optional[str] = None, load_model: bo
    from modeling_baichuan import BaiChuanForCausalLM
    model_config = BaiChuanConfig.from_pretrained(model_dir)
    model_config.torch_dtype = torch.float16
    logger.info(f'model_config: {model_config}')
    tokenizer = BaiChuanTokenizer.from_pretrained(model_dir)
    model = None
    if load_model:
        model = BaiChuanForCausalLM.from_pretrained(
            model_dir,
            config=model_config,
            device_map='auto',
            torch_dtype=torch.float16)
    #
    return model, tokenizer


def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None,
                                 load_model: bool = True):
    if model_dir is None:
        model_id = 'ZhipuAI/chatglm2-6b'
        model_revision = 'v1.0.3'
        model_dir = snapshot_download(model_id, model_revision)
    #
    config = read_config(model_dir)
    config['model'] = ConfigDict({'type': 'chatglm2-6b'})
    tokenizer = ChatGLM2Tokenizer.from_pretrained(model_dir)
    model = None
    if load_model:
        model = Model.from_pretrained(
            model_dir,
            cfg_dict=config,
            device_map='auto',
            torch_dtype=torch.float16)
    return model, tokenizer


def make_dataset(
    split: str, tokenize_function: Callable[[str, str, Optional[str]],
                                            Dict[str, Any]]
) -> MyDataset:
    """
    split: Literal["train", "validation"]
    """
    dataset = MsDataset.load(
        'modelscope/ms_hackathon_23_agent_train_dev', split=split)
    system = []
    user = []
    assistant = []
    for d in dataset:
        content = ast.literal_eval(d['conversations'])
        s = content[0]['value']
        assert len(content) % 2 == 1
        for i in range(len(content) // 2):
            system.append(s)
            user.append(content[2 * i + 1]['value'])
            assistant.append(content[2 * i + 2]['value'])
    return MyDataset(system, user, assistant, tokenize_function)
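For reference, the record layout this loop expects: element 0 of the parsed list carries the system prompt, followed by strictly alternating user/assistant turns, which is why len(content) must be odd. A hedged illustration (the 'from' keys are an assumption; only 'value' is read here):

conversations = str([
    {'from': 'system', 'value': 'You are a helpful assistant.'},
    {'from': 'user', 'value': 'What is LoRA?'},
    {'from': 'assistant', 'value': 'A parameter-efficient fine-tuning method.'},
])
content = ast.literal_eval(conversations)
assert len(content) % 2 == 1  # system + N (user, assistant) pairs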
@@ -339,21 +378,22 @@ Item = Dict[str, float]


def read_tensorboard_file(fpath: str) -> Dict[str, List[Item]]:
    if not os.path.isfile(fpath):
        raise FileNotFoundError(f'fpath: {fpath}')
    ea = EventAccumulator(fpath)
    ea.Reload()
    res = {}
    tags = ea.Tags()['scalars']
    for tag in tags:
        values = ea.Scalars(tag)
        r = []
        for v in values:
            r.append({'step': v.step, 'value': v.value})
        res[tag] = r
    return res


def tensorboard_smoothing(values: List[float],
                          smooth: float = 0.9) -> List[float]:
    norm_factor = 1
    x = 0
    res = []
@@ -366,12 +406,12 @@ def tensorboard_smoothing(values: List[float], smooth: float = 0.9) -> List[floa
    return res
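The loop body is elided by the hunk boundary; it is presumably TensorBoard's debiased exponential moving average, which would give:

# Assumed body of the elided loop (standard TensorBoard-style smoothing):
#     for v in values:
#         x = x * smooth + v                # exponential accumulator
#         res.append(x / norm_factor)       # debias by the geometric sum
#         norm_factor = norm_factor * smooth + 1
print(tensorboard_smoothing([1.0, 2.0, 3.0], smooth=0.5))
# -> [1.0, 1.666..., 2.428...]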


def plot_image(data: Dict[str, List[Item]], key_name: str,
               smooth: float) -> Figure:
    _data = data[key_name]
    steps = [d['step'] for d in _data]
    values = [d['value'] for d in _data]
    fig, ax = plt.subplots(1, 1, squeeze=True, figsize=(8, 5), dpi=100)
    ax.set_title(key_name)
    if smooth != 0:
        ax.plot(steps, values, color=COLOR)