Add lora_inference for baichuan. (#352)
* add lora_inference.py for baichuan
* fix linttest
* fix linttest

---------

Co-authored-by: hemu <hemu.zp@alibaba-inc.com>
@@ -1,46 +1,52 @@
+import ast
+import datetime as dt
+import math
 import os
 import random
 import re
 import sys
-import math
-import json
-import ast
-import datetime as dt
-from typing import List, Tuple, Dict, Callable, Optional, Union, Any
 from functools import partial
-#
-from tqdm import tqdm
-import numpy as np
-from numpy import ndarray
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import json
 import matplotlib.pyplot as plt
-from matplotlib.axes import Axes
-from matplotlib.figure import Figure
+import numpy as np
 #
 import torch
 import torch.nn as nn
 import torch.optim as optim
-from torch import Tensor, device as Device, dtype as Dtype
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+from numpy import ndarray
+from tensorboard.backend.event_processing.event_accumulator import \
+    EventAccumulator
+from torch import Tensor
+from torch import device as Device
+from torch import dtype as Dtype
 from torch.nn import Module
-from torch.optim import Optimizer
-from torch.utils.data import Dataset
+from torch.nn.parameter import Parameter
+from torch.nn.utils.rnn import pad_sequence
+from torch.optim import Optimizer
 from torch.optim import lr_scheduler as lrs
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
-from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import Dataset
 #
 from torchmetrics import Accuracy, MeanMetric
-from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
-#
-from modelscope import get_logger
-from modelscope import MsDataset, snapshot_download, Model, read_config
-from modelscope.utils.config import Config, ConfigDict
-from modelscope.msdatasets.dataset_cls.custom_datasets import TorchCustomDataset
-from modelscope.trainers import EpochBasedTrainer
-from modelscope.swift import Swift, LoRAConfig
+from tqdm import tqdm
+
+#
+from modelscope import (Model, MsDataset, get_logger, read_config,
+                        snapshot_download)
 from modelscope.metrics.base import Metric
 from modelscope.metrics.builder import METRICS
-from modelscope.utils.registry import default_group
 from modelscope.models.nlp.chatglm2 import ChatGLM2Tokenizer
+from modelscope.msdatasets.dataset_cls.custom_datasets import \
+    TorchCustomDataset
+from modelscope.swift import LoRAConfig, Swift
+from modelscope.trainers import EpochBasedTrainer
+from modelscope.utils.config import Config, ConfigDict
+from modelscope.utils.registry import default_group

 #
 SYSTEM_TEXT = """{system}"""
@@ -51,7 +57,7 @@ ASSISTANT_PROMPT = """\n\n### 助手
 MAX_LENGTH = 2048
 TEST_MAX_LENGTH = MAX_LENGTH

-COLOR, COLOR_S = "#FFE2D9", "#FF7043"
+COLOR, COLOR_S = '#FFE2D9', '#FF7043'
 logger = get_logger()
 #
@@ -68,7 +74,7 @@ def _get_version(work_dir: str) -> int:
         fnames = []
     v_list = [-1]
     for fname in fnames:
-        m = re.match(r"v(\d+)", fname)
+        m = re.match(r'v(\d+)', fname)
         if m is None:
             continue
         v = m.group(1)
@@ -80,10 +86,10 @@ def get_work_dir(work_dir: str) -> str:
     """add version"""
     work_dir = os.path.abspath(work_dir)
     version = _get_version(work_dir)
-    time = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
+    time = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
     #
-    work_dir = os.path.join(work_dir, f"v{version}-{time}")
-    logger.info(f"work_dir: {work_dir}")
+    work_dir = os.path.join(work_dir, f'v{version}-{time}')
+    logger.info(f'work_dir: {work_dir}')
     return work_dir

@@ -92,19 +98,21 @@ def select_device(device_ids: List[int]) -> Device:
     Return: master device
     """
     if torch.cuda.is_initialized():
-        logger.warning("CUDA has been initialized! Device selection fails!")
-        return torch.device("cuda:0")
+        logger.warning('CUDA has been initialized! Device selection fails!')
+        return torch.device('cuda:0')
     #
-    log_s = "Using device: "
+    log_s = 'Using device: '
     if len(device_ids) == 0:  # cpu
-        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-        device: str = "cpu"
+        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+        device: str = 'cpu'
         log_s += device
     else:
-        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(d) for d in device_ids])
-        assert torch.cuda.is_available() and torch.cuda.device_count() >= len(device_ids)
+        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
+            [str(d) for d in device_ids])
+        assert torch.cuda.is_available(
+        ) and torch.cuda.device_count() >= len(device_ids)
         log_s += f"cuda:{','.join([str(d) for d in device_ids])}"  # e.g. "cuda:1,7,8"
-        device = "cuda:0"
+        device = 'cuda:0'
     logger.info(log_s)
     return torch.device(device)
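A minimal usage sketch of select_device (the device ids are hypothetical). CUDA_VISIBLE_DEVICES only takes effect when set before CUDA is initialized, which is what the is_initialized() guard above protects against:

import torch

# Must run before any other CUDA call; returns the master device
# ('cuda:0' within the selected set, or 'cpu' for an empty list).
device = select_device([0, 1])
x = torch.zeros(2, 3, device=device)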
@@ -118,15 +126,16 @@ def seed_everything(seed: Optional[int] = None, gpu_dtm: bool = False) -> int:
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
-    logger.info(f"Global seed set to {seed}")
+    logger.info(f'Global seed set to {seed}')
     if gpu_dtm:
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
-        logger.info(f"Setting deterministic: {True}, benchmark: {False}")
+        logger.info(f'Setting deterministic: {True}, benchmark: {False}')
     return seed


-def get_T_max(dataset_len: int, batch_size: int, max_epochs: int, drop_last: bool) -> int:
+def get_T_max(dataset_len: int, batch_size: int, max_epochs: int,
+              drop_last: bool) -> int:
     """Calculate T_max in CosineAnnealingLR"""
     if drop_last:
         T_max = dataset_len // batch_size
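The rest of get_T_max falls between hunks; judging by the signature, it presumably rounds up when drop_last is False and scales by max_epochs. A sketch of how the result would feed CosineAnnealingLR (the model and optimizer here are placeholders):

import math

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

dataset_len, batch_size, max_epochs, drop_last = 1000, 16, 3, True
steps = (dataset_len // batch_size if drop_last else
         math.ceil(dataset_len / batch_size))
T_max = steps * max_epochs  # assumed: one cosine cycle over all optimizer steps

optimizer = torch.optim.SGD(torch.nn.Linear(4, 4).parameters(), lr=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=T_max, eta_min=1e-5)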
@@ -136,25 +145,32 @@ def get_T_max(dataset_len: int, batch_size: int, max_epochs: int, drop_last: boo
     return T_max


-def tokenize_function(system: str, user: str, assistant: Optional[str], tokenizer) -> Dict[str, Any]:
+def tokenize_function(system: str, user: str, assistant: Optional[str],
+                      tokenizer) -> Dict[str, Any]:
     """Only applicable to baichuan and chatglm2. Other models need to be tested"""
     system_text = SYSTEM_TEXT.format(system=system)
     user_text = USER_TEXT.format(user=user)
-    system_text_ids: List[int] = tokenizer(system_text, return_attention_mask=False,
-                                           add_special_tokens=True)["input_ids"]
-    user_text_ids: List[int] = tokenizer(user_text, return_attention_mask=False,
-                                         add_special_tokens=False)["input_ids"]
-    assistant_p_input_ids: List[int] = tokenizer(ASSISTANT_PROMPT, return_attention_mask=False,
-                                                 add_special_tokens=False)["input_ids"]
+    system_text_ids: List[int] = tokenizer(
+        system_text, return_attention_mask=False,
+        add_special_tokens=True)['input_ids']
+    user_text_ids: List[int] = tokenizer(
+        user_text, return_attention_mask=False,
+        add_special_tokens=False)['input_ids']
+    assistant_p_input_ids: List[int] = tokenizer(
+        ASSISTANT_PROMPT,
+        return_attention_mask=False,
+        add_special_tokens=False)['input_ids']

     # tokenizer.bos_token_id: Avoid `assistant` being empty
     assistant_input_ids: List[int] = [tokenizer.bos_token_id]
     if assistant is not None:
-        assistant_input_ids += tokenizer(assistant, return_attention_mask=False, add_special_tokens=False)["input_ids"]
+        assistant_input_ids += tokenizer(
+            assistant, return_attention_mask=False,
+            add_special_tokens=False)['input_ids']
     assistant_input_ids += [tokenizer.eos_token_id]
     #
     input_ids = system_text_ids + user_text_ids + assistant_p_input_ids + assistant_input_ids
-    if assistant is not None: # train, val
+    if assistant is not None:  # train, val
         if len(input_ids) > MAX_LENGTH:
             return {}
         len_mask = len(input_ids) - len(assistant_input_ids)
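The labels construction continues past this hunk; from len_mask it is presumably prompt masking with -100, the value CrossEntropyLoss ignores. A self-contained sketch with hypothetical token ids:

prompt_ids = [101, 7, 8, 9]     # system + user + assistant prompt (hypothetical)
assistant_ids = [1, 55, 56, 2]  # bos + reply + eos (hypothetical)
input_ids = prompt_ids + assistant_ids
len_mask = len(input_ids) - len(assistant_ids)
# Only the assistant span contributes to the loss.
labels = [-100] * len_mask + input_ids[len_mask:]
assert labels == [-100, -100, -100, -100, 1, 55, 56, 2]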
@@ -164,12 +180,13 @@ def tokenize_function(system: str, user: str, assistant: Optional[str], tokenize
         labels = None

     #
-    return {"input_ids": input_ids, "labels": labels}
+    return {'input_ids': input_ids, 'labels': labels}


 class MyDataset(TorchCustomDataset):
-    def __init__(self, system: List[str], user: List[str], assistant: List[str],
-                 tokenize_function) -> None:
+
+    def __init__(self, system: List[str], user: List[str],
+                 assistant: List[str], tokenize_function) -> None:
         self._data = []
         for i in tqdm(range(len(system))):
             _d = tokenize_function(system[i], user[i], assistant[i])
@@ -184,36 +201,48 @@ class MyDataset(TorchCustomDataset):
         return len(self._data)


-def stat_dataset(dataset: "MyDataset") -> None:
+def stat_dataset(dataset: 'MyDataset') -> None:
     """Statistical analysis was performed on the data set"""
     _token_len = []
     for d in dataset:
-        _token_len.append(len(d["input_ids"]))
+        _token_len.append(len(d['input_ids']))
     _token_len = np.array(_token_len)
     mean = _token_len.mean().item()
     std = _token_len.std().item()
     min_ = _token_len.min().item()
     max_ = _token_len.max().item()
     logger.info(
-        f"Dataset Token Length: {mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={_token_len.shape[0]}")
+        f'Dataset Token Length: {mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={_token_len.shape[0]}'
+    )


 def print_examples(examples: Dict[str, Any], tokenizer) -> None:
-    input_ids, labels = examples["input_ids"], examples["labels"]
-    print(f"[INPUT_IDS] {tokenizer.decode(input_ids)}")
+    input_ids, labels = examples['input_ids'], examples['labels']
+    print(f'[INPUT_IDS] {tokenizer.decode(input_ids)}')
     print()
-    print(f"[LABLES] {tokenizer.decode([l if l != -100 else 0 for l in labels])}")
+    print(
+        f'[LABLES] {tokenizer.decode([l if l != -100 else 0 for l in labels])}'
+    )


 def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]:
-    input_ids = [torch.tensor(b["input_ids"]) for b in batch]
-    labels = [torch.tensor(b["labels"]) for b in batch]
-    attention_mask = [torch.ones(len(input_ids[i]), dtype=torch.int64) for i in range(len(input_ids))]
+    input_ids = [torch.tensor(b['input_ids']) for b in batch]
+    labels = [torch.tensor(b['labels']) for b in batch]
+    attention_mask = [
+        torch.ones(len(input_ids[i]), dtype=torch.int64)
+        for i in range(len(input_ids))
+    ]
     #
-    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
-    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
+    input_ids = pad_sequence(
+        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
+    attention_mask = pad_sequence(
+        attention_mask, batch_first=True, padding_value=0)
     labels = pad_sequence(labels, batch_first=True, padding_value=-100)
-    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}
+    return {
+        'input_ids': input_ids,
+        'attention_mask': attention_mask,
+        'labels': labels
+    }


 def print_model_info(model: Module, name: Optional[str] = None) -> None:
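A small self-contained check of the data_collate_fn shown above; the stub tokenizer is an assumption, since only pad_token_id is consumed:

from types import SimpleNamespace

tok = SimpleNamespace(pad_token_id=0)  # stub standing in for a real tokenizer
batch = [
    {'input_ids': [5, 6, 7], 'labels': [-100, 6, 7]},
    {'input_ids': [8], 'labels': [8]},
]
out = data_collate_fn(batch, tok)
# Everything is right-padded to the longest sample in the batch.
assert out['input_ids'].shape == (2, 3)
assert out['attention_mask'].tolist() == [[1, 1, 1], [1, 0, 0]]
assert out['labels'][1].tolist() == [8, -100, -100]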
@@ -228,34 +257,35 @@ def print_model_info(model: Module, name: Optional[str] = None) -> None:
     n_grads /= 1e6
     n_buffers /= 1e6
     s = [
-        f"{name}: ",
-        f"{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ",
-        f"{n_buffers:.4f}M Buffers",
+        f'{name}: ',
+        f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ',
+        f'{n_buffers:.4f}M Buffers',
     ]
-    s += "."
-    logger.info("".join(s))
+    s += '.'
+    logger.info(''.join(s))


 def show_freeze_layers(model: Module, max_lines: int = 20) -> None:
     named_p = list(model.named_parameters())
     for i, (n, p) in enumerate(named_p):
         if i >= max_lines:
-            logger.info("...")
+            logger.info('...')
             break
-        logger.info(f"{n}: requires_grad={p.requires_grad}")
+        logger.info(f'{n}: requires_grad={p.requires_grad}')


 @METRICS.register_module(group_key=default_group, module_name='my_metric')
 class MyMetric(Metric):

     def __init__(self, vocab_size: int):
-        self.acc = Accuracy("multiclass", num_classes=vocab_size)
+        self.acc = Accuracy('multiclass', num_classes=vocab_size)
         self.loss = MeanMetric()

     def add(self, outputs: Dict[str, Any], inputs: Dict[str, Any]) -> None:
         loss: Tensor = outputs.loss
         self.loss.update(loss)
         #
-        labels: Tensor = inputs["labels"]
+        labels: Tensor = inputs['labels']
         labels = labels[:, 1:]
         labels_mask = labels != -100
         logits: Tensor = outputs.logits[:, :-1]
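The tail of add() is outside this hunk; given the next-token shift and the -100 mask above, it presumably updates the running token accuracy roughly like this sketch (shapes are hypothetical):

import torch

logits = torch.randn(2, 5, 100)        # (batch, seq, vocab)
labels = torch.randint(0, 100, (2, 5))
labels[:, :2] = -100                   # masked prompt positions
labels, logits = labels[:, 1:], logits[:, :-1]  # logits at t predict t+1
mask = labels != -100
preds = logits.argmax(dim=-1)
token_acc = (preds[mask] == labels[mask]).float().mean()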
@@ -266,18 +296,19 @@ class MyMetric(Metric):

     def evaluate(self):
         return {
-            "acc": self.acc.compute().item(),
-            "loss": self.loss.compute().item()
+            'acc': self.acc.compute().item(),
+            'loss': self.loss.compute().item()
         }

-    def merge(self, other: "MyMetric") -> None:
+    def merge(self, other: 'MyMetric') -> None:
         """This script does not support ddp"""
         raise NotImplementedError


-def get_baichuan_model_tokenizer(model_dir: Optional[str] = None, load_model: bool = True):
+def get_baichuan_model_tokenizer(model_dir: Optional[str] = None,
+                                 load_model: bool = True):
     if model_dir is None:
-        model_id = "baichuan-inc/baichuan-7B"
+        model_id = 'baichuan-inc/baichuan-7B'
         model_dir = get_model_dir(model_id, None)
     #
     sys.path.insert(0, model_dir)
@@ -286,51 +317,59 @@ def get_baichuan_model_tokenizer(model_dir: Optional[str] = None, load_model: bo
     from modeling_baichuan import BaiChuanForCausalLM
     model_config = BaiChuanConfig.from_pretrained(model_dir)
     model_config.torch_dtype = torch.float16
-    logger.info(f"model_config: {model_config}")
+    logger.info(f'model_config: {model_config}')
     tokenizer = BaiChuanTokenizer.from_pretrained(model_dir)
     model = None
     if load_model:
-        model = BaiChuanForCausalLM.from_pretrained(model_dir, config=model_config,
-                                                    device_map="auto", torch_dtype=torch.float16)
+        model = BaiChuanForCausalLM.from_pretrained(
+            model_dir,
+            config=model_config,
+            device_map='auto',
+            torch_dtype=torch.float16)
     #
     return model, tokenizer


-def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None, load_model: bool = True):
+def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None,
+                                 load_model: bool = True):
     if model_dir is None:
-        model_id = "ZhipuAI/chatglm2-6b"
-        model_revision = "v1.0.3"
+        model_id = 'ZhipuAI/chatglm2-6b'
+        model_revision = 'v1.0.3'
         model_dir = snapshot_download(model_id, model_revision)
     #
     config = read_config(model_dir)
-    config["model"] = ConfigDict({
-        "type": "chatglm2-6b"
-    })
+    config['model'] = ConfigDict({'type': 'chatglm2-6b'})
     tokenizer = ChatGLM2Tokenizer.from_pretrained(model_dir)
     model = None
     if load_model:
         model = Model.from_pretrained(
-            model_dir, cfg_dict=config, device_map='auto', torch_dtype=torch.float16)
+            model_dir,
+            cfg_dict=config,
+            device_map='auto',
+            torch_dtype=torch.float16)
     return model, tokenizer


-def make_dataset(split: str,
-                 tokenize_function: Callable[[str, str, Optional[str]], Dict[str, Any]]) -> MyDataset:
+def make_dataset(
+    split: str, tokenize_function: Callable[[str, str, Optional[str]],
+                                            Dict[str, Any]]
+) -> MyDataset:
     """
     split: Literal["train", "validation"]
     """
-    dataset = MsDataset.load('modelscope/ms_hackathon_23_agent_train_dev', split=split)
+    dataset = MsDataset.load(
+        'modelscope/ms_hackathon_23_agent_train_dev', split=split)
     system = []
     user = []
     assistant = []
     for d in dataset:
-        content = ast.literal_eval(d["conversations"])
-        s = content[0]["value"]
+        content = ast.literal_eval(d['conversations'])
+        s = content[0]['value']
         assert len(content) % 2 == 1
         for i in range(len(content) // 2):
             system.append(s)
-            user.append(content[2 * i + 1]["value"])
-            assistant.append(content[2 * i + 2]["value"])
+            user.append(content[2 * i + 1]['value'])
+            assistant.append(content[2 * i + 2]['value'])
     return MyDataset(system, user, assistant, tokenize_function)
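Given the commit message, these helpers are presumably wired together in lora_inference.py roughly as follows. The LoRA hyperparameters, the 'pack' target (baichuan-7B fuses q/k/v into W_pack layers), and the Swift.prepare_model entry point are assumptions, not part of this hunk:

from functools import partial

from modelscope.swift import LoRAConfig, Swift

model, tokenizer = get_baichuan_model_tokenizer()
lora_config = LoRAConfig(
    replace_modules=['pack'],  # assumed: matches baichuan-7B's W_pack layers
    rank=8,
    lora_alpha=32,
    lora_dropout=0.1)
model = Swift.prepare_model(model, lora_config)  # wraps targets with LoRA

train_dataset = make_dataset('train',
                             partial(tokenize_function, tokenizer=tokenizer))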
@@ -339,21 +378,22 @@ Item = Dict[str, float]

 def read_tensorboard_file(fpath: str) -> Dict[str, List[Item]]:
     if not os.path.isfile(fpath):
-        raise FileNotFoundError(f"fpath: {fpath}")
+        raise FileNotFoundError(f'fpath: {fpath}')
     ea = EventAccumulator(fpath)
     ea.Reload()
     res = {}
-    tags = ea.Tags()["scalars"]
+    tags = ea.Tags()['scalars']
     for tag in tags:
         values = ea.Scalars(tag)
         r = []
         for v in values:
-            r.append({"step": v.step, "value": v.value})
+            r.append({'step': v.step, 'value': v.value})
         res[tag] = r
     return res


-def tensorboard_smoothing(values: List[float], smooth: float = 0.9) -> List[float]:
+def tensorboard_smoothing(values: List[float],
+                          smooth: float = 0.9) -> List[float]:
     norm_factor = 1
     x = 0
     res = []
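The smoothing loop itself sits between hunks. Consistent with the norm_factor/x initialisation shown, it is presumably TensorBoard-style bias-corrected exponential smoothing; one way to write it:

from typing import List

def ema_sketch(values: List[float], smooth: float = 0.9) -> List[float]:
    norm_factor = 1.  # tracks smooth**t for bias correction
    x = 0.
    res = []
    for v in values:
        x = x * smooth + (1 - smooth) * v
        norm_factor *= smooth
        res.append(x / (1 - norm_factor))  # debias the zero initialisation
    return res

assert all(abs(y - 1.0) < 1e-12 for y in ema_sketch([1.0, 1.0, 1.0]))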
@@ -366,12 +406,12 @@ def tensorboard_smoothing(values: List[float], smooth: float = 0.9) -> List[floa
     return res


-def plot_image(data: Dict[str, List[Item]], key_name: str, smooth: float) -> Figure:
+def plot_image(data: Dict[str, List[Item]], key_name: str,
+               smooth: float) -> Figure:
     _data = data[key_name]
-    steps = [d["step"] for d in _data]
-    values = [d["value"] for d in _data]
-    fig, ax = plt.subplots(1, 1, squeeze=True,
-                           figsize=(8, 5), dpi=100)
+    steps = [d['step'] for d in _data]
+    values = [d['value'] for d in _data]
+    fig, ax = plt.subplots(1, 1, squeeze=True, figsize=(8, 5), dpi=100)
     ax.set_title(key_name)
     if smooth != 0:
         ax.plot(steps, values, color=COLOR)
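A sketch of how the three tensorboard helpers compose; the event-file path and the 'loss' tag are hypothetical:

fpath = '/path/to/events.out.tfevents.0'  # hypothetical event file
data = read_tensorboard_file(fpath)
smoothed = tensorboard_smoothing([d['value'] for d in data['loss']])
fig = plot_image(data, 'loss', smooth=0.9)
fig.savefig('loss.png')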