Merge branch 'modelscope:master' into custom_diffusion

This commit is contained in:
Wang Qiang
2023-07-12 10:08:11 +08:00
committed by GitHub
19 changed files with 1230 additions and 186 deletions

View File

@@ -0,0 +1,449 @@
import ast
import datetime as dt
import math
import os
import random
import re
import sys
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import json
import matplotlib.pyplot as plt
import numpy as np
#
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset as HFDataset
from datasets import concatenate_datasets
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from numpy import ndarray
from tensorboard.backend.event_processing.event_accumulator import \
EventAccumulator
from torch import Tensor
from torch import device as Device
from torch import dtype as Dtype
from torch.nn import Module
from torch.nn.parameter import Parameter
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Optimizer
from torch.optim import lr_scheduler as lrs
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import Dataset
#
from torchmetrics import Accuracy, MeanMetric
#
from tqdm import tqdm
#
from modelscope import (Model, MsDataset, get_logger, read_config,
snapshot_download)
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS
from modelscope.models.nlp.chatglm2 import ChatGLM2Tokenizer
from modelscope.msdatasets.dataset_cls.custom_datasets import \
TorchCustomDataset
from modelscope.swift import LoRAConfig, Swift
from modelscope.trainers import EpochBasedTrainer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.registry import default_group
#
# Fraction of the merged dataset held out as the validation split.
TEST_SPLIT_P = 0.01
# Fixed seed for the train/test split so runs are comparable.
SPLIT_SEED = 42
# Token sequences longer than this are truncated from the left; None disables truncation.
MAX_LENGTH: Optional[int] = 2048
# Plot colors: raw curve (light) and smoothed/highlight curve (dark).
COLOR, COLOR_S = '#FFE2D9', '#FF7043'
# Chat prompt template; the Chinese headers mean "### User" / "### AI assistant".
PROMPT = """### 用户
{instruction}
### AI助手
"""
logger = get_logger()
#
def get_model_dir(model_id: str, model_revision: Optional[str] = None) -> str:
    """Resolve a ModelScope model id (and optional revision) to a local snapshot dir."""
    return snapshot_download(model_id, model_revision)
def _get_version(work_dir: str) -> int:
if os.path.isdir(work_dir):
fnames = os.listdir(work_dir)
else:
fnames = []
v_list = [-1]
for fname in fnames:
m = re.match(r'v(\d+)', fname)
if m is None:
continue
v = m.group(1)
v_list.append(int(v))
return max(v_list) + 1
def get_work_dir(work_dir: str) -> str:
    """Return a fresh versioned run directory path: `<work_dir>/v{N}-{timestamp}`."""
    base = os.path.abspath(work_dir)
    version = _get_version(base)
    stamp = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
    #
    run_dir = os.path.join(base, f'v{version}-{stamp}')
    logger.info(f'work_dir: {run_dir}')
    return run_dir
def select_device(device_ids: List[int]) -> Device:
    """Restrict visible CUDA devices and return the master device.

    Call this function before cuda is initialized: it works by setting
    `CUDA_VISIBLE_DEVICES`, which has no effect once CUDA is up.
    An empty *device_ids* selects the CPU.
    Return: master device
    """
    if torch.cuda.is_initialized():
        # Too late to change device visibility; fall back to the default GPU.
        logger.warning('CUDA has been initialized! Device selection fails!')
        return torch.device('cuda:0')
    #
    log_s = 'Using device: '
    if len(device_ids) == 0:  # cpu
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        device: str = 'cpu'
        log_s += device
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
            [str(d) for d in device_ids])
        assert torch.cuda.is_available(
        ) and torch.cuda.device_count() >= len(device_ids)
        log_s += f"cuda:{','.join([str(d) for d in device_ids])}"  # e.g. 'cuda:1,7,8'
        # After remapping, CUDA ordinal 0 is the first entry of `device_ids`.
        device = 'cuda:0'
    logger.info(log_s)
    return torch.device(device)
def seed_everything(seed: Optional[int] = None, gpu_dtm: bool = False) -> int:
    """Seed python/numpy/torch RNGs; optionally make cuDNN deterministic.

    Returns the seed actually used (randomly drawn when *seed* is None).
    """
    if seed is None:
        seed = random.randint(0, np.iinfo(np.int32).max)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    logger.info(f'Global seed set to {seed}')
    if gpu_dtm:
        # Trade kernel-selection speed for reproducible cuDNN results.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        logger.info(f'Setting deterministic: {True}, benchmark: {False}')
    return seed
def get_T_max(dataset_len: int, batch_size: int, max_epochs: int,
              drop_last: bool) -> int:
    """Calculate T_max (total optimizer steps) for CosineAnnealingLR."""
    steps_per_epoch = (dataset_len // batch_size
                       if drop_last else math.ceil(dataset_len / batch_size))
    return steps_per_epoch * max_epochs
def tokenize_function(example: Dict[str, str], tokenizer) -> Dict[str, Any]:
    """Build `input_ids`/`labels` for one instruction-tuning example.

    Only applicable to baichuan and chatglm2. Other models need to be tested.

    The prompt part of `labels` is masked with -100 so loss is computed only
    on the response tokens. When `example['output']` is None (inference),
    `labels` is None and `input_ids` ends with BOS so generation starts there.
    """
    instruction = example['instruction']
    input_: str = example['input']
    if input_ is not None and input_ != '':
        # Fold the optional `input` field into the instruction, stripping a
        # leading Chinese "输入:" ("input:") marker when present.
        if input_.startswith('输入:'):
            instruction = instruction + input_[3:]
        else:
            instruction = instruction + input_
    output = example['output']
    # Fix: `add_special_tokens=False` was previously passed to `str.format`,
    # where extra keyword arguments are silently ignored — it belongs only to
    # tokenizer calls, so it is dropped here (behavior unchanged).
    src_text = PROMPT.format(instruction=instruction)
    src_input_ids: List[int] = tokenizer(
        src_text, return_attention_mask=False,
        add_special_tokens=True)['input_ids']
    # tokenizer.bos_token_id: Avoid `tgt_input_ids` being empty
    tgt_input_ids = [tokenizer.bos_token_id]
    if output is not None:
        tgt_input_ids += tokenizer(
            output, return_attention_mask=False,
            add_special_tokens=False)['input_ids']
        tgt_input_ids += [tokenizer.eos_token_id]
        # -100 is ignored by cross-entropy: no loss on the prompt tokens.
        labels = [-100] * len(src_input_ids) + tgt_input_ids
    else:
        labels = None
    input_ids = src_input_ids + tgt_input_ids
    #
    if MAX_LENGTH is not None:
        # Truncate from the left so the response (and EOS) is preserved.
        input_ids = input_ids[-MAX_LENGTH:]
        if labels is not None:
            labels = labels[-MAX_LENGTH:]
    #
    return {'input_ids': input_ids, 'labels': labels}
def stat_dataset(dataset: HFDataset) -> None:
    """Log mean/std/min/max statistics of tokenized sequence lengths."""
    token_len = np.array([len(sample['input_ids']) for sample in dataset])
    mean, std = token_len.mean().item(), token_len.std().item()
    min_, max_ = token_len.min().item(), token_len.max().item()
    logger.info(
        f'Dataset Token Length: {mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={token_len.shape[0]}'
    )
def print_examples(examples: Dict[str, Any], tokenizer) -> None:
    """Decode and print one tokenized example for eyeballing.

    Label positions equal to -100 (masked prompt tokens) are decoded as
    token id 0, purely for display purposes.
    """
    input_ids, labels = examples['input_ids'], examples['labels']
    print(f'[INPUT_IDS] {tokenizer.decode(input_ids)}')
    print()
    # Fix: the tag was previously misspelled '[LABLES]'.
    print(
        f'[LABELS] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}'
    )
def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]:
    """Right-pad a list of tokenized samples into batched tensors.

    `input_ids` is padded with `tokenizer.pad_token_id`, `attention_mask`
    with 0, and `labels` with -100 (ignored by the loss).
    """
    input_ids = [torch.tensor(sample['input_ids']) for sample in batch]
    labels = [torch.tensor(sample['labels']) for sample in batch]
    attention_mask = [
        torch.ones_like(ids, dtype=torch.int64) for ids in input_ids
    ]
    #
    padded_input_ids = pad_sequence(
        input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_attention_mask = pad_sequence(
        attention_mask, batch_first=True, padding_value=0)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    return {
        'input_ids': padded_input_ids,
        'attention_mask': padded_attention_mask,
        'labels': padded_labels
    }
def print_model_info(model: Module, name: Optional[str] = None) -> None:
    """Log total / trainable parameter and buffer counts (in millions)."""
    if name is None:
        name = model.__class__.__name__
    #
    n_params = sum(p.numel() for p in model.parameters())
    n_grads = sum(p.numel() for p in model.parameters() if p.requires_grad)
    n_buffers = sum(p.numel() for p in model.buffers())
    #
    n_params /= 1e6
    n_grads /= 1e6
    n_buffers /= 1e6
    s = [
        f'{name}: ',
        f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ',
        f'{n_buffers:.4f}M Buffers',
    ]
    # Fix idiom: `s += '.'` extended the list with the *characters* of the
    # string and only worked because '.' is a single char; append explicitly.
    s.append('.')
    logger.info(''.join(s))
def show_freeze_layers(model: Module, max_lines: int = 20) -> None:
    """Log `requires_grad` for up to *max_lines* named parameters of *model*."""
    for idx, (param_name, param) in enumerate(model.named_parameters()):
        if idx >= max_lines:
            logger.info('...')
            break
        logger.info(f'{param_name}: requires_grad={param.requires_grad}')
@METRICS.register_module(group_key=default_group, module_name='my_metric')
class MyMetric(Metric):
    """Token-level accuracy + mean loss metric for causal-LM evaluation."""

    def __init__(self, vocab_size: int):
        # NOTE(review): `super().__init__()` is not called — confirm the
        # `Metric` base class does not require initialization.
        self.acc = Accuracy('multiclass', num_classes=vocab_size)
        self.loss = MeanMetric()

    def add(self, outputs: Dict[str, Any], inputs: Dict[str, Any]) -> None:
        """Accumulate one batch.

        `outputs` is accessed by attribute (`.loss`, `.logits`), so it is a
        model-output object despite the Dict annotation; `inputs` must
        contain a 'labels' tensor with -100 at masked positions.
        """
        loss: Tensor = outputs.loss
        self.loss.update(loss)
        #
        labels: Tensor = inputs['labels']
        # Causal shift: token t is predicted from logits at position t-1.
        labels = labels[:, 1:]
        labels_mask = labels != -100  # -100 marks prompt/padding positions
        logits: Tensor = outputs.logits[:, :-1]
        logits = logits[labels_mask].contiguous().view(-1, logits.shape[-1])
        pred = logits.argmax(dim=-1)
        labels = labels[labels_mask].to(logits.device)
        self.acc.update(pred, labels)

    def evaluate(self) -> Dict[str, float]:
        """Return the aggregated accuracy and mean loss."""
        return {
            'acc': self.acc.compute().item(),
            'loss': self.loss.compute().item()
        }

    def merge(self, other: 'MyMetric') -> None:
        """This script does not support ddp"""
        raise NotImplementedError
def get_baichuan7B_model_tokenizer(model_dir: Optional[str] = None,
                                   load_model: bool = True):
    """Load the baichuan-7B model/tokenizer from a ModelScope snapshot.

    Downloads the snapshot when *model_dir* is None. Returns
    ``(model, tokenizer)``; *model* is None when *load_model* is False.
    """
    if model_dir is None:
        model_id = 'baichuan-inc/baichuan-7B'
        model_dir = get_model_dir(model_id, None)
    #
    # The modeling/tokenization code ships inside the snapshot itself, so
    # make it importable. NOTE(review): this mutates sys.path globally.
    sys.path.insert(0, model_dir)
    from configuration_baichuan import BaiChuanConfig
    from tokenization_baichuan import BaiChuanTokenizer
    from modeling_baichuan import BaiChuanForCausalLM
    model_config = BaiChuanConfig.from_pretrained(model_dir)
    model_config.torch_dtype = torch.float16
    logger.info(f'model_config: {model_config}')
    tokenizer = BaiChuanTokenizer.from_pretrained(model_dir)
    model = None
    if load_model:
        # device_map='auto' shards the fp16 weights across visible devices.
        model = BaiChuanForCausalLM.from_pretrained(
            model_dir,
            config=model_config,
            device_map='auto',
            torch_dtype=torch.float16)
    #
    return model, tokenizer
def get_baichuan13B_model_tokenizer(model_dir: Optional[str] = None,
                                    load_model: bool = True):
    """Load Baichuan-13B-Base (revision v1.0.1) from a ModelScope snapshot.

    Mirrors `get_baichuan7B_model_tokenizer`; note the 13B snapshot uses
    different class capitalization (``Baichuan*`` vs ``BaiChuan*``).
    Returns ``(model, tokenizer)``; *model* is None when *load_model* is False.
    """
    if model_dir is None:
        model_id = 'baichuan-inc/Baichuan-13B-Base'
        model_dir = get_model_dir(model_id, 'v1.0.1')
    #
    # Make the snapshot's bundled modeling code importable.
    # NOTE(review): this mutates sys.path globally.
    sys.path.insert(0, model_dir)
    from configuration_baichuan import BaichuanConfig
    from tokenization_baichuan import BaichuanTokenizer
    from modeling_baichuan import BaichuanForCausalLM
    model_config = BaichuanConfig.from_pretrained(model_dir)
    model_config.torch_dtype = torch.float16
    logger.info(f'model_config: {model_config}')
    tokenizer = BaichuanTokenizer.from_pretrained(model_dir)
    model = None
    if load_model:
        # device_map='auto' shards the fp16 weights across visible devices.
        model = BaichuanForCausalLM.from_pretrained(
            model_dir,
            config=model_config,
            device_map='auto',
            torch_dtype=torch.float16)
    #
    return model, tokenizer
def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None,
                                 load_model: bool = True):
    """Load chatglm2-6b via ModelScope's Model registry.

    Downloads the snapshot when *model_dir* is None. Returns
    ``(model, tokenizer)``; *model* is None when *load_model* is False.
    """
    if model_dir is None:
        model_id = 'ZhipuAI/chatglm2-6b'
        model_dir = snapshot_download(model_id, None)
    #
    config = read_config(model_dir)
    # Force the registry type so Model.from_pretrained builds chatglm2-6b.
    config['model'] = ConfigDict({'type': 'chatglm2-6b'})
    tokenizer = ChatGLM2Tokenizer.from_pretrained(model_dir)
    model = None
    if load_model:
        model = Model.from_pretrained(
            model_dir,
            cfg_dict=config,
            device_map='auto',
            torch_dtype=torch.float16)
    return model, tokenizer
def get_alpaca_en_zh_dataset(
        tokenize_function,
        only_val: bool = False) -> Tuple[HFDataset, HFDataset]:
    """Load, merge and split the en/zh alpaca-gpt4 instruction datasets.

    Downloads both datasets from ModelScope, concatenates them, and splits
    them with the module-level `TEST_SPLIT_P` / `SPLIT_SEED`. When
    *tokenize_function* is not None each example is mapped to
    `input_ids`/`labels` and the raw text columns are dropped.
    Returns ``(train, test)``, or ``(None, test)`` when *only_val* is True.
    """
    dataset_en: HFDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset()
    dataset_zh: HFDataset = MsDataset.load(
        'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset()
    # Only the en dataset carries a pre-rendered 'text' column; drop it so
    # the two schemas match for concatenation.
    dataset_en = dataset_en.remove_columns(['text'])
    dataset: HFDataset = concatenate_datasets([dataset_zh, dataset_en])
    #
    # dataset = dataset.select(range(1000))  # for debug
    dataset = dataset.train_test_split(TEST_SPLIT_P, seed=SPLIT_SEED)
    if only_val:
        dataset = dataset['test']
    if tokenize_function is not None:
        dataset = dataset.map(tokenize_function)
        dataset = dataset.remove_columns(['instruction', 'input', 'output'])
    #
    if only_val:
        return None, dataset
    else:
        return dataset['train'], dataset['test']
Item = Dict[str, float]
def read_tensorboard_file(fpath: str) -> Dict[str, List[Item]]:
    """Parse a tensorboard event file into `{tag: [{'step', 'value'}, ...]}`."""
    if not os.path.isfile(fpath):
        raise FileNotFoundError(f'fpath: {fpath}')
    accumulator = EventAccumulator(fpath)
    accumulator.Reload()  # actually read the events from disk
    return {
        tag: [{
            'step': event.step,
            'value': event.value
        } for event in accumulator.Scalars(tag)]
        for tag in accumulator.Tags()['scalars']
    }
def tensorboard_smoothing(values: List[float],
                          smooth: float = 0.9) -> List[float]:
    """Debias-corrected exponential moving average, as TensorBoard draws it.

    Element i equals sum_j(smooth^(i-j) * values[j]) / sum_j(smooth^(i-j)),
    so a constant series stays constant regardless of *smooth*.
    """
    res: List[float] = []
    ema = 0.0
    norm = 0.0
    for v in values:
        ema = ema * smooth + v  # Exponential decay
        norm = norm * smooth + 1  # running normalization factor
        res.append(ema / norm)
    return res
def plot_image(tb_dir: str,
               smooth_key: List[str],
               smooth_val: float = 0.9,
               figsize: Tuple[int, int] = (8, 5),
               dpi: int = 100) -> None:
    """Render every scalar series of a tensorboard run directory as an image.

    Images are written to ``<parent-of-tb_dir>/images``. Tags listed in
    *smooth_key* are drawn twice: the raw curve in a light color plus an
    exponentially smoothed curve (factor *smooth_val*) on top.
    """
    image_dir = os.path.join(os.path.dirname(tb_dir), 'images')
    os.makedirs(image_dir, exist_ok=True)
    #
    # Assumes the run directory contains a single event file — TODO confirm.
    fname = os.listdir(tb_dir)[0]
    tb_path = os.path.join(tb_dir, fname)
    data = read_tensorboard_file(tb_path)
    #
    for k in data.keys():
        _data = data[k]
        steps = [d['step'] for d in _data]
        values = [d['value'] for d in _data]
        if len(values) == 0:
            continue
        fig, ax = plt.subplots(1, 1, squeeze=True, figsize=figsize, dpi=dpi)
        ax.set_title(k)
        if len(values) == 1:
            # A single point cannot be drawn as a line.
            ax.scatter(steps, values, color=COLOR_S)
        elif k in smooth_key:
            ax.plot(steps, values, color=COLOR)
            values_s = tensorboard_smoothing(values, smooth_val)
            ax.plot(steps, values_s, color=COLOR_S)
        else:
            ax.plot(steps, values, color=COLOR_S)
        fpath = os.path.join(image_dir, k.replace('/', '_'))
        plt.savefig(fpath, dpi=dpi, bbox_inches='tight')
        # Fix: close the figure after saving; previously every tag leaked an
        # open figure (matplotlib warns after 20 and memory grows unbounded).
        plt.close(fig)

View File

@@ -0,0 +1,62 @@
# ### Setting up experimental environment.
from _common import *
from transformers import TextStreamer
device_ids = [0, 1]
logger.info(device_ids)
select_device(device_ids)
# ### Loading Model and Tokenizer
# Note: You need to set the value of `CKPT_FPATH`
BAICHUAN_TYPE = '13B'  # Literal['7B', '13B']
# Fix: variable was misspelled `CKPT_FAPTH` (its own comment above says FPATH).
CKPT_FPATH = '/path/to/your/xxx.pth'  # LoRA checkpoint from the finetuning script
LORA_TARGET_MODULES = ['W_pack']
if BAICHUAN_TYPE == '7B':
    model, tokenizer = get_baichuan7B_model_tokenizer()
else:
    model, tokenizer = get_baichuan13B_model_tokenizer()
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
model.bfloat16()  # Consistent with training
# ### Preparing lora
LORA_RANK = 8
LORA_ALPHA = 32
LORA_DROPOUT_P = 0  # Arbitrary value: dropout is inactive at inference time
lora_config = LoRAConfig(
    replace_modules=LORA_TARGET_MODULES,
    rank=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT_P,
    pretrained_weights=CKPT_FPATH)
logger.info(f'lora_config: {lora_config}')
Swift.prepare_model(model, lora_config)
# ### Loading Dataset
_, test_dataset = get_alpaca_en_zh_dataset(None, True)
# ### Inference
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Fix: `test_dataset[:5]` on an HF Dataset returns a dict of *columns*, so the
# old `for d in test_dataset[:5]` iterated column names (strings) and
# `d['output']` would fail. Index the rows explicitly instead.
for i in range(min(5, len(test_dataset))):
    d = dict(test_dataset[i])
    output = d['output']
    d['output'] = None  # ask the model to generate the answer itself
    input_ids = tokenize_function(d, tokenizer)['input_ids']
    print(f'[TEST]{tokenizer.decode(input_ids)}', end='')
    input_ids = torch.tensor(input_ids)[None].cuda()
    attention_mask = torch.ones_like(input_ids)
    generate_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=512,
        attention_mask=attention_mask,
        streamer=streamer,
        pad_token_id=tokenizer.pad_token_id,
        temperature=0.7,
        top_k=50,
        do_sample=True)
    print()
    print(f'[LABELS]{output}')
    print(
        '-----------------------------------------------------------------------------------'
    )
    # input('next[ENTER]')

View File

@@ -0,0 +1,199 @@
# ### Setting up experimental environment.
"""
pip install modelscope
pip install numpy pandas matplotlib scikit-learn
pip install transformers datasets
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install tqdm
pip install tensorboard
pip install torchmetrics
pip install sentencepiece
pip install accelerate
pip install numpy -U # Resolve torchmetrics dependencies and update numpy
"""
# Pulls in torch, modelscope helpers, tokenize_function, data_collate_fn, etc.
from _common import *
device_ids = [0, 1, 2, 3]
logger.info(device_ids)
select_device(device_ids)  # must run before CUDA is initialized
seed_everything(42)
# ### Loading Model and Tokenizer
BAICHUAN_TYPE = '13B'  # Literal['7B', '13B']
WORK_DIR = f'runs/baichuan_{BAICHUAN_TYPE}'
# baichuan packs q/k/v into a single `W_pack` projection — the LoRA target.
LORA_TARGET_MODULES = ['W_pack']
#
if BAICHUAN_TYPE == '7B':
    model_id = 'baichuan-inc/baichuan-7B'
    model_dir = get_model_dir(model_id, None)
    model, tokenizer = get_baichuan7B_model_tokenizer(model_dir)
else:
    model_id = 'baichuan-inc/Baichuan-13B-Base'
    model_dir = get_model_dir(model_id, 'v1.0.1')
    model, tokenizer = get_baichuan13B_model_tokenizer(model_dir)
#
GRADIENT_CHECKPOINTING = True
if GRADIENT_CHECKPOINTING:
    # baichuan13B does not implement the `get_input_embeddings` function
    # needed by `enable_input_require_grads`, so patch it onto the class.
    if BAICHUAN_TYPE == '13B':
        def get_input_embeddings(self):
            return self.model.embed_tokens
        # NOTE(review): assigns a *bound* method to the class attribute —
        # works for this single-model script, but is shared by all instances.
        model.__class__.get_input_embeddings = get_input_embeddings.__get__(
            model)
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
#
logger.info(
    f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, '
    f'pad_token_id: {tokenizer.pad_token_id}')
# ### Preparing lora
LORA_RANK = 8
LORA_ALPHA = 32
LORA_DROPOUT_P = 0.1
lora_config = LoRAConfig(
    replace_modules=LORA_TARGET_MODULES,
    rank=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT_P)
logger.info(f'lora_config: {lora_config}')
Swift.prepare_model(model, lora_config)
#
show_freeze_layers(model)
print_model_info(model)
# Spot-check device/dtype of an arbitrary parameter.
_p = list(model.parameters())[100]
logger.info(f'device: {_p.device}, dtype: {_p.dtype}')
model.bfloat16()
# ### Loading Dataset
tokenize_function = partial(tokenize_function, tokenizer=tokenizer)
train_dataset, val_dataset = get_alpaca_en_zh_dataset(tokenize_function)
# Data analysis
stat_dataset(train_dataset)
stat_dataset(val_dataset)
data_collate_fn = partial(data_collate_fn, tokenizer=tokenizer)
print_examples(train_dataset[0], tokenizer)
# ### Setting Config
cfg_file = os.path.join(model_dir, 'configuration.json')
#
BATCH_SIZE = 1
MAX_EPOCHS = 1
# Total scheduler steps: floor(len(train)/batch) * epochs (drop_last=True).
T_max = get_T_max(len(train_dataset), BATCH_SIZE, MAX_EPOCHS, True)
WORK_DIR = get_work_dir(WORK_DIR)  # fresh v{N}-{timestamp} run directory
EVAL_INTERVAL = 500  # iterations between checkpoint saves / evaluations
# Training config overlaid onto the model's default config via cfg_modify_fn.
CONFIG = Config({
    'train': {
        'dataloader': {
            'batch_size_per_gpu': BATCH_SIZE,
            'workers_per_gpu': 1,
            'shuffle': True,
            'drop_last': True,
            'pin_memory': True
        },
        'max_epochs':
        MAX_EPOCHS,
        'work_dir':
        WORK_DIR,
        'optimizer': {
            'type': 'AdamW',
            'lr': 1e-4,
            'weight_decay': 0.01,
            'options': {
                # Gradient accumulation: effective batch size = 16 * BATCH_SIZE.
                'cumulative_iters': 16,
                'grad_clip': {
                    'norm_type': 2,
                    'max_norm': 2.0
                }
            }
        },
        'lr_scheduler': {
            'type': 'CosineAnnealingLR',
            'T_max': T_max,
            'eta_min': 1e-5,
            'options': {
                'by_epoch': False,  # step the scheduler per iteration
                'warmup': {
                    'type': 'LinearWarmup',
                    'warmup_ratio': 0.1,
                    'warmup_iters': 200
                }
            }
        },
        'hooks': [
            {
                'type': 'CheckpointHook',
                'by_epoch': False,
                'interval': EVAL_INTERVAL,
                'max_checkpoint_num': 1
            },
            {
                'type': 'EvaluationHook',
                'by_epoch': False,
                'interval': EVAL_INTERVAL
            },
            {
                # Keeps the checkpoint with the best token accuracy.
                'type': 'BestCkptSaverHook',
                'metric_key': 'acc',
                'save_best': True,
                'rule': 'max',
                'max_checkpoint_num': 1
            },
            {
                'type': 'TextLoggerHook',
                'by_epoch': True,  # Whether EpochBasedTrainer is used
                'interval': 5
            },
            {
                'type': 'TensorboardHook',
                'by_epoch': False,
                'interval': 5
            }
        ]
    },
    'evaluation': {
        'dataloader': {
            'batch_size_per_gpu': BATCH_SIZE,
            'workers_per_gpu': 1,
            'shuffle': False,
            'drop_last': False,
            'pin_memory': True
        },
        'metrics': [{
            # MyMetric registered in _common.py under this name.
            'type': 'my_metric',
            'vocab_size': tokenizer.vocab_size
        }]
    }
})
# ### Finetuning
def cfg_modify_fn(cfg: Config) -> Config:
    """Overlay the training CONFIG onto the model's default configuration."""
    cfg.update(CONFIG)
    return cfg
trainer = EpochBasedTrainer(
    model=model,
    cfg_file=cfg_file,
    data_collator=data_collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    remove_unused_data=True,
    seed=42,
    device='cpu',  # No placement for model, leave the model to `device_map`
    cfg_modify_fn=cfg_modify_fn,
)
trainer.train()
# ### Visualization
# Plot the tensorboard scalars; the 'loss' curve gets smoothing factor 0.9.
tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')
plot_image(tb_dir, ['loss'], 0.9)

View File

@@ -0,0 +1,60 @@
# ### Setting up experimental environment.
from _common import *
from transformers import TextStreamer
device_ids = [0, 1]
logger.info(device_ids)
select_device(device_ids)
# ### Loading Model and Tokenizer
# Note: You need to set the value of `CKPT_FPATH`
# Fix: variable was misspelled `CKPT_FAPTH` (its own comment above says FPATH).
CKPT_FPATH = '/path/to/your/xxx.pth'  # LoRA checkpoint from the finetuning script
LORA_TARGET_MODULES = ['query_key_value']
model, tokenizer = get_chatglm2_model_tokenizer()
if tokenizer.eos_token_id is None:
    tokenizer.eos_token_id = tokenizer.pad_token_id
if tokenizer.bos_token_id is None:
    tokenizer.bos_token_id = 1  # fallback BOS id, matching the training script
model.bfloat16()  # Consistent with training
# ### Preparing lora
LORA_RANK = 8
LORA_ALPHA = 32
LORA_DROPOUT_P = 0  # Arbitrary value: dropout is inactive at inference time
lora_config = LoRAConfig(
    replace_modules=LORA_TARGET_MODULES,
    rank=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT_P,
    pretrained_weights=CKPT_FPATH)
logger.info(f'lora_config: {lora_config}')
Swift.prepare_model(model, lora_config)
# ### Loading Dataset
_, test_dataset = get_alpaca_en_zh_dataset(None, True)
# ### Inference
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Fix: `test_dataset[:5]` on an HF Dataset returns a dict of *columns*, so the
# old `for d in test_dataset[:5]` iterated column names (strings) and
# `d['output']` would fail. Index the rows explicitly instead.
for i in range(min(5, len(test_dataset))):
    d = dict(test_dataset[i])
    output = d['output']
    d['output'] = None  # ask the model to generate the answer itself
    input_ids = tokenize_function(d, tokenizer)['input_ids']
    print(f'[TEST]{tokenizer.decode(input_ids)}', end='')
    input_ids = torch.tensor(input_ids)[None].cuda()
    attention_mask = torch.ones_like(input_ids)
    generate_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=512,
        attention_mask=attention_mask,
        streamer=streamer,
        pad_token_id=tokenizer.pad_token_id,
        temperature=0.7,
        top_k=50,
        do_sample=True)
    print()
    print(f'[LABELS]{output}')
    print(
        '-----------------------------------------------------------------------------------'
    )
    # input('next[ENTER]')

View File

@@ -0,0 +1,188 @@
# ### Setting up experimental environment.
"""
pip install modelscope
pip install numpy pandas matplotlib scikit-learn
pip install transformers datasets
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install tqdm
pip install tensorboard
pip install torchmetrics
pip install sentencepiece
pip install accelerate
pip install numpy -U # Resolve torchmetrics dependencies and update numpy
"""
# Pulls in torch, modelscope helpers, tokenize_function, data_collate_fn, etc.
from _common import *
device_ids = [0, 1, 2, 3]
logger.info(device_ids)
select_device(device_ids)  # must run before CUDA is initialized
seed_everything(42)
# ### Loading Model and Tokenizer
model_id = 'ZhipuAI/chatglm2-6b'
WORK_DIR = 'runs/chatglm2'
# chatglm2 fuses q/k/v into `query_key_value` — the LoRA target module.
LORA_TARGET_MODULES = ['query_key_value']
#
model_dir = get_model_dir(model_id, None)
model, tokenizer = get_chatglm2_model_tokenizer(model_dir)
# chatglm2 does not support gradient_checkpointing
GRADIENT_CHECKPOINTING = False
if GRADIENT_CHECKPOINTING:
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
logger.info(tokenizer.special_tokens)
if tokenizer.eos_token_id is None:
    tokenizer.eos_token_id = tokenizer.pad_token_id
if tokenizer.bos_token_id is None:
    # Fallback BOS id used by tokenize_function's target prefix.
    tokenizer.bos_token_id = 1
#
logger.info(
    f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, '
    f'pad_token_id: {tokenizer.pad_token_id}')
# ### Preparing lora
LORA_RANK = 8
LORA_ALPHA = 32
LORA_DROPOUT_P = 0.1
lora_config = LoRAConfig(
    replace_modules=LORA_TARGET_MODULES,
    rank=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT_P)
logger.info(f'lora_config: {lora_config}')
Swift.prepare_model(model, lora_config)
#
show_freeze_layers(model)
print_model_info(model)
# Spot-check device/dtype of an arbitrary parameter.
_p = list(model.parameters())[100]
logger.info(f'device: {_p.device}, dtype: {_p.dtype}')
model.bfloat16()
# ### Loading Dataset
tokenize_function = partial(tokenize_function, tokenizer=tokenizer)
train_dataset, val_dataset = get_alpaca_en_zh_dataset(tokenize_function)
# Data analysis
stat_dataset(train_dataset)
stat_dataset(val_dataset)
data_collate_fn = partial(data_collate_fn, tokenizer=tokenizer)
print_examples(train_dataset[0], tokenizer)
# ### Setting Config
cfg_file = os.path.join(model_dir, 'configuration.json')
#
BATCH_SIZE = 1
MAX_EPOCHS = 1
# Total scheduler steps: floor(len(train)/batch) * epochs (drop_last=True).
T_max = get_T_max(len(train_dataset), BATCH_SIZE, MAX_EPOCHS, True)
WORK_DIR = get_work_dir(WORK_DIR)  # fresh v{N}-{timestamp} run directory
EVAL_INTERVAL = 500  # iterations between checkpoint saves / evaluations
# Training config overlaid onto the model's default config via cfg_modify_fn.
CONFIG = Config({
    'train': {
        'dataloader': {
            'batch_size_per_gpu': BATCH_SIZE,
            'workers_per_gpu': 1,
            'shuffle': True,
            'drop_last': True,
            'pin_memory': True
        },
        'max_epochs':
        MAX_EPOCHS,
        'work_dir':
        WORK_DIR,
        'optimizer': {
            'type': 'AdamW',
            'lr': 1e-4,
            'weight_decay': 0.01,
            'options': {
                # Gradient accumulation: effective batch size = 16 * BATCH_SIZE.
                'cumulative_iters': 16,
                'grad_clip': {
                    'norm_type': 2,
                    'max_norm': 2.0
                }
            }
        },
        'lr_scheduler': {
            'type': 'CosineAnnealingLR',
            'T_max': T_max,
            'eta_min': 1e-5,
            'options': {
                'by_epoch': False,  # step the scheduler per iteration
                'warmup': {
                    'type': 'LinearWarmup',
                    'warmup_ratio': 0.1,
                    'warmup_iters': 200
                }
            }
        },
        'hooks': [
            {
                'type': 'CheckpointHook',
                'by_epoch': False,
                'interval': EVAL_INTERVAL,
                'max_checkpoint_num': 1
            },
            {
                'type': 'EvaluationHook',
                'by_epoch': False,
                'interval': EVAL_INTERVAL
            },
            {
                # Keeps the checkpoint with the best token accuracy.
                'type': 'BestCkptSaverHook',
                'metric_key': 'acc',
                'save_best': True,
                'rule': 'max',
                'max_checkpoint_num': 1
            },
            {
                'type': 'TextLoggerHook',
                'by_epoch': True,  # Whether EpochBasedTrainer is used
                'interval': 5
            },
            {
                'type': 'TensorboardHook',
                'by_epoch': False,
                'interval': 5
            }
        ]
    },
    'evaluation': {
        'dataloader': {
            'batch_size_per_gpu': BATCH_SIZE,
            'workers_per_gpu': 1,
            'shuffle': False,
            'drop_last': False,
            'pin_memory': True
        },
        'metrics': [{
            # MyMetric registered in _common.py under this name.
            'type': 'my_metric',
            'vocab_size': tokenizer.vocab_size
        }]
    }
})
# ### Finetuning
def cfg_modify_fn(cfg: Config) -> Config:
    """Overlay the training CONFIG onto the model's default configuration."""
    cfg.update(CONFIG)
    return cfg
trainer = EpochBasedTrainer(
    model=model,
    cfg_file=cfg_file,
    data_collator=data_collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    remove_unused_data=True,
    seed=42,
    device='cpu',  # No placement for model, leave the model to `device_map`
    cfg_modify_fn=cfg_modify_fn,
)
trainer.train()
# ### Visualization
# Plot the tensorboard scalars; the 'loss' curve gets smoothing factor 0.9.
tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')
plot_image(tb_dir, ['loss'], 0.9)

View File

@@ -111,7 +111,7 @@ def select_device(device_ids: List[int]) -> Device:
[str(d) for d in device_ids])
assert torch.cuda.is_available(
) and torch.cuda.device_count() >= len(device_ids)
log_s += f"cuda:{','.join([str(d) for d in device_ids])}" # e.g. "cuda:1,7,8"
log_s += f"cuda:{','.join([str(d) for d in device_ids])}" # e.g. 'cuda:1,7,8'
device = 'cuda:0'
logger.info(log_s)
return torch.device(device)
@@ -221,7 +221,7 @@ def print_examples(examples: Dict[str, Any], tokenizer) -> None:
print(f'[INPUT_IDS] {tokenizer.decode(input_ids)}')
print()
print(
f'[LABLES] {tokenizer.decode([l if l != -100 else 0 for l in labels])}'
f'[LABLES] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}'
)
@@ -334,8 +334,7 @@ def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None,
load_model: bool = True):
if model_dir is None:
model_id = 'ZhipuAI/chatglm2-6b'
model_revision = 'v1.0.3'
model_dir = snapshot_download(model_id, model_revision)
model_dir = snapshot_download(model_id, None)
#
config = read_config(model_dir)
config['model'] = ConfigDict({'type': 'chatglm2-6b'})
@@ -355,7 +354,7 @@ def make_dataset(
Dict[str, Any]]
) -> MyDataset:
"""
split: Literal["train", "validation"]
split: Literal['train', 'validation']
"""
dataset = MsDataset.load(
'modelscope/ms_hackathon_23_agent_train_dev', split=split)

View File

@@ -16,15 +16,6 @@
"### 配置实验环境"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install transformers"
]
},
{
"cell_type": "code",
"execution_count": 1,
@@ -62,7 +53,7 @@
"source": [
"from _common import *\n",
"from transformers import TextStreamer\n",
"device_ids = list(range(min(4, torch.cuda.device_count())))\n",
"device_ids = [0, 1]\n",
"logger.info(device_ids)\n",
"select_device(device_ids)"
]
@@ -152,8 +143,8 @@
}
],
"source": [
"CKPT_FAPTH = \"/home/hackathon/my_git/agent/runs/baichuan/v10-20230702-172449/output_best/pytorch_model.bin\"\n",
"LORA_TARGET_MODULES = [\"W_pack\"]\n",
"CKPT_FAPTH = '/home/hackathon/my_git/agent/runs/baichuan/v10-20230702-172449/output_best/pytorch_model.bin'\n",
"LORA_TARGET_MODULES = ['W_pack']\n",
"\n",
"model, tokenizer = get_baichuan_model_tokenizer()\n",
"if tokenizer.pad_token_id is None:\n",
@@ -225,7 +216,7 @@
" lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P,\n",
" pretrained_weights=CKPT_FAPTH)\n",
"logger.info(f\"lora_config: {lora_config}\")\n",
"logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)"
]
},
@@ -289,8 +280,8 @@
}
],
"source": [
"test_dataset = make_dataset(\"validation\", lambda system, user, assistant:\n",
" {\"system\": system, \"user\": user, \"assistant\": assistant})"
"test_dataset = make_dataset('validation', lambda system, user, assistant:\n",
" {'system': system, 'user': user, 'assistant': assistant})"
]
},
{
@@ -451,20 +442,21 @@
"source": [
"streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"for d in test_dataset[:5]:\n",
" system = d[\"system\"]\n",
" user = d[\"user\"]\n",
" assistant = d[\"assistant\"]\n",
" input_ids = tokenize_function(system, user, None, tokenizer)[\"input_ids\"]\n",
" print(f\"[TEST]{tokenizer.decode(input_ids)}\", end=\"\")\n",
" system = d['system']\n",
" user = d['user']\n",
" assistant = d['assistant']\n",
" input_ids = tokenize_function(system, user, None, tokenizer)['input_ids']\n",
" print(f'[TEST]{tokenizer.decode(input_ids)}', end='')\n",
" input_ids = torch.tensor(input_ids)[None].cuda()\n",
" attention_mask = torch.ones_like(input_ids)\n",
" generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n",
" attention_mask=attention_mask,\n",
" streamer=streamer, pad_token_id=tokenizer.pad_token_id)\n",
" streamer=streamer, pad_token_id=tokenizer.pad_token_id, \n",
" temperature=0.7, top_k=50, do_sample=True)\n",
" print()\n",
" print(f\"[LABELS]{assistant}\")\n",
" print(\"-----------------------------------------------------------------------------------\")\n",
" # input(\"next[ENTER]\")"
" print(f'[LABELS]{assistant}')\n",
" print('-----------------------------------------------------------------------------------')\n",
" # input('next[ENTER]')"
]
}
],
@@ -484,7 +476,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
"version": "3.10.12"
}
},
"nbformat": 4,

View File

@@ -36,10 +36,12 @@
"# !pip install modelscope -U\n",
"# !pip install numpy pandas matplotlib scikit-learn\n",
"# !pip install transformers datasets\n",
"# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n",
"# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
"# !pip install tqdm\n",
"# !pip install tensorboard\n",
"# !pip install torchmetrics\n",
"# !pip install sentencepiece\n",
"# !pip install accelerate\n",
"#\n",
"# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy"
]
@@ -73,7 +75,7 @@
],
"source": [
"from _common import *\n",
"device_ids = list(range(min(4, torch.cuda.device_count())))\n",
"device_ids = [0, 1, 2, 3]\n",
"logger.info(device_ids)\n",
"select_device(device_ids)\n",
"_ = seed_everything(42)"
@@ -130,9 +132,9 @@
}
],
"source": [
"model_id = \"baichuan-inc/baichuan-7B\"\n",
"WORK_DIR = \"runs/baichuan\"\n",
"LORA_TARGET_MODULES = [\"W_pack\"]\n",
"model_id = 'baichuan-inc/baichuan-7B'\n",
"WORK_DIR = 'runs/baichuan'\n",
"LORA_TARGET_MODULES = ['W_pack']\n",
"#\n",
"model_dir = get_model_dir(model_id, None)\n",
"model, tokenizer = get_baichuan_model_tokenizer(model_dir)\n",
@@ -144,8 +146,8 @@
"if tokenizer.pad_token_id is None:\n",
" tokenizer.pad_token_id = tokenizer.eos_token_id\n",
"#\n",
"logger.info(f\"bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, \"\n",
" f\"pad_token_id: {tokenizer.pad_token_id}\")"
"logger.info(f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, '\n",
" f'pad_token_id: {tokenizer.pad_token_id}')"
]
},
{
@@ -237,13 +239,13 @@
" rank=LORA_RANK,\n",
" lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P)\n",
"logger.info(f\"lora_config: {lora_config}\")\n",
"logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)\n",
"#\n",
"show_freeze_layers(model)\n",
"print_model_info(model)\n",
"_p = list(model.parameters())[100]\n",
"logger.info(f\"device: {_p.device}, dtype: {_p.dtype}\")\n",
"logger.info(f'device: {_p.device}, dtype: {_p.dtype}')\n",
"model.bfloat16()"
]
},
@@ -308,8 +310,8 @@
],
"source": [
"tokenize_function = partial(tokenize_function, tokenizer=tokenizer)\n",
"train_dataset = make_dataset(\"train\", tokenize_function)\n",
"val_dataset = make_dataset(\"validation\", tokenize_function)\n",
"train_dataset = make_dataset('train', tokenize_function)\n",
"val_dataset = make_dataset('validation', tokenize_function)\n",
"# Data analysis\n",
"stat_dataset(train_dataset)\n",
"stat_dataset(val_dataset)\n",
@@ -339,7 +341,7 @@
}
],
"source": [
"cfg_file = os.path.join(model_dir, \"configuration.json\")\n",
"cfg_file = os.path.join(model_dir, 'configuration.json')\n",
"#\n",
"BATCH_SIZE = 1\n",
"MAX_EPOCHS = 1\n",
@@ -347,62 +349,62 @@
"WORK_DIR = get_work_dir(WORK_DIR)\n",
"EVAL_INTERVAL = 200\n",
"CONFIG = Config({\n",
" \"train\": {\n",
" \"dataloader\": {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n",
" \"shuffle\": True,\n",
" \"drop_last\": True,\n",
" \"pin_memory\": True\n",
" 'train': {\n",
" 'dataloader': {\n",
" 'batch_size_per_gpu': BATCH_SIZE,\n",
" 'workers_per_gpu': 1,\n",
" 'shuffle': True,\n",
" 'drop_last': True,\n",
" 'pin_memory': True\n",
" },\n",
" \"max_epochs\": MAX_EPOCHS,\n",
" \"work_dir\": WORK_DIR,\n",
" \"optimizer\": {\n",
" \"type\": \"AdamW\",\n",
" \"lr\": 1e-4,\n",
" \"weight_decay\": 0.01,\n",
" \"options\": {\n",
" \"cumulative_iters\": 16, \"grad_clip\": {\n",
" \"norm_type\": 2,\n",
" \"max_norm\": 2.0\n",
" 'max_epochs': MAX_EPOCHS,\n",
" 'work_dir': WORK_DIR,\n",
" 'optimizer': {\n",
" 'type': 'AdamW',\n",
" 'lr': 1e-4,\n",
" 'weight_decay': 0.01,\n",
" 'options': {\n",
" 'cumulative_iters': 16, 'grad_clip': {\n",
" 'norm_type': 2,\n",
" 'max_norm': 2.0\n",
" }\n",
" }\n",
" },\n",
" \"lr_scheduler\": {\n",
" \"type\": \"CosineAnnealingLR\",\n",
" \"T_max\": T_max,\n",
" \"eta_min\": 1e-5,\n",
" \"options\": {\n",
" \"by_epoch\": False,\n",
" \"warmup\": {\n",
" 'lr_scheduler': {\n",
" 'type': 'CosineAnnealingLR',\n",
" 'T_max': T_max,\n",
" 'eta_min': 1e-5,\n",
" 'options': {\n",
" 'by_epoch': False,\n",
" 'warmup': {\n",
" 'type': 'LinearWarmup',\n",
" 'warmup_ratio': 0.1,\n",
" \"warmup_iters\": 200\n",
" 'warmup_iters': 200\n",
" }\n",
" }\n",
" },\n",
" \"hooks\": [\n",
" {\"type\": \"CheckpointHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL, \"max_checkpoint_num\": 1},\n",
" {\"type\": \"EvaluationHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL},\n",
" {\"type\": \"BestCkptSaverHook\",\n",
" \"metric_key\": \"acc\",\n",
" \"save_best\": True, \"rule\": \"max\", \"max_checkpoint_num\": 1},\n",
" {\"type\": \"TextLoggerHook\",\n",
" \"by_epoch\": True, # Whether EpochBasedTrainer is used\n",
" \"interval\": 5},\n",
" {\"type\": \"TensorboardHook\", \"by_epoch\": False, \"interval\": 5}\n",
" 'hooks': [\n",
" {'type': 'CheckpointHook', 'by_epoch': False, 'interval': EVAL_INTERVAL, 'max_checkpoint_num': 1},\n",
" {'type': 'EvaluationHook', 'by_epoch': False, 'interval': EVAL_INTERVAL},\n",
" {'type': 'BestCkptSaverHook',\n",
" 'metric_key': 'acc',\n",
" 'save_best': True, 'rule': 'max', 'max_checkpoint_num': 1},\n",
" {'type': 'TextLoggerHook',\n",
" 'by_epoch': True, # Whether EpochBasedTrainer is used\n",
" 'interval': 5},\n",
" {'type': 'TensorboardHook', 'by_epoch': False, 'interval': 5}\n",
" ]\n",
" },\n",
" \"evaluation\": {\n",
" \"dataloader\": {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n",
" \"shuffle\": False,\n",
" \"drop_last\": False,\n",
" \"pin_memory\": True\n",
" 'evaluation': {\n",
" 'dataloader': {\n",
" 'batch_size_per_gpu': BATCH_SIZE,\n",
" 'workers_per_gpu': 1,\n",
" 'shuffle': False,\n",
" 'drop_last': False,\n",
" 'pin_memory': True\n",
" },\n",
" \"metrics\": [\n",
" {\"type\": \"my_metric\", \"vocab_size\": tokenizer.vocab_size}\n",
" 'metrics': [\n",
" {'type': 'my_metric', 'vocab_size': tokenizer.vocab_size}\n",
" ]\n",
" }\n",
"})"
@@ -1778,16 +1780,16 @@
}
],
"source": [
"tb_dir = os.path.join(WORK_DIR, \"tensorboard_output\")\n",
"tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')\n",
"fname = os.listdir(tb_dir)[0]\n",
"tb_path = os.path.join(tb_dir, fname)\n",
"#\n",
"data = read_tensorboard_file(tb_path)\n",
"print(data.keys())\n",
"_ = plot_image(data, \"loss\", 0.9)\n",
"_ = plot_image(data, \"lr\", 0)\n",
"_ = plot_image(data, \"evaluation/acc\", 0)\n",
"_ = plot_image(data, \"evaluation/loss\", 0)"
"_ = plot_image(data, 'loss', 0.9)\n",
"_ = plot_image(data, 'lr', 0)\n",
"_ = plot_image(data, 'evaluation/acc', 0)\n",
"_ = plot_image(data, 'evaluation/loss', 0)"
]
},
{

View File

@@ -17,15 +17,6 @@
"The following code is copied from baichuan_infer.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# !pip install transformers"
]
},
{
"cell_type": "code",
"execution_count": 2,
@@ -63,7 +54,7 @@
"source": [
"from _common import *\n",
"from transformers import TextStreamer\n",
"device_ids = list(range(min(4, torch.cuda.device_count())))\n",
"device_ids = [0, 1]\n",
"logger.info(device_ids)\n",
"select_device(device_ids)"
]
@@ -149,8 +140,8 @@
}
],
"source": [
"CKPT_FAPTH = \"/home/hackathon/my_git/agent/runs/chatglm2/v1-20230702-203505/output_best/pytorch_model.bin\"\n",
"LORA_TARGET_MODULES = [\"query_key_value\"]\n",
"CKPT_FAPTH = '/home/hackathon/my_git/agent/runs/chatglm2/v1-20230702-203505/output_best/pytorch_model.bin'\n",
"LORA_TARGET_MODULES = ['query_key_value']\n",
"\n",
"model, tokenizer = get_chatglm2_model_tokenizer()\n",
"if tokenizer.eos_token_id is None:\n",
@@ -230,7 +221,7 @@
" lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P,\n",
" pretrained_weights=CKPT_FAPTH)\n",
"logger.info(f\"lora_config: {lora_config}\")\n",
"logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)"
]
},
@@ -295,8 +286,8 @@
}
],
"source": [
"test_dataset = make_dataset(\"validation\", lambda system, user, assistant:\n",
" {\"system\": system, \"user\": user, \"assistant\": assistant})"
"test_dataset = make_dataset('validation', lambda system, user, assistant:\n",
" {'system': system, 'user': user, 'assistant': assistant})"
]
},
{
@@ -484,20 +475,21 @@
"source": [
"streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"for d in test_dataset[:5]:\n",
" system = d[\"system\"]\n",
" user = d[\"user\"]\n",
" assistant = d[\"assistant\"]\n",
" input_ids = tokenize_function(system, user, None, tokenizer)[\"input_ids\"]\n",
" print(f\"[TEST]{tokenizer.decode(input_ids)}\", end=\"\")\n",
" system = d['system']\n",
" user = d['user']\n",
" assistant = d['assistant']\n",
" input_ids = tokenize_function(system, user, None, tokenizer)['input_ids']\n",
" print(f'[TEST]{tokenizer.decode(input_ids)}', end='')\n",
" input_ids = torch.tensor(input_ids)[None].cuda()\n",
" attention_mask = torch.ones_like(input_ids)\n",
" generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n",
" attention_mask=attention_mask,\n",
" streamer=streamer, pad_token_id=tokenizer.pad_token_id)\n",
" streamer=streamer, pad_token_id=tokenizer.pad_token_id, \n",
" temperature=0.7, top_k=50, do_sample=True)\n",
" print()\n",
" print(f\"[LABELS]{assistant}\")\n",
" print(\"-----------------------------------------------------------------------------------\")\n",
" # input(\"next[ENTER]\")"
" print(f'[LABELS]{assistant}')\n",
" print('-----------------------------------------------------------------------------------')\n",
" # input('next[ENTER]')"
]
}
],
@@ -517,7 +509,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
"version": "3.10.12"
},
"orig_nbformat": 4
},

View File

@@ -43,10 +43,12 @@
"# !pip install modelscope -U\n",
"# !pip install numpy pandas matplotlib scikit-learn\n",
"# !pip install transformers datasets\n",
"# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n",
"# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
"# !pip install tqdm\n",
"# !pip install tensorboard\n",
"# !pip install torchmetrics\n",
"# !pip install sentencepiece\n",
"# !pip install accelerate\n",
"#\n",
"# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy"
]
@@ -78,7 +80,7 @@
],
"source": [
"from _common import *\n",
"device_ids = list(range(min(4, torch.cuda.device_count())))\n",
"device_ids = [0, 1, 2, 3]\n",
"logger.info(device_ids)\n",
"select_device(device_ids)\n",
"_ = seed_everything(42)"
@@ -134,12 +136,11 @@
}
],
"source": [
"model_id = \"ZhipuAI/chatglm2-6b\"\n",
"model_revision = \"v1.0.3\"\n",
"WORK_DIR = \"runs/chatglm2\"\n",
"LORA_TARGET_MODULES = [\"query_key_value\"]\n",
"model_id = 'ZhipuAI/chatglm2-6b'\n",
"WORK_DIR = 'runs/chatglm2'\n",
"LORA_TARGET_MODULES = ['query_key_value']\n",
"#\n",
"model_dir = get_model_dir(model_id, model_revision)\n",
"model_dir = get_model_dir(model_id, None)\n",
"model, tokenizer = get_chatglm2_model_tokenizer(model_dir)\n",
"# chatglm2 does not support gradient_checkpointing\n",
"GRADIENT_CHECKPOINTING = False\n",
@@ -152,8 +153,8 @@
"if tokenizer.bos_token_id is None:\n",
" tokenizer.bos_token_id = 1\n",
"#\n",
"logger.info(f\"bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, \"\n",
" f\"pad_token_id: {tokenizer.pad_token_id}\")"
"logger.info(f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, '\n",
" f'pad_token_id: {tokenizer.pad_token_id}')"
]
},
{
@@ -251,13 +252,13 @@
" rank=LORA_RANK,\n",
" lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P)\n",
"logger.info(f\"lora_config: {lora_config}\")\n",
"logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)\n",
"#\n",
"show_freeze_layers(model)\n",
"print_model_info(model)\n",
"_p = list(model.parameters())[100]\n",
"logger.info(f\"device: {_p.device}, dtype: {_p.dtype}\")\n",
"logger.info(f'device: {_p.device}, dtype: {_p.dtype}')\n",
"model.bfloat16()"
]
},
@@ -399,8 +400,8 @@
],
"source": [
"tokenize_function = partial(tokenize_function, tokenizer=tokenizer)\n",
"train_dataset = make_dataset(\"train\", tokenize_function)\n",
"val_dataset = make_dataset(\"validation\", tokenize_function)\n",
"train_dataset = make_dataset('train', tokenize_function)\n",
"val_dataset = make_dataset('validation', tokenize_function)\n",
"# Data analysis\n",
"stat_dataset(train_dataset)\n",
"stat_dataset(val_dataset)\n",
@@ -431,7 +432,7 @@
}
],
"source": [
"cfg_file = os.path.join(model_dir, \"configuration.json\")\n",
"cfg_file = os.path.join(model_dir, 'configuration.json')\n",
"#\n",
"BATCH_SIZE = 1\n",
"MAX_EPOCHS = 1\n",
@@ -439,62 +440,62 @@
"WORK_DIR = get_work_dir(WORK_DIR)\n",
"EVAL_INTERVAL = 200\n",
"CONFIG = Config({\n",
" \"train\": {\n",
" \"dataloader\": {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n",
" \"shuffle\": True,\n",
" \"drop_last\": True,\n",
" \"pin_memory\": True\n",
" 'train': {\n",
" 'dataloader': {\n",
" 'batch_size_per_gpu': BATCH_SIZE,\n",
" 'workers_per_gpu': 1,\n",
" 'shuffle': True,\n",
" 'drop_last': True,\n",
" 'pin_memory': True\n",
" },\n",
" \"max_epochs\": MAX_EPOCHS,\n",
" \"work_dir\": WORK_DIR,\n",
" \"optimizer\": {\n",
" \"type\": \"AdamW\",\n",
" \"lr\": 1e-4,\n",
" \"weight_decay\": 0.01,\n",
" \"options\": {\n",
" \"cumulative_iters\": 16, \"grad_clip\": {\n",
" \"norm_type\": 2,\n",
" \"max_norm\": 2.0\n",
" 'max_epochs': MAX_EPOCHS,\n",
" 'work_dir': WORK_DIR,\n",
" 'optimizer': {\n",
" 'type': 'AdamW',\n",
" 'lr': 1e-4,\n",
" 'weight_decay': 0.01,\n",
" 'options': {\n",
" 'cumulative_iters': 16, 'grad_clip': {\n",
" 'norm_type': 2,\n",
" 'max_norm': 2.0\n",
" }\n",
" }\n",
" },\n",
" \"lr_scheduler\": {\n",
" \"type\": \"CosineAnnealingLR\",\n",
" \"T_max\": T_max,\n",
" \"eta_min\": 1e-5,\n",
" \"options\": {\n",
" \"by_epoch\": False,\n",
" \"warmup\": {\n",
" 'lr_scheduler': {\n",
" 'type': 'CosineAnnealingLR',\n",
" 'T_max': T_max,\n",
" 'eta_min': 1e-5,\n",
" 'options': {\n",
" 'by_epoch': False,\n",
" 'warmup': {\n",
" 'type': 'LinearWarmup',\n",
" 'warmup_ratio': 0.1,\n",
" \"warmup_iters\": 200\n",
" 'warmup_iters': 200\n",
" }\n",
" }\n",
" },\n",
" \"hooks\": [\n",
" {\"type\": \"CheckpointHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL, \"max_checkpoint_num\": 1},\n",
" {\"type\": \"EvaluationHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL},\n",
" {\"type\": \"BestCkptSaverHook\",\n",
" \"metric_key\": \"acc\",\n",
" \"save_best\": True, \"rule\": \"max\", \"max_checkpoint_num\": 1},\n",
" {\"type\": \"TextLoggerHook\",\n",
" \"by_epoch\": True, # Whether EpochBasedTrainer is used\n",
" \"interval\": 5},\n",
" {\"type\": \"TensorboardHook\", \"by_epoch\": False, \"interval\": 5}\n",
" 'hooks': [\n",
" {'type': 'CheckpointHook', 'by_epoch': False, 'interval': EVAL_INTERVAL, 'max_checkpoint_num': 1},\n",
" {'type': 'EvaluationHook', 'by_epoch': False, 'interval': EVAL_INTERVAL},\n",
" {'type': 'BestCkptSaverHook',\n",
" 'metric_key': 'acc',\n",
" 'save_best': True, 'rule': 'max', 'max_checkpoint_num': 1},\n",
" {'type': 'TextLoggerHook',\n",
" 'by_epoch': True, # Whether EpochBasedTrainer is used\n",
" 'interval': 5},\n",
" {'type': 'TensorboardHook', 'by_epoch': False, 'interval': 5}\n",
" ]\n",
" },\n",
" \"evaluation\": {\n",
" \"dataloader\": {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n",
" \"shuffle\": False,\n",
" \"drop_last\": False,\n",
" \"pin_memory\": True\n",
" 'evaluation': {\n",
" 'dataloader': {\n",
" 'batch_size_per_gpu': BATCH_SIZE,\n",
" 'workers_per_gpu': 1,\n",
" 'shuffle': False,\n",
" 'drop_last': False,\n",
" 'pin_memory': True\n",
" },\n",
" \"metrics\": [\n",
" {\"type\": \"my_metric\", \"vocab_size\": tokenizer.vocab_size}\n",
" 'metrics': [\n",
" {'type': 'my_metric', 'vocab_size': tokenizer.vocab_size}\n",
" ]\n",
" }\n",
"})"
@@ -1884,16 +1885,16 @@
}
],
"source": [
"tb_dir = os.path.join(WORK_DIR, \"tensorboard_output\")\n",
"tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')\n",
"fname = os.listdir(tb_dir)[0]\n",
"tb_path = os.path.join(tb_dir, fname)\n",
"#\n",
"data = read_tensorboard_file(tb_path)\n",
"print(data.keys())\n",
"_ = plot_image(data, \"loss\", 0.9)\n",
"_ = plot_image(data, \"lr\", 0)\n",
"_ = plot_image(data, \"evaluation/acc\", 0)\n",
"_ = plot_image(data, \"evaluation/loss\", 0)"
"_ = plot_image(data, 'loss', 0.9)\n",
"_ = plot_image(data, 'lr', 0)\n",
"_ = plot_image(data, 'evaluation/acc', 0)\n",
"_ = plot_image(data, 'evaluation/loss', 0)"
]
},
{

View File

@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union
import torch
import torch.nn.functional as F
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
from packaging import version
from transformers import CLIPTextModel, CLIPTokenizer
from modelscope.metainfo import Models
@@ -34,6 +35,7 @@ class StableDiffusion(TorchModel):
"""
super().__init__(model_dir, *args, **kwargs)
revision = kwargs.pop('revision', None)
xformers_enable = kwargs.pop('xformers_enable', False)
self.lora_tune = kwargs.pop('lora_tune', False)
self.dreambooth_tune = kwargs.pop('dreambooth_tune', False)
@@ -66,6 +68,18 @@ class StableDiffusion(TorchModel):
self.unet.requires_grad_(False)
self.unet = self.unet.to(self.device)
# xformers accelerate memory efficient attention
if xformers_enable:
import xformers
xformers_version = version.parse(xformers.__version__)
if xformers_version == version.parse('0.0.16'):
logger.warn(
'xFormers 0.0.16 cannot be used for training in some GPUs. '
'If you observe problems during training, please update xFormers to at least 0.0.17.'
)
self.unet.enable_xformers_memory_efficient_attention()
def tokenize_caption(self, captions):
""" Convert caption text to token data.

View File

@@ -223,11 +223,23 @@ class CsvDatasetBuilder(csv.Csv):
if field_name.endswith(':FILE'):
transform_fields.append(field_name)
base_extracted_dir = self.split_path_dict.get(split_name, '')
base_extracted_dir: Union[str, list] = self.split_path_dict.get(
split_name, '')
for field_name in transform_fields:
if base_extracted_dir:
if isinstance(base_extracted_dir,
list) and len(base_extracted_dir) > 0:
if df.shape[0] != len(base_extracted_dir):
logger.error(
f"Number of lines in meta-csv file for split '{split_name}' ({df.shape[0]}) "
f'does not match number of data-files({len(base_extracted_dir)})!'
)
else:
df[field_name] = base_extracted_dir
elif isinstance(base_extracted_dir, str) and base_extracted_dir:
df[field_name] = df[field_name].apply(
lambda x: os.path.join(base_extracted_dir, x))
else:
logger.warning(f'Nothing to do for field {field_name}')
pa_data = pa.Table.from_pandas(df)
return Dataset(arrow_table=pa_data)

View File

@@ -93,7 +93,7 @@ class TimestampPipeline(Pipeline):
def __call__(self,
audio_in: Union[str, bytes],
text_in: str = None,
text_in: str,
audio_fs: int = None,
recog_type: str = None,
audio_format: str = None,

View File

@@ -15,7 +15,7 @@ class DiffusersPipeline(Pipeline):
"""
use `model` to create a diffusers pipeline
Args:
model: model id on modelscope hub.
model: model id on modelscope hub or local dir.
device: str = 'gpu'
"""

View File

@@ -48,6 +48,60 @@ class StableDiffusionPipeline(DiffusersPipeline):
def forward(self, inputs: Dict[str, Any],
**forward_params) -> Dict[str, Any]:
"""
Inputs Args:
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
instead.
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will be generated by sampling using the supplied random `generator`.
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generate image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
"""
if not isinstance(inputs, dict):
raise ValueError(
f'Expected the input to be a dictionary, but got {type(input)}'
@@ -57,7 +111,20 @@ class StableDiffusionPipeline(DiffusersPipeline):
raise ValueError('input should contain "text", but not found')
images = self.pipeline(
inputs['text'], num_inference_steps=30, guidance_scale=7.5)
prompt=inputs.get('text'),
height=inputs.get('height'),
width=inputs.get('width'),
num_inference_steps=inputs.get('num_inference_steps', 50),
guidance_scale=inputs.get('guidance_scale', 7.5),
negative_prompt=inputs.get('negative_prompt'),
num_images_per_prompt=inputs.get('num_images_per_prompt', 1),
eta=inputs.get('eta', 0.0),
generator=inputs.get('generator'),
latents=inputs.get('latents'),
output_type=inputs.get('output_type', 'pil'),
return_dict=inputs.get('return_dict', True),
callback=inputs.get('callback'),
callback_steps=inputs.get('callback_steps', 1))
return images

View File

@@ -168,3 +168,9 @@ TAMING_IMPORT_ERROR = """
{0} requires the timm library but it was not found in your environment. You can install it with pip:
`pip install taming-transformers-rom1504`
"""
# docstyle-ignore
XFORMERS_IMPORT_ERROR = """
{0} requires the xformers library but it was not found in your environment. You can install it with pip:
`pip install xformers>=0.0.17`
"""

View File

@@ -306,6 +306,7 @@ REQUIREMENTS_MAAPING = OrderedDict([
('mpi4py', (is_package_available('mpi4py'), MPI4PY_IMPORT_ERROR)),
('open_clip', (is_package_available('open_clip'), OPENCLIP_IMPORT_ERROR)),
('taming', (is_package_available('taming'), TAMING_IMPORT_ERROR)),
('xformers', (is_package_available('xformers'), XFORMERS_IMPORT_ERROR)),
])
SYSTEM_PACKAGE = set(['os', 'sys', 'typing'])

View File

@@ -1,5 +1,5 @@
# Make sure to modify __release_datetime__ to release time when making official release.
__version__ = '1.7.0'
__version__ = '1.7.1'
# default release datetime for branches under active development is set
# to be a time far-far-away-into-the-future
__release_datetime__ = '2099-10-13 08:56:12'

View File

@@ -21,7 +21,7 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
[flake8]
max-line-length = 120
select = B,C,E,F,P,T4,W,B9
ignore = F401,F405,F821,W503,E251
ignore = F401,F403,F405,F821,W503,E251
exclude = docs/src,*.pyi,.git
[darglint]