Merge branch 'master-github' into release/1.7

wenmeng.zwm committed 2023-07-19 17:50:59 +08:00
44 changed files with 3142 additions and 368 deletions

View File

@@ -1,15 +1,19 @@
import os
from modelscope.metainfo import Trainers
-from modelscope.msdatasets.audio.asr_dataset import ASRDataset
+from modelscope.msdatasets.dataset_cls.custom_datasets import ASRDataset
from modelscope.trainers import build_trainer
+from modelscope.utils.constant import DownloadMode
def modelscope_finetune(params):
    if not os.path.exists(params.output_dir):
        os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
-    ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
+    ds_dict = ASRDataset.load(
+        params.data_path,
+        namespace='speech_asr',
+        download_mode=params.download_mode)
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
@@ -36,5 +40,6 @@ if __name__ == '__main__':
    # if dataset_type == "large", batch_bins is measured in milliseconds
    params.max_epoch = 50  # maximum number of training epochs
    params.lr = 0.00005  # learning rate
+    params.download_mode = DownloadMode.FORCE_REDOWNLOAD  # force a re-download; otherwise keep the default DownloadMode.REUSE_DATASET_IF_EXISTS
    modelscope_finetune(params)
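For context, the new download_mode argument controls whether a cached copy of the dataset is reused or fetched again. A minimal sketch, assuming only the ASRDataset and DownloadMode APIs visible in this diff (the dataset name below is a placeholder, not a real dataset id):

# Sketch only: reuse a cached copy if present; pass FORCE_REDOWNLOAD to refresh it.
from modelscope.msdatasets.dataset_cls.custom_datasets import ASRDataset
from modelscope.utils.constant import DownloadMode

ds_dict = ASRDataset.load(
    'your_asr_dataset_name',  # placeholder for illustration
    namespace='speech_asr',
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)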

View File

@@ -8,7 +8,6 @@ from text_generation_metric import TextGenerationMetric
from transformers import DataCollatorForSeq2Seq
from modelscope import snapshot_download
-from modelscope.metainfo import Models
from modelscope.models import Model
from modelscope.msdatasets import MsDataset
from modelscope.swift import Swift
@@ -143,6 +142,14 @@ class Chatglm6bArguments(TrainingArgs):
        metadata={'help': 'The lora alpha'},
    )
+    use_amp: int = field(
+        default=0,
+        metadata={
+            'help':
+            'Whether to use amp(automatic mixed precision) to train the model.'
+        },
+    )
args = Chatglm6bArguments(eval_metrics='chatglm').parse_cli()
print(args)
@@ -160,6 +167,13 @@ def cfg_modify_fn(cfg):
        cfg.merge_from_dict(config)
    else:
        cfg = config
+    if args.use_amp:
+        if not getattr(cfg.train, 'hooks', None):
+            cfg.train.hooks = []
+        cfg.train.hooks.append({
+            'type': 'TorchAMPOptimizerHook',
+            # Optional loss_scale parameter here.
+        })
    if cfg.train.lr_scheduler.type == 'LinearLR':
        cfg.train.lr_scheduler['total_iters'] = \
            int(len(train_dataset) / cfg.train.dataloader.batch_size_per_gpu) * cfg.train.max_epochs
@@ -187,15 +201,13 @@ model_config['model'] = ConfigDict({
    'type': config['model']['type'],
})
-if config['model']['type'] == 'chatglm6b':
-    model_config['model']['pre_seq_len'] = args.pre_seq_len
-    model_config['model']['prefix_projection'] = args.prefix_projection
+model_config['model']['pre_seq_len'] = args.pre_seq_len
+model_config['model']['prefix_projection'] = args.prefix_projection
tokenizer = ChatGLMTokenizer.from_pretrained(model_dir, trust_remote_code=True)
device_map_kwargs = {}
device_kwargs = {}
-if args.use_lora != 0:
+if args.use_lora != 0 and torch.cuda.device_count() > 1:
    device_map_kwargs['device_map'] = 'auto'
    # No placement for model, leave the model to `device_map`
    device_kwargs['device'] = 'cpu'
@@ -231,7 +243,10 @@ if args.use_lora != 0:
        rank=args.lora_rank,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout)
-    model = model.bfloat16()
+    if args.use_amp:
+        model = model.float()
+    else:
+        model = model.bfloat16()
    Swift.prepare_model(model, lora_config)
prefix = args.source_prefix if args.source_prefix is not None else ''
@@ -334,13 +349,10 @@ def preprocess_function_train(examples):
        pad_len = max_seq_length - len(input_ids)
        input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
-        if config['model']['type'] == 'chatglm6b':
-            labels = labels + [tokenizer.pad_token_id] * pad_len
-            if args.ignore_pad_token_for_loss:
-                labels = [(lb if lb != tokenizer.pad_token_id else -100)
-                          for lb in labels]
-        else:
-            labels = labels + [-100] * pad_len
+        labels = labels + [tokenizer.pad_token_id] * pad_len
+        if args.ignore_pad_token_for_loss:
+            labels = [(lb if lb != tokenizer.pad_token_id else -100)
+                      for lb in labels]
        model_inputs['input_ids'].append(input_ids)
        model_inputs['labels'].append(labels)
@@ -372,8 +384,7 @@ data_collator = DataCollatorForSeq2Seq(
    padding=False)
model.gradient_checkpointing_enable()
-if config['model']['type'] == 'chatglm6b':
-    model.enable_input_require_grads()
+model.enable_input_require_grads()
# import torch
# model = torch.nn.DataParallel(model).cuda()
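Read together, the use_amp changes in this file follow one pattern: register the AMP optimizer hook and keep the model weights in fp32 so autocast can downcast where safe. A minimal sketch, assuming the TorchAMPOptimizerHook name shown in this diff and a ModelScope-style cfg object (the helper function itself is hypothetical, not part of the commit):

# Sketch of the use_amp wiring introduced above.
def apply_amp_setting(cfg, model, use_amp: bool):
    if use_amp:
        if not getattr(cfg.train, 'hooks', None):
            cfg.train.hooks = []
        # The hook drives mixed-precision optimizer updates; an optional loss_scale could be added here.
        cfg.train.hooks.append({'type': 'TorchAMPOptimizerHook'})
        model = model.float()      # fp32 master weights under AMP
    else:
        model = model.bfloat16()   # original bf16 path
    return cfg, model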

View File

@@ -0,0 +1,26 @@
PRE_SEQ_LEN=128
LR=2e-2
PYTHONPATH=. python examples/pytorch/chatglm6b/finetune.py \
--train_dataset_name AdvertiseGen/train.json \
--val_dataset_name AdvertiseGen/dev.json \
--prompt_column content \
--response_column summary \
--model "ZhipuAI/chatglm2-6b" \
--max_source_length 64 \
--max_target_length 128 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 1 \
--train.optimizer.options.cumulative_iters 1 \
--max_epochs 1 \
--save_strategy 'by_step' \
--save_interval 1000 \
--lr $LR \
--eval_strategy "by_step" \
--eval_interval 1000 \
--lr_strategy 'by_step' \
--task 'chat' \
--model.type 'chatglm2-6b' \
--pre_seq_len $PRE_SEQ_LEN \
--quantization_bit 4 \
--work_dir ptuning_adv_target \
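The dotted flags above (for example --train.optimizer.options.cumulative_iters) address nested keys of the trainer configuration. A sketch of the correspondence, based on the config layout used elsewhere in this commit:

# Sketch only: --train.optimizer.options.cumulative_iters 1 targets this nested entry.
config = {
    'train': {
        'optimizer': {
            'options': {
                'cumulative_iters': 1  # gradient accumulation steps
            }
        }
    }
}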

View File

@@ -0,0 +1,466 @@
import ast
import datetime as dt
import math
import os
import random
import re
import sys
from dataclasses import dataclass, field
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import json
import matplotlib.pyplot as plt
import numpy as np
#
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset as HfDataset
from datasets import concatenate_datasets
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from numpy import ndarray
from tensorboard.backend.event_processing.event_accumulator import \
EventAccumulator
from torch import Tensor
from torch import device as Device
from torch import dtype as Dtype
from torch.nn import Module
from torch.nn.parameter import Parameter
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Optimizer
from torch.optim import lr_scheduler as lrs
from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
from torch.utils.data import Dataset
#
from torchmetrics import Accuracy, MeanMetric
#
from tqdm import tqdm
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
GenerationConfig, HfArgumentParser, TextStreamer)
#
from modelscope import (Model, MsDataset, get_logger, read_config,
snapshot_download)
from modelscope.metrics.base import Metric
from modelscope.metrics.builder import METRICS
from modelscope.models.nlp.chatglm2 import ChatGLM2Tokenizer
from modelscope.msdatasets.dataset_cls.custom_datasets import \
TorchCustomDataset
from modelscope.swift import LoRAConfig, Swift
from modelscope.trainers import EpochBasedTrainer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.registry import default_group
#
COLOR, COLOR_S = '#FFE2D9', '#FF7043'
PROMPT = """Human: {instruction}
AI: """
logger = get_logger()
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
#
def _get_version(work_dir: str) -> int:
if os.path.isdir(work_dir):
fnames = os.listdir(work_dir)
else:
fnames = []
v_list = [-1]
for fname in fnames:
m = re.match(r'v(\d+)', fname)
if m is None:
continue
v = m.group(1)
v_list.append(int(v))
return max(v_list) + 1
def get_work_dir(work_dir: str) -> str:
"""add version"""
work_dir = os.path.abspath(work_dir)
version = _get_version(work_dir)
time = dt.datetime.now().strftime('%Y%m%d-%H%M%S')
#
work_dir = os.path.join(work_dir, f'v{version}-{time}')
logger.info(f'work_dir: {work_dir}')
return work_dir
def _format_device(device: Union[List[int], str]) -> Tuple[List[int], str]:
if isinstance(device, list):
device_ids = device
device_str = ','.join([str(d) for d in device])
else:
device_ids = [int(d) for d in device.split(',') if d != '-1']
device_str = device
device_str = device_str.replace(' ', '')
return device_ids, device_str
def select_device(device: Union[List[int], str]) -> Device:
"""Call this function before cuda is initialized.
device: e.g. []: 'cpu', [0], [0, 1, 2]
e.g. '-1': 'cpu', '0', '0,1,2'
"""
if torch.cuda.is_initialized():
logger.warning('CUDA has been initialized! Device selection fails!')
return torch.device('cuda:0')
#
device_ids, device_str = _format_device(device)
#
os.environ['CUDA_VISIBLE_DEVICES'] = device_str
log_s = 'Using device: '
if len(device_ids) == 0:
master_device: str = 'cpu'
log_s += 'cpu'
else:
assert torch.cuda.is_available(
) and torch.cuda.device_count() >= len(device_ids)
master_device = 'cuda:0'
log_s += f'cuda:{device_str}'
logger.info(log_s)
return torch.device(master_device)
def seed_everything(seed: Optional[int] = None, gpu_dtm: bool = False) -> int:
if seed is None:
seed_max = np.iinfo(np.int32).max
seed = random.randint(0, seed_max)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
logger.info(f'Global seed set to {seed}')
if gpu_dtm:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
logger.info(f'Setting deterministic: {True}, benchmark: {False}')
return seed
def get_T_max(dataset_len: int, batch_size: int, max_epochs: int,
drop_last: bool) -> int:
"""Calculate T_max in CosineAnnealingLR"""
if drop_last:
T_max = dataset_len // batch_size
else:
T_max = math.ceil(dataset_len / batch_size)
T_max *= max_epochs
return T_max
def tokenize_function(example: Dict[str, Optional[str]],
tokenizer,
max_length: Optional[int] = 2048) -> Dict[str, Any]:
"""Only applicable to baichuan and chatglm2. Other models need to be tested"""
instruction: str = example['instruction']
input_ = example['input']
if input_ is not None and input_ != '':
# instruction = instruction + '\n'
if input_.startswith('输入:'):
instruction = instruction + input_[3:]
else:
instruction = instruction + input_
output = example['output']
src_text = PROMPT.format(instruction=instruction)
src_input_ids: List[int] = tokenizer(
src_text, return_attention_mask=False,
add_special_tokens=True)['input_ids']
#
tgt_input_ids = []
if output is not None:
tgt_input_ids += tokenizer(
output, return_attention_mask=False,
add_special_tokens=False)['input_ids']
tgt_input_ids += [tokenizer.eos_token_id]
labels = [-100] * len(src_input_ids) + tgt_input_ids
else:
labels = None
input_ids = src_input_ids + tgt_input_ids
#
if max_length is not None:
input_ids = input_ids[-max_length:]
if labels is not None:
labels = labels[-max_length:]
#
return {'input_ids': input_ids, 'labels': labels}
def stat_dataset(dataset: HfDataset) -> None:
"""Statistical analysis was performed on the data set"""
_token_len = []
for d in dataset:
_token_len.append(len(d['input_ids']))
_token_len = np.array(_token_len)
mean = _token_len.mean().item()
std = _token_len.std().item()
min_ = _token_len.min().item()
max_ = _token_len.max().item()
logger.info(
f'Dataset Token Length: {mean:.6f}±{std:.6f}, min={min_:.6f}, max={max_:.6f}, size={_token_len.shape[0]}'
)
def print_example(example: Dict[str, Any], tokenizer) -> None:
input_ids, labels = example['input_ids'], example['labels']
print(f'[INPUT_IDS] {input_ids}')
print(f'[INPUT] {tokenizer.decode(input_ids)}')
print()
print(f'[LABELS_IDS] {labels}')
print(
f'[LABELS] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}'
)
def data_collate_fn(batch: List[Dict[str, Any]], tokenizer) -> Dict[str, Any]:
input_ids = [torch.tensor(b['input_ids']) for b in batch]
labels = [torch.tensor(b['labels']) for b in batch]
attention_mask = [
torch.ones(len(input_ids[i]), dtype=torch.int64)
for i in range(len(input_ids))
]
#
input_ids = pad_sequence(
input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
attention_mask = pad_sequence(
attention_mask, batch_first=True, padding_value=0)
labels = pad_sequence(labels, batch_first=True, padding_value=-100)
return {
'input_ids': input_ids,
'attention_mask': attention_mask,
'labels': labels
}
def print_model_info(model: Module, name: Optional[str] = None) -> None:
if name is None:
name = model.__class__.__name__
#
n_params = sum(p.numel() for p in model.parameters())
n_grads = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_buffers = sum(p.numel() for p in model.buffers())
#
n_params /= 1e6
n_grads /= 1e6
n_buffers /= 1e6
s = [
f'{name}: ',
f'{n_params:.4f}M Params ({n_grads:.4f}M Trainable), ',
f'{n_buffers:.4f}M Buffers',
]
s += '.'
logger.info(''.join(s))
def show_freeze_layers(model: Module, max_lines: int = 20) -> None:
named_p = list(model.named_parameters())
for i, (n, p) in enumerate(named_p):
if i >= max_lines:
logger.info('...')
break
logger.info(f'{n}: requires_grad={p.requires_grad}')
@METRICS.register_module(group_key=default_group, module_name='my_metric')
class MyMetric(Metric):
def __init__(self, vocab_size: int):
self.acc = Accuracy('multiclass', num_classes=vocab_size)
self.loss = MeanMetric()
def add(self, outputs: Dict[str, Any], inputs: Dict[str, Any]) -> None:
loss: Tensor = outputs.loss
self.loss.update(loss)
#
labels: Tensor = inputs['labels']
labels = labels[:, 1:]
labels_mask = labels != -100
logits: Tensor = outputs.logits[:, :-1]
logits = logits[labels_mask].contiguous().view(-1, logits.shape[-1])
pred = logits.argmax(dim=-1)
labels = labels[labels_mask].to(logits.device)
self.acc.update(pred, labels)
def evaluate(self):
return {
'acc': self.acc.compute().item(),
'loss': self.loss.compute().item()
}
def merge(self, other: 'MyMetric') -> None:
"""This script does not support ddp. TODO"""
raise NotImplementedError
def _add_special_token(tokenizer):
if tokenizer.eos_token_id is None:
tokenizer.eos_token_id = 2
if tokenizer.bos_token_id is None:
tokenizer.bos_token_id = 1
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = 0
logger.info(f'bos_token_id: {tokenizer.bos_token_id}, '
f'eos_token_id: {tokenizer.eos_token_id}, '
f'pad_token_id: {tokenizer.pad_token_id}')
def get_baichuan_model_tokenizer(model_dir: str,
load_model: bool = True,
add_special_token: bool = True):
sys.path.insert(0, model_dir)
model_config = AutoConfig.from_pretrained(
model_dir, trust_remote_code=True)
model_config.torch_dtype = torch.float16
logger.info(f'model_config: {model_config}')
tokenizer = AutoTokenizer.from_pretrained(
model_dir, trust_remote_code=True)
model = None
if load_model:
model = AutoModelForCausalLM.from_pretrained(
model_dir,
config=model_config,
device_map='auto',
torch_dtype=torch.float16,
trust_remote_code=True)
#
if add_special_token:
_add_special_token(tokenizer)
return model, tokenizer
def get_chatglm2_model_tokenizer(model_dir: str,
load_model: bool = True,
add_special_token: bool = True):
config = read_config(model_dir)
config['model'] = ConfigDict({'type': 'chatglm2-6b'})
tokenizer = ChatGLM2Tokenizer.from_pretrained(model_dir)
model = None
if load_model:
model = Model.from_pretrained(
model_dir,
cfg_dict=config,
device_map='auto',
torch_dtype=torch.float16)
if add_special_token:
_add_special_token(tokenizer)
return model, tokenizer
def get_llama2_model_tokenizer(model_dir: str,
load_model: bool = True,
add_special_token: bool = True):
config = AutoConfig.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = None
if load_model:
model = AutoModelForCausalLM.from_pretrained(
model_dir,
config=config,
device_map='auto',
torch_dtype=torch.float16,
)
if add_special_token:
_add_special_token(tokenizer)
return model, tokenizer
def get_alpaca_en_zh_dataset(
tokenize_function,
only_val: bool = False,
test_split_p: float = 0.01,
split_seed: int = 42,
data_sample: Optional[int] = None) -> Tuple[HfDataset, HfDataset]:
"""
split: Literal['train', 'validation', None]
"""
dataset_en: HfDataset = MsDataset.load(
'AI-ModelScope/alpaca-gpt4-data-en', split='train').to_hf_dataset()
dataset_zh: HfDataset = MsDataset.load(
'AI-ModelScope/alpaca-gpt4-data-zh', split='train').to_hf_dataset()
dataset_en = dataset_en.remove_columns(['text'])
dataset: HfDataset = concatenate_datasets([dataset_zh, dataset_en])
#
if data_sample is not None:
dataset = dataset.select(range(data_sample))
dataset = dataset.train_test_split(test_split_p, seed=split_seed)
if only_val:
dataset = dataset['test']
if tokenize_function is not None:
dataset = dataset.map(tokenize_function)
dataset = dataset.remove_columns(['instruction', 'input', 'output'])
#
if only_val:
return None, dataset
else:
return dataset['train'], dataset['test']
Item = Dict[str, float]
def read_tensorboard_file(fpath: str) -> Dict[str, List[Item]]:
if not os.path.isfile(fpath):
raise FileNotFoundError(f'fpath: {fpath}')
ea = EventAccumulator(fpath)
ea.Reload()
res = {}
tags = ea.Tags()['scalars']
for tag in tags:
values = ea.Scalars(tag)
r = []
for v in values:
r.append({'step': v.step, 'value': v.value})
res[tag] = r
return res
def tensorboard_smoothing(values: List[float],
smooth: float = 0.9) -> List[float]:
norm_factor = 1
x = 0
res = []
for i in range(len(values)):
x = x * smooth + values[i] # Exponential decay
res.append(x / norm_factor)
#
norm_factor *= smooth
norm_factor += 1
return res
def plot_image(tb_dir: str,
smooth_key: List[str],
smooth_val: float = 0.9,
figsize: Tuple[int, int] = (8, 5),
dpi: int = 100) -> None:
image_dir = os.path.join(os.path.dirname(tb_dir), 'images')
os.makedirs(image_dir, exist_ok=True)
#
fname = os.listdir(tb_dir)[0]
tb_path = os.path.join(tb_dir, fname)
data = read_tensorboard_file(tb_path)
#
for k in data.keys():
_data = data[k]
steps = [d['step'] for d in _data]
values = [d['value'] for d in _data]
if len(values) == 0:
continue
_, ax = plt.subplots(1, 1, squeeze=True, figsize=figsize, dpi=dpi)
ax.set_title(k)
if len(values) == 1:
ax.scatter(steps, values, color=COLOR_S)
elif k in smooth_key:
ax.plot(steps, values, color=COLOR)
values_s = tensorboard_smoothing(values, smooth_val)
ax.plot(steps, values_s, color=COLOR_S)
else:
ax.plot(steps, values, color=COLOR_S)
fpath = os.path.join(image_dir, k.replace('/', '_'))
plt.savefig(fpath, dpi=dpi, bbox_inches='tight')
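A few quick sanity checks of the helpers defined above (values chosen purely for illustration; the snippet assumes the functions above are in scope):

# _format_device accepts either a list of ids or a comma-separated string.
assert _format_device('0, 1') == ([0, 1], '0,1')
assert _format_device('-1') == ([], '-1')

# get_T_max counts dataloader steps: 1000 samples, batch 16, 3 epochs, drop_last=True
# -> 1000 // 16 = 62 steps per epoch, so T_max = 62 * 3 = 186.
assert get_T_max(1000, 16, 3, drop_last=True) == 186

# tensorboard_smoothing is a bias-corrected exponential moving average:
# with smooth=0.5 the second value is (0.5 * 1 + 2) / (1 + 0.5) = 1.666...
print(tensorboard_smoothing([1.0, 2.0], smooth=0.5))  # [1.0, 1.666...]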

View File

@@ -0,0 +1,122 @@
# ### Setting up experimental environment.
from _common import *
@dataclass
class Arguments:
device: str = '0' # e.g. '-1'; '0'; '0,1'
model_type: str = field(
default='baichuan-7b',
metadata={
'choices':
['baichuan-7b', 'baichuan-13b', 'chatglm2', 'llama2-7b']
})
ckpt_fpath: str = '' # e.g. '/path/to/your/iter_xxx.pth'
eval_human: bool = False # False: eval test_dataset
data_sample: Optional[int] = None
#
lora_target_modules: Optional[List[str]] = None
lora_rank: int = 8
lora_alpha: int = 32
lora_dropout_p: float = 0.1
#
max_new_tokens: int = 512
temperature: float = 0.9
top_k: int = 50
top_p: float = 0.9
def __post_init__(self):
if self.lora_target_modules is None:
if self.model_type in {'baichuan-7b', 'baichuan-13b'}:
self.lora_target_modules = ['W_pack']
elif self.model_type == 'chatglm2':
self.lora_target_modules = ['query_key_value']
elif self.model_type == 'llama2-7b':
self.lora_target_modules = ['q_proj', 'k_proj', 'v_proj']
else:
raise ValueError(f'model_type: {self.model_type}')
#
if not os.path.isfile(self.ckpt_fpath):
raise ValueError('Please enter a valid fpath')
def parse_args() -> Arguments:
args, = HfArgumentParser([Arguments]).parse_args_into_dataclasses()
return args
args = parse_args()
logger.info(args)
select_device(args.device)
# ### Loading Model and Tokenizer
if args.model_type == 'baichuan-7b':
model_dir = snapshot_download('baichuan-inc/baichuan-7B', 'v1.0.5')
model, tokenizer = get_baichuan_model_tokenizer(model_dir)
elif args.model_type == 'baichuan-13b':
model_dir = snapshot_download('baichuan-inc/Baichuan-13B-Base', 'v1.0.2')
model, tokenizer = get_baichuan_model_tokenizer(model_dir)
elif args.model_type == 'chatglm2':
model_dir = snapshot_download('ZhipuAI/chatglm2-6b', 'v1.0.6')
model, tokenizer = get_chatglm2_model_tokenizer(model_dir)
elif args.model_type == 'llama2-7b':
model_dir = snapshot_download('modelscope/Llama-2-7b-ms', 'v1.0.0')
model, tokenizer = get_llama2_model_tokenizer(model_dir)
else:
raise ValueError(f'model_type: {args.model_type}')
# ### Preparing lora
lora_config = LoRAConfig(
replace_modules=args.lora_target_modules,
rank=args.lora_rank,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout_p,
pretrained_weights=args.ckpt_fpath)
logger.info(f'lora_config: {lora_config}')
Swift.prepare_model(model, lora_config)
model.bfloat16() # Consistent with training
# ### Inference
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_config = GenerationConfig(
max_new_tokens=args.max_new_tokens,
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
do_sample=True,
pad_token_id=tokenizer.eos_token_id)
logger.info(generation_config)
def inference(data: Dict[str, Optional[str]]) -> str:
input_ids = tokenize_function(data, tokenizer)['input_ids']
print(f'[TEST]{tokenizer.decode(input_ids)}', end='')
input_ids = torch.tensor(input_ids)[None].cuda()
attention_mask = torch.ones_like(input_ids)
generate_ids = model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
streamer=streamer,
generation_config=generation_config)
output_text = tokenizer.decode(generate_ids[0])
return output_text
if args.eval_human:
while True:
instruction = input('<<< ')
data = {'instruction': instruction, 'input': None, 'output': None}
inference(data)
print('-' * 80)
else:
_, test_dataset = get_alpaca_en_zh_dataset(
None, True, split_seed=42, data_sample=None)
mini_test_dataset = test_dataset.select(range(10))
for data in mini_test_dataset:
output = data['output']
data['output'] = None
inference(data)
print()
print(f'[LABELS]{output}')
print('-' * 80)
# input('next[ENTER]')

View File

@@ -0,0 +1,237 @@
# ### Setting up experimental environment.
"""
pip install modelscope
pip install numpy pandas matplotlib scikit-learn
pip install transformers datasets
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
pip install tqdm tensorboard torchmetrics sentencepiece charset_normalizer
pip install accelerate transformers_stream_generator
pip install numpy -U # Resolve torchmetrics dependencies and update numpy
"""
from _common import *
@dataclass
class Arguments:
device: str = '0,1' # e.g. '-1'; '0'; '0,1'
seed: int = 42
model_type: str = field(
default='baichuan-7b',
metadata={
'choices':
['baichuan-7b', 'baichuan-13b', 'chatglm2', 'llama2-7b']
})
data_sample: Optional[int] = None
#
lora_target_modules: Optional[List[str]] = None
lora_rank: int = 8
lora_alpha: int = 32
lora_dropout_p: float = 0.1
#
gradient_checkpoint: bool = True
batch_size: int = 1
max_epochs: int = 1
eval_interval: int = 500
learning_rate: float = 1e-4
weight_decay: float = 0.01
n_accumulate_grad: int = 16
grad_clip_norm: float = 1.
warmup_iters: int = 200
last_max_checkpoint_num: int = 1
best_max_checkpoint_num: int = 1
#
logging_interval: int = 5
tb_interval: int = 5
def __post_init__(self):
if self.lora_target_modules is None:
if self.model_type in {'baichuan-7b', 'baichuan-13b'}:
self.lora_target_modules = ['W_pack']
elif self.model_type == 'chatglm2':
self.lora_target_modules = ['query_key_value']
elif self.model_type == 'llama2-7b':
self.lora_target_modules = ['q_proj', 'k_proj', 'v_proj']
else:
raise ValueError(f'model_type: {self.model_type}')
def parse_args() -> Arguments:
args, = HfArgumentParser([Arguments]).parse_args_into_dataclasses()
return args
args = parse_args()
logger.info(args)
select_device(args.device)
seed_everything(args.seed)
# ### Loading Model and Tokenizer
if args.model_type == 'baichuan-7b':
model_dir = snapshot_download('baichuan-inc/baichuan-7B', 'v1.0.5')
model, tokenizer = get_baichuan_model_tokenizer(model_dir)
elif args.model_type == 'baichuan-13b':
model_dir = snapshot_download('baichuan-inc/Baichuan-13B-Base', 'v1.0.2')
model, tokenizer = get_baichuan_model_tokenizer(model_dir)
elif args.model_type == 'chatglm2':
model_dir = snapshot_download('ZhipuAI/chatglm2-6b', 'v1.0.6')
model, tokenizer = get_chatglm2_model_tokenizer(model_dir)
elif args.model_type == 'llama2-7b':
model_dir = snapshot_download('modelscope/Llama-2-7b-ms', 'v1.0.0')
model, tokenizer = get_llama2_model_tokenizer(model_dir)
else:
raise ValueError(f'model_type: {args.model_type}')
#
if args.gradient_checkpoint:
# baichuan13B does not implement the `get_input_embeddings` function
if args.model_type == 'baichuan-13b':
def get_input_embeddings(self):
return self.model.embed_tokens
model.__class__.get_input_embeddings = get_input_embeddings.__get__(
model)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
# ### Preparing lora
lora_config = LoRAConfig(
replace_modules=args.lora_target_modules,
rank=args.lora_rank,
lora_alpha=args.lora_alpha,
lora_dropout=args.lora_dropout_p)
logger.info(f'lora_config: {lora_config}')
Swift.prepare_model(model, lora_config)
#
show_freeze_layers(model)
print_model_info(model)
_p: Parameter = list(model.parameters())[100]
logger.info(f'device: {_p.device}, dtype: {_p.dtype}')
model.bfloat16()
# ### Loading Dataset
tokenize_function = partial(tokenize_function, tokenizer=tokenizer)
train_dataset, val_dataset = get_alpaca_en_zh_dataset(
tokenize_function, split_seed=42, data_sample=args.data_sample)
# Data analysis
stat_dataset(train_dataset)
stat_dataset(val_dataset)
data_collate_fn = partial(data_collate_fn, tokenizer=tokenizer)
print_example(train_dataset[0], tokenizer)
# ### Setting Config
cfg_file = os.path.join(model_dir, 'configuration.json')
#
T_max = get_T_max(len(train_dataset), args.batch_size, args.max_epochs, True)
work_dir = get_work_dir(f'runs/{args.model_type}')
config = Config({
'train': {
'dataloader': {
'batch_size_per_gpu': args.batch_size,
'workers_per_gpu': 1,
'shuffle': True,
'drop_last': True,
'pin_memory': True
},
'max_epochs':
args.max_epochs,
'work_dir':
work_dir,
'optimizer': {
'type': 'AdamW',
'lr': args.learning_rate,
'weight_decay': args.weight_decay,
'options': {
'cumulative_iters': args.n_accumulate_grad,
'grad_clip': {
'norm_type': 2,
'max_norm': args.grad_clip_norm
}
}
},
'lr_scheduler': {
'type': 'CosineAnnealingLR',
'T_max': T_max,
'eta_min': 0,
'options': {
'by_epoch': False,
'warmup': {
'type': 'LinearWarmup',
'warmup_ratio': 0.1,
'warmup_iters': args.warmup_iters
}
}
},
'hooks': [
{
'type': 'CheckpointHook',
'by_epoch': False,
'interval': args.eval_interval,
'max_checkpoint_num': args.last_max_checkpoint_num
},
{
'type': 'EvaluationHook',
'by_epoch': False,
'interval': args.eval_interval
},
{
'type': 'BestCkptSaverHook',
'metric_key': 'loss',
'save_best': True,
'rule': 'min',
'max_checkpoint_num': args.best_max_checkpoint_num
},
{
'type': 'TextLoggerHook',
'by_epoch': True, # Whether EpochBasedTrainer is used
'interval': args.logging_interval
},
{
'type': 'TensorboardHook',
'by_epoch': False,
'interval': args.tb_interval
}
]
},
'evaluation': {
'dataloader': {
'batch_size_per_gpu': args.batch_size,
'workers_per_gpu': 1,
'shuffle': False,
'drop_last': False,
'pin_memory': True
},
'metrics': [{
'type': 'my_metric',
'vocab_size': tokenizer.vocab_size
}]
}
})
# ### Finetuning
def cfg_modify_fn(cfg: Config) -> Config:
cfg.update(config)
return cfg
trainer = EpochBasedTrainer(
model=model,
cfg_file=cfg_file,
data_collator=data_collate_fn,
train_dataset=train_dataset,
eval_dataset=val_dataset,
remove_unused_data=True,
seed=42,
device='cpu', # No placement for model, leave the model to `device_map`
cfg_modify_fn=cfg_modify_fn,
)
trainer.train()
# ### Visualization
tb_dir = os.path.join(work_dir, 'tensorboard_output')
plot_image(tb_dir, ['loss'], 0.9)
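To give a rough sense of how the scheduler settings above interact (the dataset size here is hypothetical, not measured):

# Illustrative sizing only:
# len(train_dataset) = 40_000, batch_size = 1, max_epochs = 1, drop_last = True
# -> T_max = get_T_max(40_000, 1, 1, True) = 40_000 cosine-annealing steps,
#    with gradient accumulation every n_accumulate_grad = 16 batches and
#    a LinearWarmup over the first warmup_iters = 200 iterations.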

View File

@@ -0,0 +1,5 @@
python llm_infer.py \
--device 0 \
--model_type llama2-7b \
--ckpt_fpath "runs/llama2-7b/vx_xxx/output_best/pytorch_model.bin" \
--eval_human true

View File

@@ -0,0 +1,8 @@
#!/bin/bash
DATE=$(date +"%Y%m%d-%H%M%S")
nohup python llm_sft.py \
--device 0 \
--model_type llama2-7b \
--data_sample 25000 \
&> train_$DATE.out &

View File

@@ -49,11 +49,9 @@ from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.registry import default_group
#
-SYSTEM_TEXT = """{system}"""
-USER_TEXT = """\n\n### 用户
-{user}"""
-ASSISTANT_PROMPT = """\n\n### 助手
-"""
+PROMPT = """System: {system}
+Human: {user}
+AI: """
MAX_LENGTH = 2048
TEST_MAX_LENGTH = MAX_LENGTH
@@ -62,11 +60,6 @@ logger = get_logger()
#
-def get_model_dir(model_id: str, model_revision: Optional[str] = None) -> str:
-    model_dir = snapshot_download(model_id, model_revision)
-    return model_dir
def _get_version(work_dir: str) -> int:
    if os.path.isdir(work_dir):
        fnames = os.listdir(work_dir)
@@ -93,28 +86,40 @@ def get_work_dir(work_dir: str) -> str:
    return work_dir
-def select_device(device_ids: List[int]) -> Device:
+def _format_device(device: Union[List[int], str]) -> Tuple[List[int], str]:
+    if isinstance(device, list):
+        device_ids = device
+        device_str = ','.join([str(d) for d in device])
+    else:
+        device_ids = [int(d) for d in device.split(',') if d != '-1']
+        device_str = device
+    device_str = device_str.replace(' ', '')
+    return device_ids, device_str
+def select_device(device: Union[List[int], str]) -> Device:
    """Call this function before cuda is initialized.
-    Return: master device
+    device: e.g. []: 'cpu', [0], [0, 1, 2]
+        e.g. '-1': 'cpu', '0', '0,1,2'
    """
    if torch.cuda.is_initialized():
        logger.warning('CUDA has been initialized! Device selection fails!')
        return torch.device('cuda:0')
    #
+    device_ids, device_str = _format_device(device)
+    #
+    os.environ['CUDA_VISIBLE_DEVICES'] = device_str
    log_s = 'Using device: '
-    if len(device_ids) == 0:  # cpu
-        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
-        device: str = 'cpu'
-        log_s += device
+    if len(device_ids) == 0:
+        master_device: str = 'cpu'
+        log_s += 'cpu'
    else:
-        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(
-            [str(d) for d in device_ids])
        assert torch.cuda.is_available(
        ) and torch.cuda.device_count() >= len(device_ids)
-        log_s += f"cuda:{','.join([str(d) for d in device_ids])}"  # e.g. "cuda:1,7,8"
-        device = 'cuda:0'
+        master_device = 'cuda:0'
+        log_s += f'cuda:{device_str}'
    logger.info(log_s)
-    return torch.device(device)
+    return torch.device(master_device)
def seed_everything(seed: Optional[int] = None, gpu_dtm: bool = False) -> int:
@@ -148,37 +153,27 @@ def get_T_max(dataset_len: int, batch_size: int, max_epochs: int,
def tokenize_function(system: str, user: str, assistant: Optional[str],
                      tokenizer) -> Dict[str, Any]:
    """Only applicable to baichuan and chatglm2. Other models need to be tested"""
-    system_text = SYSTEM_TEXT.format(system=system)
-    user_text = USER_TEXT.format(user=user)
-    system_text_ids: List[int] = tokenizer(
-        system_text, return_attention_mask=False,
-        add_special_tokens=True)['input_ids']
-    user_text_ids: List[int] = tokenizer(
-        user_text, return_attention_mask=False,
-        add_special_tokens=False)['input_ids']
-    assistant_p_input_ids: List[int] = tokenizer(
-        ASSISTANT_PROMPT,
-        return_attention_mask=False,
-        add_special_tokens=False)['input_ids']
-    # tokenizer.bos_token_id: Avoid `assistant` being empty
-    assistant_input_ids: List[int] = [tokenizer.bos_token_id]
+    src_text = PROMPT.format(system=system, user=user)
+    src_input_ids: List[int] = tokenizer(
+        src_text, return_attention_mask=False,
+        add_special_tokens=True)['input_ids']
+    #
+    tgt_input_ids: List[int] = []
    if assistant is not None:
-        assistant_input_ids += tokenizer(
+        tgt_input_ids += tokenizer(
            assistant, return_attention_mask=False,
            add_special_tokens=False)['input_ids']
-        assistant_input_ids += [tokenizer.eos_token_id]
+        tgt_input_ids += [tokenizer.eos_token_id]
+        labels = [-100] * len(src_input_ids) + tgt_input_ids
+    else:
+        labels = None
+    input_ids = src_input_ids + tgt_input_ids
    #
-    input_ids = system_text_ids + user_text_ids + assistant_p_input_ids + assistant_input_ids
-    if assistant is not None:  # train, val
+    if assistant is not None:
        if len(input_ids) > MAX_LENGTH:
            return {}
-        len_mask = len(input_ids) - len(assistant_input_ids)
-        labels = [-100] * len_mask + assistant_input_ids
-    else:  # test
+    else:
        input_ids = input_ids[-TEST_MAX_LENGTH:]
-        labels = None
    #
    return {'input_ids': input_ids, 'labels': labels}
@@ -221,7 +216,7 @@ def print_examples(examples: Dict[str, Any], tokenizer) -> None:
    print(f'[INPUT_IDS] {tokenizer.decode(input_ids)}')
    print()
    print(
-        f'[LABLES] {tokenizer.decode([l if l != -100 else 0 for l in labels])}'
+        f'[LABLES] {tokenizer.decode([lb if lb != -100 else 0 for lb in labels])}'
    )
@@ -305,12 +300,21 @@ class MyMetric(Metric):
        raise NotImplementedError
-def get_baichuan_model_tokenizer(model_dir: Optional[str] = None,
-                                 load_model: bool = True):
-    if model_dir is None:
-        model_id = 'baichuan-inc/baichuan-7B'
-        model_dir = get_model_dir(model_id, None)
-    #
+def _add_special_token(tokenizer):
+    if tokenizer.eos_token_id is None:
+        tokenizer.eos_token_id = 2
+    if tokenizer.bos_token_id is None:
+        tokenizer.bos_token_id = 1
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = 0
+    logger.info(f'bos_token_id: {tokenizer.bos_token_id}, '
+                f'eos_token_id: {tokenizer.eos_token_id}, '
+                f'pad_token_id: {tokenizer.pad_token_id}')
+def get_baichuan7B_model_tokenizer(model_dir: str,
+                                   load_model: bool = True,
+                                   add_special_token: bool = True):
    sys.path.insert(0, model_dir)
    from configuration_baichuan import BaiChuanConfig
    from tokenization_baichuan import BaiChuanTokenizer
@@ -327,16 +331,14 @@ def get_baichuan_model_tokenizer(model_dir: Optional[str] = None,
        device_map='auto',
        torch_dtype=torch.float16)
    #
+    if add_special_token:
+        _add_special_token(tokenizer)
    return model, tokenizer
-def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None,
-                                 load_model: bool = True):
-    if model_dir is None:
-        model_id = 'ZhipuAI/chatglm2-6b'
-        model_revision = 'v1.0.3'
-        model_dir = snapshot_download(model_id, model_revision)
-    #
+def get_chatglm2_model_tokenizer(model_dir: str,
+                                 load_model: bool = True,
+                                 add_special_token: bool = True):
    config = read_config(model_dir)
    config['model'] = ConfigDict({'type': 'chatglm2-6b'})
    tokenizer = ChatGLM2Tokenizer.from_pretrained(model_dir)
@@ -347,6 +349,8 @@ def get_chatglm2_model_tokenizer(model_dir: Optional[str] = None,
        cfg_dict=config,
        device_map='auto',
        torch_dtype=torch.float16)
+    if add_special_token:
+        _add_special_token(tokenizer)
    return model, tokenizer
@@ -355,7 +359,7 @@ def make_dataset(
                   Dict[str, Any]]
) -> MyDataset:
    """
-    split: Literal["train", "validation"]
+    split: Literal['train', 'validation']
    """
    dataset = MsDataset.load(
        'modelscope/ms_hackathon_23_agent_train_dev', split=split)
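The net effect of the reworked tokenize_function above is that only the assistant reply (plus the eos token) contributes to the loss; every prompt position is masked with -100. A minimal self-contained sketch with made-up token ids:

# Sketch only: fake ids standing in for real tokenizer output.
src_input_ids = [101, 102, 103]   # "System/Human/AI" prompt tokens
tgt_input_ids = [201, 202, 2]     # assistant reply tokens + eos (2)
input_ids = src_input_ids + tgt_input_ids
labels = [-100] * len(src_input_ids) + tgt_input_ids
assert labels == [-100, -100, -100, 201, 202, 2]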

View File

@@ -16,15 +16,6 @@
"### 配置实验环境" "### 配置实验环境"
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install transformers"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 1,
@@ -62,8 +53,7 @@
"source": [ "source": [
"from _common import *\n", "from _common import *\n",
"from transformers import TextStreamer\n", "from transformers import TextStreamer\n",
"device_ids = list(range(min(4, torch.cuda.device_count())))\n", "device_ids = [0, 1]\n",
"logger.info(device_ids)\n",
"select_device(device_ids)" "select_device(device_ids)"
] ]
}, },
@@ -152,12 +142,11 @@
    }
   ],
   "source": [
-    "CKPT_FAPTH = \"/home/hackathon/my_git/agent/runs/baichuan/v10-20230702-172449/output_best/pytorch_model.bin\"\n",
-    "LORA_TARGET_MODULES = [\"W_pack\"]\n",
+    "CKPT_FAPTH = '/home/hackathon/my_git/agent/runs/baichuan/v10-20230702-172449/output_best/pytorch_model.bin'\n",
+    "LORA_TARGET_MODULES = ['W_pack']\n",
    "\n",
-    "model, tokenizer = get_baichuan_model_tokenizer()\n",
-    "if tokenizer.pad_token_id is None:\n",
-    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "model_dir = snapshot_download('baichuan-inc/baichuan-7B', 'v1.0.5')\n",
+    "model, tokenizer = get_baichuan7B_model_tokenizer(model_dir)\n",
    "model.bfloat16()  # Consistent with training"
   ]
  },
@@ -225,7 +214,7 @@
" lora_alpha=LORA_ALPHA,\n", " lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P,\n", " lora_dropout=LORA_DROPOUT_P,\n",
" pretrained_weights=CKPT_FAPTH)\n", " pretrained_weights=CKPT_FAPTH)\n",
"logger.info(f\"lora_config: {lora_config}\")\n", "logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)" "Swift.prepare_model(model, lora_config)"
] ]
}, },
@@ -289,8 +278,8 @@
    }
   ],
   "source": [
-    "test_dataset = make_dataset(\"validation\", lambda system, user, assistant:\n",
-    "                            {\"system\": system, \"user\": user, \"assistant\": assistant})"
+    "test_dataset = make_dataset('validation', lambda system, user, assistant:\n",
+    "                            {'system': system, 'user': user, 'assistant': assistant})"
   ]
  },
  {
@@ -451,20 +440,21 @@
"source": [ "source": [
"streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"for d in test_dataset[:5]:\n", "for d in test_dataset[:5]:\n",
" system = d[\"system\"]\n", " system = d['system']\n",
" user = d[\"user\"]\n", " user = d['user']\n",
" assistant = d[\"assistant\"]\n", " assistant = d['assistant']\n",
" input_ids = tokenize_function(system, user, None, tokenizer)[\"input_ids\"]\n", " input_ids = tokenize_function(system, user, None, tokenizer)['input_ids']\n",
" print(f\"[TEST]{tokenizer.decode(input_ids)}\", end=\"\")\n", " print(f'[TEST]{tokenizer.decode(input_ids)}', end='')\n",
" input_ids = torch.tensor(input_ids)[None].cuda()\n", " input_ids = torch.tensor(input_ids)[None].cuda()\n",
" attention_mask = torch.ones_like(input_ids)\n", " attention_mask = torch.ones_like(input_ids)\n",
" generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n", " generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n",
" attention_mask=attention_mask,\n", " attention_mask=attention_mask,\n",
" streamer=streamer, pad_token_id=tokenizer.pad_token_id)\n", " streamer=streamer, pad_token_id=tokenizer.eos_token_id, \n",
" temperature=0.7, top_k=50, top_p=0.7, do_sample=True)\n",
" print()\n", " print()\n",
" print(f\"[LABELS]{assistant}\")\n", " print(f'[LABELS]{assistant}')\n",
" print(\"-----------------------------------------------------------------------------------\")\n", " print('-----------------------------------------------------------------------------------')\n",
" # input(\"next[ENTER]\")" " # input('next[ENTER]')"
] ]
} }
], ],
@@ -484,7 +474,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.11" "version": "3.10.12"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -33,14 +33,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# !pip install modelscope -U\n", "# !pip install modelscope\n",
"# !pip install numpy pandas matplotlib scikit-learn\n", "# !pip install numpy pandas matplotlib scikit-learn\n",
"# !pip install transformers datasets\n", "# !pip install transformers datasets\n",
"# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n", "# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n",
"# !pip install tqdm\n", "# !pip install tqdm tensorboard torchmetrics sentencepiece charset_normalizer accelerate\n",
"# !pip install tensorboard\n", "\n",
"# !pip install torchmetrics\n",
"#\n",
"# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy" "# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy"
] ]
}, },
@@ -73,8 +71,7 @@
   ],
   "source": [
    "from _common import *\n",
-    "device_ids = list(range(min(4, torch.cuda.device_count())))\n",
-    "logger.info(device_ids)\n",
+    "device_ids = [0, 1]\n",
    "select_device(device_ids)\n",
    "_ = seed_everything(42)"
   ]
@@ -130,22 +127,16 @@
    }
   ],
   "source": [
-    "model_id = \"baichuan-inc/baichuan-7B\"\n",
-    "WORK_DIR = \"runs/baichuan\"\n",
-    "LORA_TARGET_MODULES = [\"W_pack\"]\n",
+    "WORK_DIR = 'runs/baichuan'\n",
+    "LORA_TARGET_MODULES = ['W_pack']\n",
    "#\n",
-    "model_dir = get_model_dir(model_id, None)\n",
-    "model, tokenizer = get_baichuan_model_tokenizer(model_dir)\n",
+    "model_dir = snapshot_download('baichuan-inc/baichuan-7B', 'v1.0.5')\n",
+    "model, tokenizer = get_baichuan7B_model_tokenizer(model_dir)\n",
    "#\n",
    "GRADIENT_CHECKPOINTING = True\n",
    "if GRADIENT_CHECKPOINTING:\n",
    "    model.gradient_checkpointing_enable()\n",
-    "    model.enable_input_require_grads()\n",
-    "if tokenizer.pad_token_id is None:\n",
-    "    tokenizer.pad_token_id = tokenizer.eos_token_id\n",
-    "#\n",
-    "logger.info(f\"bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, \"\n",
-    "            f\"pad_token_id: {tokenizer.pad_token_id}\")"
+    "    model.enable_input_require_grads()"
   ]
  },
  {
@@ -237,13 +228,13 @@
" rank=LORA_RANK,\n", " rank=LORA_RANK,\n",
" lora_alpha=LORA_ALPHA,\n", " lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P)\n", " lora_dropout=LORA_DROPOUT_P)\n",
"logger.info(f\"lora_config: {lora_config}\")\n", "logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)\n", "Swift.prepare_model(model, lora_config)\n",
"#\n", "#\n",
"show_freeze_layers(model)\n", "show_freeze_layers(model)\n",
"print_model_info(model)\n", "print_model_info(model)\n",
"_p = list(model.parameters())[100]\n", "_p = list(model.parameters())[100]\n",
"logger.info(f\"device: {_p.device}, dtype: {_p.dtype}\")\n", "logger.info(f'device: {_p.device}, dtype: {_p.dtype}')\n",
"model.bfloat16()" "model.bfloat16()"
] ]
}, },
@@ -308,8 +299,8 @@
   ],
   "source": [
    "tokenize_function = partial(tokenize_function, tokenizer=tokenizer)\n",
-    "train_dataset = make_dataset(\"train\", tokenize_function)\n",
-    "val_dataset = make_dataset(\"validation\", tokenize_function)\n",
+    "train_dataset = make_dataset('train', tokenize_function)\n",
+    "val_dataset = make_dataset('validation', tokenize_function)\n",
    "# Data analysis\n",
    "stat_dataset(train_dataset)\n",
    "stat_dataset(val_dataset)\n",
@@ -339,7 +330,7 @@
    }
   ],
   "source": [
-    "cfg_file = os.path.join(model_dir, \"configuration.json\")\n",
+    "cfg_file = os.path.join(model_dir, 'configuration.json')\n",
    "#\n",
    "BATCH_SIZE = 1\n",
    "MAX_EPOCHS = 1\n",
@@ -347,62 +338,62 @@
"WORK_DIR = get_work_dir(WORK_DIR)\n", "WORK_DIR = get_work_dir(WORK_DIR)\n",
"EVAL_INTERVAL = 200\n", "EVAL_INTERVAL = 200\n",
"CONFIG = Config({\n", "CONFIG = Config({\n",
" \"train\": {\n", " 'train': {\n",
" \"dataloader\": {\n", " 'dataloader': {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n", " 'batch_size_per_gpu': BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n", " 'workers_per_gpu': 1,\n",
" \"shuffle\": True,\n", " 'shuffle': True,\n",
" \"drop_last\": True,\n", " 'drop_last': True,\n",
" \"pin_memory\": True\n", " 'pin_memory': True\n",
" },\n", " },\n",
" \"max_epochs\": MAX_EPOCHS,\n", " 'max_epochs': MAX_EPOCHS,\n",
" \"work_dir\": WORK_DIR,\n", " 'work_dir': WORK_DIR,\n",
" \"optimizer\": {\n", " 'optimizer': {\n",
" \"type\": \"AdamW\",\n", " 'type': 'AdamW',\n",
" \"lr\": 1e-4,\n", " 'lr': 1e-4,\n",
" \"weight_decay\": 0.01,\n", " 'weight_decay': 0.01,\n",
" \"options\": {\n", " 'options': {\n",
" \"cumulative_iters\": 16, \"grad_clip\": {\n", " 'cumulative_iters': 16, 'grad_clip': {\n",
" \"norm_type\": 2,\n", " 'norm_type': 2,\n",
" \"max_norm\": 2.0\n", " 'max_norm': 2.0\n",
" }\n", " }\n",
" }\n", " }\n",
" },\n", " },\n",
" \"lr_scheduler\": {\n", " 'lr_scheduler': {\n",
" \"type\": \"CosineAnnealingLR\",\n", " 'type': 'CosineAnnealingLR',\n",
" \"T_max\": T_max,\n", " 'T_max': T_max,\n",
" \"eta_min\": 1e-5,\n", " 'eta_min': 1e-5,\n",
" \"options\": {\n", " 'options': {\n",
" \"by_epoch\": False,\n", " 'by_epoch': False,\n",
" \"warmup\": {\n", " 'warmup': {\n",
" 'type': 'LinearWarmup',\n", " 'type': 'LinearWarmup',\n",
" 'warmup_ratio': 0.1,\n", " 'warmup_ratio': 0.1,\n",
" \"warmup_iters\": 200\n", " 'warmup_iters': 200\n",
" }\n", " }\n",
" }\n", " }\n",
" },\n", " },\n",
" \"hooks\": [\n", " 'hooks': [\n",
" {\"type\": \"CheckpointHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL, \"max_checkpoint_num\": 1},\n", " {'type': 'CheckpointHook', 'by_epoch': False, 'interval': EVAL_INTERVAL, 'max_checkpoint_num': 1},\n",
" {\"type\": \"EvaluationHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL},\n", " {'type': 'EvaluationHook', 'by_epoch': False, 'interval': EVAL_INTERVAL},\n",
" {\"type\": \"BestCkptSaverHook\",\n", " {'type': 'BestCkptSaverHook',\n",
" \"metric_key\": \"acc\",\n", " 'metric_key': 'acc',\n",
" \"save_best\": True, \"rule\": \"max\", \"max_checkpoint_num\": 1},\n", " 'save_best': True, 'rule': 'max', 'max_checkpoint_num': 1},\n",
" {\"type\": \"TextLoggerHook\",\n", " {'type': 'TextLoggerHook',\n",
" \"by_epoch\": True, # Whether EpochBasedTrainer is used\n", " 'by_epoch': True, # Whether EpochBasedTrainer is used\n",
" \"interval\": 5},\n", " 'interval': 5},\n",
" {\"type\": \"TensorboardHook\", \"by_epoch\": False, \"interval\": 5}\n", " {'type': 'TensorboardHook', 'by_epoch': False, 'interval': 5}\n",
" ]\n", " ]\n",
" },\n", " },\n",
" \"evaluation\": {\n", " 'evaluation': {\n",
" \"dataloader\": {\n", " 'dataloader': {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n", " 'batch_size_per_gpu': BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n", " 'workers_per_gpu': 1,\n",
" \"shuffle\": False,\n", " 'shuffle': False,\n",
" \"drop_last\": False,\n", " 'drop_last': False,\n",
" \"pin_memory\": True\n", " 'pin_memory': True\n",
" },\n", " },\n",
" \"metrics\": [\n", " 'metrics': [\n",
" {\"type\": \"my_metric\", \"vocab_size\": tokenizer.vocab_size}\n", " {'type': 'my_metric', 'vocab_size': tokenizer.vocab_size}\n",
" ]\n", " ]\n",
" }\n", " }\n",
"})" "})"
@@ -1778,16 +1769,16 @@
    }
   ],
   "source": [
-    "tb_dir = os.path.join(WORK_DIR, \"tensorboard_output\")\n",
+    "tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')\n",
    "fname = os.listdir(tb_dir)[0]\n",
    "tb_path = os.path.join(tb_dir, fname)\n",
    "#\n",
    "data = read_tensorboard_file(tb_path)\n",
    "print(data.keys())\n",
-    "_ = plot_image(data, \"loss\", 0.9)\n",
-    "_ = plot_image(data, \"lr\", 0)\n",
-    "_ = plot_image(data, \"evaluation/acc\", 0)\n",
-    "_ = plot_image(data, \"evaluation/loss\", 0)"
+    "_ = plot_image(data, 'loss', 0.9)\n",
+    "_ = plot_image(data, 'lr', 0)\n",
+    "_ = plot_image(data, 'evaluation/acc', 0)\n",
+    "_ = plot_image(data, 'evaluation/loss', 0)"
   ]
  },
  {

View File

@@ -17,15 +17,6 @@
"The following code is copied from baichuan_infer.ipynb" "The following code is copied from baichuan_infer.ipynb"
] ]
}, },
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# !pip install transformers"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
@@ -63,8 +54,7 @@
"source": [ "source": [
"from _common import *\n", "from _common import *\n",
"from transformers import TextStreamer\n", "from transformers import TextStreamer\n",
"device_ids = list(range(min(4, torch.cuda.device_count())))\n", "device_ids = [0, 1]\n",
"logger.info(device_ids)\n",
"select_device(device_ids)" "select_device(device_ids)"
] ]
}, },
@@ -149,14 +139,11 @@
    }
   ],
   "source": [
-    "CKPT_FAPTH = \"/home/hackathon/my_git/agent/runs/chatglm2/v1-20230702-203505/output_best/pytorch_model.bin\"\n",
-    "LORA_TARGET_MODULES = [\"query_key_value\"]\n",
+    "CKPT_FAPTH = '/home/hackathon/my_git/agent/runs/chatglm2/v1-20230702-203505/output_best/pytorch_model.bin'\n",
+    "LORA_TARGET_MODULES = ['query_key_value']\n",
    "\n",
-    "model, tokenizer = get_chatglm2_model_tokenizer()\n",
-    "if tokenizer.eos_token_id is None:\n",
-    "    tokenizer.eos_token_id = tokenizer.pad_token_id\n",
-    "if tokenizer.bos_token_id is None:\n",
-    "    tokenizer.bos_token_id = 1\n",
+    "model_dir = snapshot_download('ZhipuAI/chatglm2-6b', 'v1.0.6')\n",
+    "model, tokenizer = get_chatglm2_model_tokenizer(model_dir)\n",
    "model.bfloat16()  # Consistent with training"
   ]
  },
@@ -230,7 +217,7 @@
" lora_alpha=LORA_ALPHA,\n", " lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P,\n", " lora_dropout=LORA_DROPOUT_P,\n",
" pretrained_weights=CKPT_FAPTH)\n", " pretrained_weights=CKPT_FAPTH)\n",
"logger.info(f\"lora_config: {lora_config}\")\n", "logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)" "Swift.prepare_model(model, lora_config)"
] ]
}, },
@@ -295,8 +282,8 @@
    }
   ],
   "source": [
-    "test_dataset = make_dataset(\"validation\", lambda system, user, assistant:\n",
-    "                            {\"system\": system, \"user\": user, \"assistant\": assistant})"
+    "test_dataset = make_dataset('validation', lambda system, user, assistant:\n",
+    "                            {'system': system, 'user': user, 'assistant': assistant})"
   ]
  },
  {
@@ -484,20 +471,21 @@
"source": [ "source": [
"streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"for d in test_dataset[:5]:\n", "for d in test_dataset[:5]:\n",
" system = d[\"system\"]\n", " system = d['system']\n",
" user = d[\"user\"]\n", " user = d['user']\n",
" assistant = d[\"assistant\"]\n", " assistant = d['assistant']\n",
" input_ids = tokenize_function(system, user, None, tokenizer)[\"input_ids\"]\n", " input_ids = tokenize_function(system, user, None, tokenizer)['input_ids']\n",
" print(f\"[TEST]{tokenizer.decode(input_ids)}\", end=\"\")\n", " print(f'[TEST]{tokenizer.decode(input_ids)}', end='')\n",
" input_ids = torch.tensor(input_ids)[None].cuda()\n", " input_ids = torch.tensor(input_ids)[None].cuda()\n",
" attention_mask = torch.ones_like(input_ids)\n", " attention_mask = torch.ones_like(input_ids)\n",
" generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n", " generate_ids = model.generate(input_ids=input_ids, max_new_tokens=512,\n",
" attention_mask=attention_mask,\n", " attention_mask=attention_mask,\n",
" streamer=streamer, pad_token_id=tokenizer.pad_token_id)\n", " streamer=streamer, pad_token_id=tokenizer.eos_token_id, \n",
" temperature=0.7, top_k=50, top_p=0.7, do_sample=True)\n",
" print()\n", " print()\n",
" print(f\"[LABELS]{assistant}\")\n", " print(f'[LABELS]{assistant}')\n",
" print(\"-----------------------------------------------------------------------------------\")\n", " print('-----------------------------------------------------------------------------------')\n",
" # input(\"next[ENTER]\")" " # input('next[ENTER]')"
] ]
} }
], ],
@@ -517,7 +505,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.11" "version": "3.10.12"
}, },
"orig_nbformat": 4 "orig_nbformat": 4
}, },

View File

@@ -40,20 +40,18 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# !pip install modelscope -U\n", "# !pip install modelscope\n",
"# !pip install numpy pandas matplotlib scikit-learn\n", "# !pip install numpy pandas matplotlib scikit-learn\n",
"# !pip install transformers datasets\n", "# !pip install transformers datasets\n",
"# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n", "# !conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia\n",
"# !pip install tqdm\n", "# !pip install tqdm tensorboard torchmetrics sentencepiece charset_normalizer accelerate\n",
"# !pip install tensorboard\n", "\n",
"# !pip install torchmetrics\n",
"#\n",
"# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy" "# !pip install numpy -U # Resolve torchmetrics dependencies and update numpy"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -78,8 +76,7 @@
   ],
   "source": [
    "from _common import *\n",
-    "device_ids = list(range(min(4, torch.cuda.device_count())))\n",
-    "logger.info(device_ids)\n",
+    "device_ids = [0, 1]\n",
    "select_device(device_ids)\n",
    "_ = seed_everything(42)"
   ]
@@ -134,26 +131,16 @@
    }
   ],
   "source": [
-    "model_id = \"ZhipuAI/chatglm2-6b\"\n",
-    "model_revision = \"v1.0.3\"\n",
-    "WORK_DIR = \"runs/chatglm2\"\n",
-    "LORA_TARGET_MODULES = [\"query_key_value\"]\n",
+    "WORK_DIR = 'runs/chatglm2'\n",
+    "LORA_TARGET_MODULES = ['query_key_value']\n",
    "#\n",
-    "model_dir = get_model_dir(model_id, model_revision)\n",
+    "model_dir = snapshot_download('ZhipuAI/chatglm2-6b', 'v1.0.6')\n",
    "model, tokenizer = get_chatglm2_model_tokenizer(model_dir)\n",
-    "# chatglm2 does not support gradient_checkpointing\n",
-    "GRADIENT_CHECKPOINTING = False\n",
+    "#\n",
+    "GRADIENT_CHECKPOINTING = True\n",
    "if GRADIENT_CHECKPOINTING:\n",
    "    model.gradient_checkpointing_enable()\n",
-    "    model.enable_input_require_grads()\n",
-    "logger.info(tokenizer.special_tokens)\n",
-    "if tokenizer.eos_token_id is None:\n",
-    "    tokenizer.eos_token_id = tokenizer.pad_token_id\n",
-    "if tokenizer.bos_token_id is None:\n",
-    "    tokenizer.bos_token_id = 1\n",
-    "#\n",
-    "logger.info(f\"bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, \"\n",
-    "            f\"pad_token_id: {tokenizer.pad_token_id}\")"
+    "    model.enable_input_require_grads()"
   ]
  },
  {
@@ -251,13 +238,13 @@
" rank=LORA_RANK,\n", " rank=LORA_RANK,\n",
" lora_alpha=LORA_ALPHA,\n", " lora_alpha=LORA_ALPHA,\n",
" lora_dropout=LORA_DROPOUT_P)\n", " lora_dropout=LORA_DROPOUT_P)\n",
"logger.info(f\"lora_config: {lora_config}\")\n", "logger.info(f'lora_config: {lora_config}')\n",
"Swift.prepare_model(model, lora_config)\n", "Swift.prepare_model(model, lora_config)\n",
"#\n", "#\n",
"show_freeze_layers(model)\n", "show_freeze_layers(model)\n",
"print_model_info(model)\n", "print_model_info(model)\n",
"_p = list(model.parameters())[100]\n", "_p = list(model.parameters())[100]\n",
"logger.info(f\"device: {_p.device}, dtype: {_p.dtype}\")\n", "logger.info(f'device: {_p.device}, dtype: {_p.dtype}')\n",
"model.bfloat16()" "model.bfloat16()"
] ]
}, },
@@ -399,8 +386,8 @@
], ],
"source": [ "source": [
"tokenize_function = partial(tokenize_function, tokenizer=tokenizer)\n", "tokenize_function = partial(tokenize_function, tokenizer=tokenizer)\n",
"train_dataset = make_dataset(\"train\", tokenize_function)\n", "train_dataset = make_dataset('train', tokenize_function)\n",
"val_dataset = make_dataset(\"validation\", tokenize_function)\n", "val_dataset = make_dataset('validation', tokenize_function)\n",
"# Data analysis\n", "# Data analysis\n",
"stat_dataset(train_dataset)\n", "stat_dataset(train_dataset)\n",
"stat_dataset(val_dataset)\n", "stat_dataset(val_dataset)\n",
@@ -431,7 +418,7 @@
} }
], ],
"source": [ "source": [
"cfg_file = os.path.join(model_dir, \"configuration.json\")\n", "cfg_file = os.path.join(model_dir, 'configuration.json')\n",
"#\n", "#\n",
"BATCH_SIZE = 1\n", "BATCH_SIZE = 1\n",
"MAX_EPOCHS = 1\n", "MAX_EPOCHS = 1\n",
@@ -439,62 +426,62 @@
"WORK_DIR = get_work_dir(WORK_DIR)\n", "WORK_DIR = get_work_dir(WORK_DIR)\n",
"EVAL_INTERVAL = 200\n", "EVAL_INTERVAL = 200\n",
"CONFIG = Config({\n", "CONFIG = Config({\n",
" \"train\": {\n", " 'train': {\n",
" \"dataloader\": {\n", " 'dataloader': {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n", " 'batch_size_per_gpu': BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n", " 'workers_per_gpu': 1,\n",
" \"shuffle\": True,\n", " 'shuffle': True,\n",
" \"drop_last\": True,\n", " 'drop_last': True,\n",
" \"pin_memory\": True\n", " 'pin_memory': True\n",
" },\n", " },\n",
" \"max_epochs\": MAX_EPOCHS,\n", " 'max_epochs': MAX_EPOCHS,\n",
" \"work_dir\": WORK_DIR,\n", " 'work_dir': WORK_DIR,\n",
" \"optimizer\": {\n", " 'optimizer': {\n",
" \"type\": \"AdamW\",\n", " 'type': 'AdamW',\n",
" \"lr\": 1e-4,\n", " 'lr': 1e-4,\n",
" \"weight_decay\": 0.01,\n", " 'weight_decay': 0.01,\n",
" \"options\": {\n", " 'options': {\n",
" \"cumulative_iters\": 16, \"grad_clip\": {\n", " 'cumulative_iters': 16, 'grad_clip': {\n",
" \"norm_type\": 2,\n", " 'norm_type': 2,\n",
" \"max_norm\": 2.0\n", " 'max_norm': 2.0\n",
" }\n", " }\n",
" }\n", " }\n",
" },\n", " },\n",
" \"lr_scheduler\": {\n", " 'lr_scheduler': {\n",
" \"type\": \"CosineAnnealingLR\",\n", " 'type': 'CosineAnnealingLR',\n",
" \"T_max\": T_max,\n", " 'T_max': T_max,\n",
" \"eta_min\": 1e-5,\n", " 'eta_min': 1e-5,\n",
" \"options\": {\n", " 'options': {\n",
" \"by_epoch\": False,\n", " 'by_epoch': False,\n",
" \"warmup\": {\n", " 'warmup': {\n",
" 'type': 'LinearWarmup',\n", " 'type': 'LinearWarmup',\n",
" 'warmup_ratio': 0.1,\n", " 'warmup_ratio': 0.1,\n",
" \"warmup_iters\": 200\n", " 'warmup_iters': 200\n",
" }\n", " }\n",
" }\n", " }\n",
" },\n", " },\n",
" \"hooks\": [\n", " 'hooks': [\n",
" {\"type\": \"CheckpointHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL, \"max_checkpoint_num\": 1},\n", " {'type': 'CheckpointHook', 'by_epoch': False, 'interval': EVAL_INTERVAL, 'max_checkpoint_num': 1},\n",
" {\"type\": \"EvaluationHook\", \"by_epoch\": False, \"interval\": EVAL_INTERVAL},\n", " {'type': 'EvaluationHook', 'by_epoch': False, 'interval': EVAL_INTERVAL},\n",
" {\"type\": \"BestCkptSaverHook\",\n", " {'type': 'BestCkptSaverHook',\n",
" \"metric_key\": \"acc\",\n", " 'metric_key': 'acc',\n",
" \"save_best\": True, \"rule\": \"max\", \"max_checkpoint_num\": 1},\n", " 'save_best': True, 'rule': 'max', 'max_checkpoint_num': 1},\n",
" {\"type\": \"TextLoggerHook\",\n", " {'type': 'TextLoggerHook',\n",
" \"by_epoch\": True, # Whether EpochBasedTrainer is used\n", " 'by_epoch': True, # Whether EpochBasedTrainer is used\n",
" \"interval\": 5},\n", " 'interval': 5},\n",
" {\"type\": \"TensorboardHook\", \"by_epoch\": False, \"interval\": 5}\n", " {'type': 'TensorboardHook', 'by_epoch': False, 'interval': 5}\n",
" ]\n", " ]\n",
" },\n", " },\n",
" \"evaluation\": {\n", " 'evaluation': {\n",
" \"dataloader\": {\n", " 'dataloader': {\n",
" \"batch_size_per_gpu\": BATCH_SIZE,\n", " 'batch_size_per_gpu': BATCH_SIZE,\n",
" \"workers_per_gpu\": 1,\n", " 'workers_per_gpu': 1,\n",
" \"shuffle\": False,\n", " 'shuffle': False,\n",
" \"drop_last\": False,\n", " 'drop_last': False,\n",
" \"pin_memory\": True\n", " 'pin_memory': True\n",
" },\n", " },\n",
" \"metrics\": [\n", " 'metrics': [\n",
" {\"type\": \"my_metric\", \"vocab_size\": tokenizer.vocab_size}\n", " {'type': 'my_metric', 'vocab_size': tokenizer.vocab_size}\n",
" ]\n", " ]\n",
" }\n", " }\n",
"})" "})"
@@ -1884,16 +1871,16 @@
} }
], ],
"source": [ "source": [
"tb_dir = os.path.join(WORK_DIR, \"tensorboard_output\")\n", "tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')\n",
"fname = os.listdir(tb_dir)[0]\n", "fname = os.listdir(tb_dir)[0]\n",
"tb_path = os.path.join(tb_dir, fname)\n", "tb_path = os.path.join(tb_dir, fname)\n",
"#\n", "#\n",
"data = read_tensorboard_file(tb_path)\n", "data = read_tensorboard_file(tb_path)\n",
"print(data.keys())\n", "print(data.keys())\n",
"_ = plot_image(data, \"loss\", 0.9)\n", "_ = plot_image(data, 'loss', 0.9)\n",
"_ = plot_image(data, \"lr\", 0)\n", "_ = plot_image(data, 'lr', 0)\n",
"_ = plot_image(data, \"evaluation/acc\", 0)\n", "_ = plot_image(data, 'evaluation/acc', 0)\n",
"_ = plot_image(data, \"evaluation/loss\", 0)" "_ = plot_image(data, 'evaluation/loss', 0)"
] ]
}, },
{ {

View File

@@ -165,6 +165,7 @@ class Models(object):
doc2bot = 'doc2bot' doc2bot = 'doc2bot'
peer = 'peer' peer = 'peer'
llama = 'llama' llama = 'llama'
llama2 = 'llama2'
chatglm_6b = 'chatglm6b' chatglm_6b = 'chatglm6b'
chatglm2_6b = 'chatglm2-6b' chatglm2_6b = 'chatglm2-6b'

View File

@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel
from packaging import version
from transformers import CLIPTextModel, CLIPTokenizer from transformers import CLIPTextModel, CLIPTokenizer
from modelscope.metainfo import Models from modelscope.metainfo import Models
@@ -34,6 +35,7 @@ class StableDiffusion(TorchModel):
""" """
super().__init__(model_dir, *args, **kwargs) super().__init__(model_dir, *args, **kwargs)
revision = kwargs.pop('revision', None) revision = kwargs.pop('revision', None)
xformers_enable = kwargs.pop('xformers_enable', False)
self.lora_tune = kwargs.pop('lora_tune', False) self.lora_tune = kwargs.pop('lora_tune', False)
self.dreambooth_tune = kwargs.pop('dreambooth_tune', False) self.dreambooth_tune = kwargs.pop('dreambooth_tune', False)
@@ -66,6 +68,18 @@ class StableDiffusion(TorchModel):
self.unet.requires_grad_(False) self.unet.requires_grad_(False)
self.unet = self.unet.to(self.device) self.unet = self.unet.to(self.device)
# xformers accelerate memory efficient attention
if xformers_enable:
import xformers
xformers_version = version.parse(xformers.__version__)
if xformers_version == version.parse('0.0.16'):
logger.warn(
'xFormers 0.0.16 cannot be used for training in some GPUs. '
'If you observe problems during training, please update xFormers to at least 0.0.17.'
)
self.unet.enable_xformers_memory_efficient_attention()
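# Illustrative note: xformers_enable is read from the constructor kwargs above, so a caller
# building this model directly can pass xformers_enable=True to switch the UNet to
# memory-efficient attention; whether a given pipeline forwards that kwarg is an assumption.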
def tokenize_caption(self, captions): def tokenize_caption(self, captions):
""" Convert caption text to token data. """ Convert caption text to token data.

View File

@@ -75,6 +75,7 @@ if TYPE_CHECKING:
DocumentGroundedDialogRerankModel) DocumentGroundedDialogRerankModel)
from .xlm_roberta import XLMRobertaConfig, XLMRobertaModel from .xlm_roberta import XLMRobertaConfig, XLMRobertaModel
from .llama import LlamaForTextGeneration, LlamaConfig, LlamaModel, LlamaTokenizer, LlamaTokenizerFast from .llama import LlamaForTextGeneration, LlamaConfig, LlamaModel, LlamaTokenizer, LlamaTokenizerFast
from .llama2 import Llama2ForTextGeneration, Llama2Config, Llama2Model, Llama2Tokenizer, Llama2TokenizerFast
else: else:
_import_structure = { _import_structure = {
@@ -170,6 +171,10 @@ else:
'LlamaForTextGeneration', 'LlamaConfig', 'LlamaModel', 'LlamaForTextGeneration', 'LlamaConfig', 'LlamaModel',
'LlamaTokenizer', 'LlamaTokenizerFast' 'LlamaTokenizer', 'LlamaTokenizerFast'
], ],
'llama2': [
'Llama2ForTextGeneration', 'Llama2Config', 'Llama2Model',
'Llama2Tokenizer', 'Llama2TokenizerFast'
],
} }
import sys import sys

View File

@@ -1,12 +1,13 @@
""" ChatGLM model configuration """ """ ChatGLM model configuration """
from transformers.configuration_utils import PretrainedConfig from transformers import PretrainedConfig
from transformers.utils import logging from transformers.utils import logging
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
class ChatGLM2Config(PretrainedConfig): class ChatGLM2Config(PretrainedConfig):
model_type = 'chatglm'
def __init__(self, def __init__(self,
num_layers=28, num_layers=28,
@@ -24,7 +25,6 @@ class ChatGLM2Config(PretrainedConfig):
post_layer_norm=True, post_layer_norm=True,
add_bias_linear=False, add_bias_linear=False,
add_qkv_bias=False, add_qkv_bias=False,
interleaved_qkv=False,
bias_dropout_fusion=True, bias_dropout_fusion=True,
multi_query_attention=False, multi_query_attention=False,
multi_query_group_num=1, multi_query_group_num=1,
@@ -32,8 +32,11 @@ class ChatGLM2Config(PretrainedConfig):
attention_softmax_in_fp32=True, attention_softmax_in_fp32=True,
fp32_residual_connection=False, fp32_residual_connection=False,
quantization_bit=0, quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs): **kwargs):
self.num_layers = num_layers self.num_layers = num_layers
self.vocab_size = padded_vocab_size
self.padded_vocab_size = padded_vocab_size self.padded_vocab_size = padded_vocab_size
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size self.ffn_hidden_size = ffn_hidden_size
@@ -55,4 +58,6 @@ class ChatGLM2Config(PretrainedConfig):
self.attention_softmax_in_fp32 = attention_softmax_in_fp32 self.attention_softmax_in_fp32 = attention_softmax_in_fp32
self.fp32_residual_connection = fp32_residual_connection self.fp32_residual_connection = fp32_residual_connection
self.quantization_bit = quantization_bit self.quantization_bit = quantization_bit
self.pre_seq_len = pre_seq_len
self.prefix_projection = prefix_projection
super().__init__(**kwargs) super().__init__(**kwargs)

View File

@@ -1,11 +1,9 @@
import base64 import base64
import bz2 import bz2
import ctypes import ctypes
from functools import partial
from typing import List from typing import List
import torch import torch
from torch.nn import Linear
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from transformers.utils import logging from transformers.utils import logging

View File

@@ -2,10 +2,9 @@
import copy import copy
import math import math
import re
import sys import sys
import warnings import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Any, Callable, Dict, List, Optional, Tuple
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
@@ -22,10 +21,11 @@ from transformers.modeling_outputs import (BaseModelOutputWithPast,
from transformers.modeling_utils import PreTrainedModel from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging from transformers.utils import logging
from modelscope import Model, TorchModel
from modelscope.metainfo import Models from modelscope.metainfo import Models
from modelscope.models import MODELS, Model, TorchModel
from modelscope.outputs import OutputKeys from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks from modelscope.utils.constant import Tasks
from ... import MODELS
from .configuration import ChatGLM2Config from .configuration import ChatGLM2Config
# flags required to enable jit fusion kernels # flags required to enable jit fusion kernels
@@ -61,17 +61,50 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
return scores return scores
class PrefixEncoder(torch.nn.Module):
"""
The torch.nn model to encode the prefix
Input shape: (batch-size, prefix-length)
Output shape: (batch-size, prefix-length, 2*layers*hidden)
"""
def __init__(self, config: ChatGLM2Config):
super().__init__()
self.prefix_projection = config.prefix_projection
if self.prefix_projection:
# Use a two-layer MLP to encode the prefix
kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
self.trans = torch.nn.Sequential(
torch.nn.Linear(kv_size, config.hidden_size), torch.nn.Tanh(),
torch.nn.Linear(config.hidden_size, kv_size))
else:
self.embedding = torch.nn.Embedding(
config.pre_seq_len, config.num_layers * config.kv_channels
* config.multi_query_group_num * 2)
def forward(self, prefix: torch.Tensor):
if self.prefix_projection:
prefix_tokens = self.embedding(prefix)
past_key_values = self.trans(prefix_tokens)
else:
past_key_values = self.embedding(prefix)
return past_key_values
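# Illustrative shape sketch, assuming hypothetical values pre_seq_len=16, num_layers=28,
# kv_channels=128 and multi_query_group_num=2: the non-projection branch embeds a (batch, 16)
# prefix into (batch, 16, 28 * 128 * 2 * 2) = (batch, 16, 14336), one key/value slot per layer;
# ChatGLMModel.get_prompt below reshapes this into per-layer prefix caches.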
def split_tensor_along_last_dim( def split_tensor_along_last_dim(
tensor: torch.Tensor, tensor: torch.Tensor,
num_partitions: int, num_partitions: int,
contiguous_split_chunks: bool = False, contiguous_split_chunks: bool = False,
) -> List[torch.Tensor]: ) -> List[torch.Tensor]:
"""Split a tensor along its last dimension. """Split a tensor along its last dimension.
Arguments: Arguments:
tensor: input tensor. tensor: input tensor.
num_partitions: number of partitions to split the tensor num_partitions: number of partitions to split the tensor
contiguous_split_chunks: If True, make each chunk contiguous contiguous_split_chunks: If True, make each chunk contiguous
in memory. in memory.
Returns: Returns:
A list of Tensors A list of Tensors
""" """
@@ -92,7 +125,7 @@ class RotaryEmbedding(nn.Module):
def __init__(self, dim, original_impl=False, device=None, dtype=None): def __init__(self, dim, original_impl=False, device=None, dtype=None):
super().__init__() super().__init__()
inv_freq = 1.0 / (10000**( inv_freq = 1.0 / (10000**(
torch.arange(0, dim, 2, device=device, dtype=dtype) / dim)) torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
self.register_buffer('inv_freq', inv_freq) self.register_buffer('inv_freq', inv_freq)
self.dim = dim self.dim = dim
self.original_impl = original_impl self.original_impl = original_impl
@@ -104,6 +137,7 @@ class RotaryEmbedding(nn.Module):
device: torch.device, device: torch.device,
base: int = 10000): base: int = 10000):
"""Enhanced Transformer with Rotary Position Embedding. """Enhanced Transformer with Rotary Position Embedding.
Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
transformers/rope/__init__.py. MIT License: transformers/rope/__init__.py. MIT License:
https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
@@ -325,6 +359,7 @@ class CoreAttention(torch.nn.Module):
class SelfAttention(torch.nn.Module): class SelfAttention(torch.nn.Module):
"""Parallel self-attention layer abstract class. """Parallel self-attention layer abstract class.
Self-attention layer takes input with size [s, b, h] Self-attention layer takes input with size [s, b, h]
and returns output of the same size. and returns output of the same size.
""" """
@@ -421,9 +456,9 @@ class SelfAttention(torch.nn.Module):
self.num_multi_query_groups_per_partition, self.num_multi_query_groups_per_partition,
self.hidden_size_per_attention_head)) self.hidden_size_per_attention_head))
else: else:
new_tensor_shape = mixed_x_layer.size()[:-1] + ( new_tensor_shape = mixed_x_layer.size()[:-1] + \
self.num_attention_heads_per_partition, # noqa (self.num_attention_heads_per_partition, # noqa
3 * self.hidden_size_per_attention_head) # noqa 3 * self.hidden_size_per_attention_head) # noqa
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
@@ -436,11 +471,11 @@ class SelfAttention(torch.nn.Module):
key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
# adjust key and value for inference # adjust key and value for inference
if kv_cache is not None:
cache_k, cache_v = kv_cache
key_layer = torch.cat((cache_k, key_layer), dim=0)
value_layer = torch.cat((cache_v, value_layer), dim=0)
if use_cache: if use_cache:
if kv_cache is not None:
cache_k, cache_v = kv_cache
key_layer = torch.cat((cache_k, key_layer), dim=0)
value_layer = torch.cat((cache_v, value_layer), dim=0)
kv_cache = (key_layer, value_layer) kv_cache = (key_layer, value_layer)
else: else:
kv_cache = None kv_cache = None
@@ -487,6 +522,7 @@ def _config_to_kwargs(args):
class MLP(torch.nn.Module): class MLP(torch.nn.Module):
"""MLP. """MLP.
MLP will take the input with h hidden state, project it to 4*h MLP will take the input with h hidden state, project it to 4*h
hidden dimension, perform nonlinear transformation, and project the hidden dimension, perform nonlinear transformation, and project the
state back into h hidden dimension. state back into h hidden dimension.
@@ -530,6 +566,7 @@ class MLP(torch.nn.Module):
class GLMBlock(torch.nn.Module): class GLMBlock(torch.nn.Module):
"""A single transformer layer. """A single transformer layer.
Transformer layer takes input with size [s, b, h] and returns an Transformer layer takes input with size [s, b, h] and returns an
output of the same size. output of the same size.
""" """
@@ -642,6 +679,8 @@ class GLMTransformer(torch.nn.Module):
device=device, device=device,
dtype=config.torch_dtype) dtype=config.torch_dtype)
self.gradient_checkpointing = False
def _get_layer(self, layer_number): def _get_layer(self, layer_number):
return self.layers[layer_number] return self.layers[layer_number]
@@ -657,6 +696,13 @@ class GLMTransformer(torch.nn.Module):
if not kv_caches: if not kv_caches:
kv_caches = [None for _ in range(self.num_layers)] kv_caches = [None for _ in range(self.num_layers)]
presents = () if use_cache else None presents = () if use_cache else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
'`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
)
use_cache = False
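# Illustrative note: checkpointed blocks are re-run during the backward pass, so key/value
# caches produced inside them cannot be kept; the cache is therefore disabled while training
# with gradient checkpointing enabled.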
all_self_attentions = None all_self_attentions = None
all_hidden_states = () if output_hidden_states else None all_hidden_states = () if output_hidden_states else None
for index in range(self.num_layers): for index in range(self.num_layers):
@@ -664,13 +710,18 @@ class GLMTransformer(torch.nn.Module):
all_hidden_states = all_hidden_states + (hidden_states, ) all_hidden_states = all_hidden_states + (hidden_states, )
layer = self._get_layer(index) layer = self._get_layer(index)
if self.gradient_checkpointing and self.training:
hidden_states, kv_cache = layer( layer_ret = torch.utils.checkpoint.checkpoint(
hidden_states, layer, hidden_states, attention_mask, rotary_pos_emb,
attention_mask, kv_caches[index], use_cache)
rotary_pos_emb, else:
kv_cache=kv_caches[index], layer_ret = layer(
use_cache=use_cache) hidden_states,
attention_mask,
rotary_pos_emb,
kv_cache=kv_caches[index],
use_cache=use_cache)
hidden_states, kv_cache = layer_ret
if use_cache: if use_cache:
presents = presents + (kv_cache, ) presents = presents + (kv_cache, )
@@ -724,7 +775,7 @@ class ChatGLMPreTrainedModel(TorchModel, PreTrainedModel):
dim=-1) # noqa dim=-1) # noqa
if padding_mask is not None: if padding_mask is not None:
full_attention_mask = full_attention_mask * padding_mask.unsqueeze( full_attention_mask = full_attention_mask * padding_mask.unsqueeze(
1) # noqa 1)
if not past_length and padding_mask is not None: if not past_length and padding_mask is not None:
full_attention_mask -= padding_mask.unsqueeze(-1) - 1 full_attention_mask -= padding_mask.unsqueeze(-1) - 1
full_attention_mask = (full_attention_mask < 0.5).bool() full_attention_mask = (full_attention_mask < 0.5).bool()
@@ -739,7 +790,7 @@ class ChatGLMPreTrainedModel(TorchModel, PreTrainedModel):
return position_ids return position_ids
def _set_gradient_checkpointing(self, module, value=False): def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, ChatGLMModel): if isinstance(module, GLMTransformer):
module.gradient_checkpointing = value module.gradient_checkpointing = value
@classmethod @classmethod
@@ -801,6 +852,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
if device is not None: if device is not None:
init_kwargs['device'] = device init_kwargs['device'] = device
self.embedding = init_method(Embedding, config, **init_kwargs) self.embedding = init_method(Embedding, config, **init_kwargs)
self.num_layers = config.num_layers
self.multi_query_group_num = config.multi_query_group_num
self.kv_channels = config.kv_channels
# Rotary positional embeddings # Rotary positional embeddings
self.seq_length = config.seq_length self.seq_length = config.seq_length
@@ -821,7 +875,30 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
bias=False, bias=False,
dtype=config.torch_dtype, dtype=config.torch_dtype,
**init_kwargs) **init_kwargs)
self.gradient_checkpointing = False self.pre_seq_len = config.pre_seq_len
self.prefix_projection = config.prefix_projection
if self.pre_seq_len is not None:
for param in self.parameters():
param.requires_grad = False
self.prefix_tokens = torch.arange(self.pre_seq_len).long()
self.prefix_encoder = PrefixEncoder(config)
self.dropout = torch.nn.Dropout(0.1)
def get_input_embeddings(self):
return self.embedding.word_embeddings
def get_prompt(self, batch_size, device, dtype=torch.half):
prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size,
-1).to(device)
past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
past_key_values = past_key_values.view(batch_size, self.pre_seq_len,
self.num_layers * 2,
self.multi_query_group_num,
self.kv_channels)
# seq_len, b, nh, hidden_size
past_key_values = self.dropout(past_key_values)
past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
return past_key_values
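# Illustrative shape sketch: the PrefixEncoder output is viewed as
# (batch, pre_seq_len, num_layers * 2, multi_query_group_num, kv_channels), permuted to
# (num_layers * 2, pre_seq_len, batch, multi_query_group_num, kv_channels), and split(2)
# along the first dimension then yields one (2, pre_seq_len, batch, multi_query_group_num,
# kv_channels) key/value prefix per transformer layer.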
def forward( def forward(
self, self,
@@ -847,6 +924,21 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
if inputs_embeds is None: if inputs_embeds is None:
inputs_embeds = self.embedding(input_ids) inputs_embeds = self.embedding(input_ids)
if self.pre_seq_len is not None:
if past_key_values is None:
past_key_values = self.get_prompt(
batch_size=batch_size,
device=input_ids.device,
dtype=inputs_embeds.dtype)
if attention_mask is not None:
attention_mask = torch.cat(
[
attention_mask.new_ones( # noqa
(batch_size, self.pre_seq_len)),
attention_mask # noqa
], # noqa
dim=-1) # noqa
if full_attention_mask is None: if full_attention_mask is None:
if (attention_mask is not None if (attention_mask is not None
and not attention_mask.all()) or (past_key_values and not attention_mask.all()) or (past_key_values
@@ -923,7 +1015,7 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
attention_mask, # noqa attention_mask, # noqa
attention_mask.new_ones( attention_mask.new_ones(
(attention_mask.shape[0], 1)) # noqa (attention_mask.shape[0], 1)) # noqa
], ], # noqa
dim=-1) # noqa dim=-1) # noqa
# update position ids # update position ids
@@ -1032,6 +1124,7 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
beam_idx at every generation step. beam_idx at every generation step.
Output shares the same memory storage as `past`. Output shares the same memory storage as `past`.
""" """
return tuple(( return tuple((
@@ -1048,11 +1141,7 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
tokenizer, tokenizer,
query: str, query: str,
history: List[Tuple[str, str]] = None): history: List[Tuple[str, str]] = None):
prompt = '' prompt = tokenizer.build_prompt(query, history=history)
for i, (old_query, response) in enumerate(history):
prompt += '[Round {}]\n\n问:{}\n\n答:{}\n\n'.format(
i + 1, old_query, response)
prompt += '[Round {}]\n\n问:{}\n\n答:'.format(len(history) + 1, query)
inputs = tokenizer([prompt], return_tensors='pt') inputs = tokenizer([prompt], return_tensors='pt')
inputs = inputs.to(self.device) inputs = inputs.to(self.device)
return inputs return inputs
@@ -1080,7 +1169,7 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
tokenizer, tokenizer,
query: str, query: str,
history: List[Tuple[str, str]] = None, history: List[Tuple[str, str]] = None,
max_length: int = 2048, max_length: int = 8192,
num_beams=1, num_beams=1,
do_sample=True, do_sample=True,
top_p=0.8, top_p=0.8,
@@ -1115,7 +1204,7 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
query: str, query: str,
history: List[Tuple[str, str]] = None, history: List[Tuple[str, str]] = None,
past_key_values=None, past_key_values=None,
max_length: int = 2048, max_length: int = 8192,
do_sample=True, do_sample=True,
top_p=0.8, top_p=0.8,
temperature=0.8, temperature=0.8,
@@ -1142,6 +1231,8 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
tokenizer, query, history=history) tokenizer, query, history=history)
if past_key_values is not None: if past_key_values is not None:
past_length = past_key_values[0][0].shape[0] past_length = past_key_values[0][0].shape[0]
if self.transformer.pre_seq_len is not None:
past_length -= self.transformer.pre_seq_len
inputs.position_ids += past_length inputs.position_ids += past_length
attention_mask = inputs.attention_mask attention_mask = inputs.attention_mask
attention_mask = torch.cat( attention_mask = torch.cat(
@@ -1157,12 +1248,13 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
outputs, past_key_values = outputs outputs, past_key_values = outputs
outputs = outputs.tolist()[0][len(inputs['input_ids'][0]):] outputs = outputs.tolist()[0][len(inputs['input_ids'][0]):]
response = tokenizer.decode(outputs) response = tokenizer.decode(outputs)
response = self.process_response(response) if response and response[-1] != '�':
new_history = history + [(query, response)] response = self.process_response(response)
if return_past_key_values: new_history = history + [(query, response)]
yield response, new_history, past_key_values if return_past_key_values:
else: yield response, new_history, past_key_values
yield response, new_history else:
yield response, new_history
@torch.no_grad() @torch.no_grad()
def stream_generate( def stream_generate(
@@ -1295,7 +1387,8 @@ class ChatGLM2ForConditionalGeneration(ChatGLMPreTrainedModel):
self.transformer.encoder, self.transformer.encoder,
bits, bits,
empty_init=empty_init, empty_init=empty_init,
device=device) device=device,
**kwargs)
return self return self
def chat(self, input: Dict, tokenizer) -> Dict: def chat(self, input: Dict, tokenizer) -> Dict:

View File

@@ -1,13 +1,10 @@
"""Tokenization classes for ChatGLM."""
import os import os
from typing import Dict, List, Optional, Union from typing import Dict, List, Optional, Union
from sentencepiece import SentencePieceProcessor from sentencepiece import SentencePieceProcessor
from transformers.tokenization_utils import PreTrainedTokenizer from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy, logging from transformers.utils import PaddingStrategy
logger = logging.get_logger(__name__)
class SPTokenizer: class SPTokenizer:
@@ -21,7 +18,7 @@ class SPTokenizer:
self.n_words: int = self.sp_model.vocab_size() self.n_words: int = self.sp_model.vocab_size()
self.bos_id: int = self.sp_model.bos_id() self.bos_id: int = self.sp_model.bos_id()
self.eos_id: int = self.sp_model.eos_id() self.eos_id: int = self.sp_model.eos_id()
self.pad_id: int = self.sp_model.eos_id() self.pad_id: int = self.sp_model.unk_id()
assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
special_tokens = ['[MASK]', '[gMASK]', '[sMASK]', 'sop', 'eop'] special_tokens = ['[MASK]', '[gMASK]', '[sMASK]', 'sop', 'eop']
@@ -62,7 +59,9 @@ class SPTokenizer:
def convert_id_to_token(self, index): def convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab.""" """Converts an index (integer) in a token (str) using the vocab."""
if index in self.index_special_tokens: if index in self.index_special_tokens or index in [
self.eos_id, self.bos_id, self.pad_id
] or index < 0:
return '' return ''
return self.sp_model.IdToPiece(index) return self.sp_model.IdToPiece(index)
@@ -76,6 +75,7 @@ class ChatGLM2Tokenizer(PreTrainedTokenizer):
super().__init__(padding_side=padding_side, **kwargs) super().__init__(padding_side=padding_side, **kwargs)
self.name = 'GLMTokenizer' self.name = 'GLMTokenizer'
self.vocab_file = vocab_file
self.tokenizer = SPTokenizer(vocab_file) self.tokenizer = SPTokenizer(vocab_file)
self.special_tokens = { self.special_tokens = {
'<bos>': self.tokenizer.bos_id, '<bos>': self.tokenizer.bos_id,
@@ -91,12 +91,16 @@ class ChatGLM2Tokenizer(PreTrainedTokenizer):
@property @property
def pad_token(self) -> str: def pad_token(self) -> str:
return '</s>' return '<unk>'
@property @property
def pad_token_id(self): def pad_token_id(self):
return self.get_command('<pad>') return self.get_command('<pad>')
@property
def eos_token(self) -> str:
return '</s>'
@property @property
def eos_token_id(self): def eos_token_id(self):
return self.get_command('<eos>') return self.get_command('<eos>')
@@ -131,11 +135,13 @@ class ChatGLM2Tokenizer(PreTrainedTokenizer):
def save_vocabulary(self, save_directory, filename_prefix=None): def save_vocabulary(self, save_directory, filename_prefix=None):
""" """
Save the vocabulary and special tokens file to a directory. Save the vocabulary and special tokens file to a directory.
Args: Args:
save_directory (`str`): save_directory (`str`):
The directory in which to save the vocabulary. The directory in which to save the vocabulary.
filename_prefix (`str`, *optional*): filename_prefix (`str`, *optional*):
An optional prefix to add to the name of the saved files. An optional prefix to add to the name of the saved files.
Returns: Returns:
`Tuple(str)`: Paths to the files saved. `Tuple(str)`: Paths to the files saved.
""" """
@@ -157,6 +163,16 @@ class ChatGLM2Tokenizer(PreTrainedTokenizer):
prefix_tokens = [self.get_command('[gMASK]'), self.get_command('sop')] prefix_tokens = [self.get_command('[gMASK]'), self.get_command('sop')]
return prefix_tokens return prefix_tokens
def build_prompt(self, query, history=None):
if history is None:
history = []
prompt = ''
for i, (old_query, response) in enumerate(history):
prompt += '[Round {}]\n\n问:{}\n\n答:{}\n\n'.format(
i + 1, old_query, response)
prompt += '[Round {}]\n\n问:{}\n\n答:'.format(len(history) + 1, query)
return prompt
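# Illustrative example (hypothetical query 'hi'): with an empty history, build_prompt('hi')
# returns '[Round 1]\n\n问:hi\n\n答:'; each earlier (query, response) pair contributes its
# own numbered '[Round i]' block before the final, unanswered round.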
def build_inputs_with_special_tokens( def build_inputs_with_special_tokens(
self, self,
token_ids_0: List[int], token_ids_0: List[int],
@@ -164,13 +180,16 @@ class ChatGLM2Tokenizer(PreTrainedTokenizer):
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format: adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]` - single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]` - pair of sequences: `[CLS] A [SEP] B [SEP]`
Args: Args:
token_ids_0 (`List[int]`): token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added. List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*): token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs. Optional second list of IDs for sequence pairs.
Returns: Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
""" """
@@ -192,16 +211,19 @@ class ChatGLM2Tokenizer(PreTrainedTokenizer):
) -> dict: ) -> dict:
""" """
Pad encoded inputs (on left/right and up to predefined length or max length in the batch) Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args: Args:
encoded_inputs: encoded_inputs:
Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
max_length: maximum length of the returned list and optionally padding length (see below). max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens. Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding. padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad - PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side: The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences - 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences - 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.

View File

@@ -0,0 +1,29 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .configuration import Llama2Config
from .text_generation import Llama2ForTextGeneration
from .backbone import Llama2Model
from .tokenization import Llama2Tokenizer
from .tokenization_fast import Llama2TokenizerFast
else:
_import_structure = {
'configuration': ['Llama2Config'],
'text_generation': ['Llama2ForTextGeneration'],
'backbone': ['Llama2Model'],
'tokenization': ['Llama2Tokenizer'],
'tokenization_fast': ['Llama2TokenizerFast'],
}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -0,0 +1,667 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch LLaMA model."""
from typing import List, Optional, Tuple, Union
import math
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from modelscope import TorchModel, Model
from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from .configuration import Llama2Config
from ... import MODELS
logger = get_logger(__name__)
_CONFIG_FOR_DOC = 'Llama2Config'
# This file is mainly copied from the llama code of transformers
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
"""
Make causal mask used for bi-directional self-attention.
"""
bsz, tgt_len = input_ids_shape
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
mask_cond = torch.arange(mask.size(-1), device=device)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
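# Illustrative sketch: for tgt_len=3 and no cached tokens the mask is
#   [[0, min, min],
#    [0, 0,   min],
#    [0, 0,   0  ]]
# with min = torch.finfo(dtype).min, expanded to (bsz, 1, 3, 3) and added to the attention
# scores, so every position attends only to itself and earlier positions.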
# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
class LlamaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
LlamaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
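# Illustrative note: unlike LayerNorm, no mean is subtracted; the result is
# weight * x / sqrt(mean(x**2, dim=-1) + eps), computed in float32 and cast back to the
# input dtype.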
class LlamaRotaryEmbedding(torch.nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(
seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
)
class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
t = t / self.scaling_factor
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
"""LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
self.scaling_factor = scaling_factor
super().__init__(dim, max_position_embeddings, base, device)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
if seq_len > self.max_position_embeddings:
base = self.base * (
(self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
) ** (self.dim / (self.dim - 2))
inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
self.register_buffer("inv_freq", inv_freq)
t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
freqs = torch.einsum("i,j->ij", t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
# The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
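# Illustrative shape sketch: with q and k of shape (bsz, num_heads, seq_len, head_dim) and
# cos, sin of shape (1, 1, seq_len, head_dim) from LlamaRotaryEmbedding, indexing by
# position_ids of shape (bsz, seq_len) broadcasts the rotation to (bsz, 1, seq_len, head_dim),
# so q_embed and k_embed keep the original (bsz, num_heads, seq_len, head_dim) shape.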
class LlamaMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.pretraining_tp = config.pretraining_tp
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
if self.pretraining_tp > 1:
slice = self.intermediate_size // self.pretraining_tp
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1)
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1)
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)]
down_proj = sum(down_proj)
else:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
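# Illustrative equivalence sketch (hypothetical shapes):
#   x = torch.randn(2, 4, 5, 8)  # (batch, num_key_value_heads, seq_len, head_dim)
#   torch.equal(repeat_kv(x, 3), torch.repeat_interleave(x, repeats=3, dim=1))  # -> True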
class LlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: Llama2Config):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.pretraining_tp = config.pretraining_tp
self.max_position_embeddings = config.max_position_embeddings
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self._init_rope()
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
)
elif scaling_type == "dynamic":
self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
if self.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp
query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
if past_key_value is not None:
# reuse k, v, self_attention
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
past_key_value = (key_states, value_states) if use_cache else None
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
if self.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class LlamaDecoderLayer(nn.Module):
def __init__(self, config: Llama2Config):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = LlamaAttention(config=config)
self.mlp = LlamaMLP(config)
self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class LlamaPreTrainedModel(TorchModel, PreTrainedModel):
config_class = Llama2Config
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
def __init__(self, config, **kwargs):
super().__init__(config.name_or_path, **kwargs)
super(Model, self).__init__(config)
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, LlamaModel):
module.gradient_checkpointing = value
@classmethod
def _instantiate(cls, **kwargs):
"""Instantiate the model.
Args:
kwargs: Input args.
model_dir: The model dir used to load the checkpoint and the label information.
num_labels: An optional arg to tell the model how many classes to initialize.
Method will call utils.parse_label_mapping if num_labels not supplied.
If num_labels is not found, the model will use the default setting (2 classes).
Returns:
The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
"""
model_dir = kwargs.pop('model_dir', None)
if model_dir is None:
config = Llama2Config(**kwargs)
model = cls(config)
else:
model = super(Model, cls).from_pretrained(
pretrained_model_name_or_path=model_dir, **kwargs)
model.model_dir = model_dir
return model
@MODELS.register_module(Tasks.backbone, module_name=Models.llama2)
class Llama2Model(LlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: Llama2Config
"""
def __init__(self, config: Llama2Config):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
# Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
combined_attention_mask = None
if input_shape[-1] > 1:
combined_attention_mask = _make_causal_mask(
input_shape,
inputs_embeds.dtype,
device=inputs_embeds.device,
past_key_values_length=past_key_values_length,
)
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
inputs_embeds.device
)
combined_attention_mask = (
expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
)
return combined_attention_mask
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = seq_length_with_past + past_key_values_length
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
# embed positions
if attention_mask is None:
attention_mask = torch.ones(
(batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
)
attention_mask = self._prepare_decoder_attention_mask(
attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
)
hidden_states = inputs_embeds
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = past_key_values[idx] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs, output_attentions, None)
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
attention_mask,
position_ids,
None,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
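A minimal sketch of exercising the backbone above end to end. The tiny hyper-parameters are made up so the example runs without pretrained weights, and it assumes `Llama2Config` (defined in the configuration file further below) and the decoder-layer classes of this file are importable together:

```python
import torch

# Deliberately tiny, hypothetical config -- just enough to run a forward pass on CPU.
config = Llama2Config(
    vocab_size=128, hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, max_position_embeddings=64)
model = Llama2Model(config).eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))
with torch.no_grad():
    out = model(input_ids=input_ids, use_cache=True, return_dict=True)

print(out.last_hidden_state.shape)  # torch.Size([1, 8, 64])
print(len(out.past_key_values))     # one (key, value) pair per decoder layer -> 2
```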

View File

@@ -0,0 +1,161 @@
# coding=utf-8
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LLaMA model configuration"""
from transformers.configuration_utils import PretrainedConfig
from modelscope.utils.logger import get_logger
logger = get_logger(__name__)
LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class Llama2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
pretraining_tp (`int`, *optional*, defaults to `1`):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-6):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_scaling=None,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")

View File

@@ -0,0 +1,182 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import CausalLMOutputWithPast
from modelscope.metainfo import Models
from modelscope.utils.constant import Tasks
from .backbone import LlamaPreTrainedModel, Llama2Model
from ... import MODELS
# This file is mainly copied from the llama code of transformers
@MODELS.register_module(Tasks.text_generation, module_name=Models.llama2)
class Llama2ForTextGeneration(LlamaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = Llama2Model(config)
self.pretraining_tp = config.pretraining_tp
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
if self.pretraining_tp > 1:
lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)]
logits = torch.cat(logits, dim=-1)
else:
logits = self.lm_head(hidden_states)
logits = logits.float()
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values:
input_ids = input_ids[:, -1:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
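A minimal sketch of the label-shifting loss path above, again with a tiny made-up config so no pretrained weights are required:

```python
import torch

config = Llama2Config(
    vocab_size=128, hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, max_position_embeddings=64)
model = Llama2ForTextGeneration(config)

input_ids = torch.randint(0, config.vocab_size, (2, 10))
# Passing labels == input_ids gives the usual next-token objective:
# logits[..., :-1, :] are scored against labels[..., 1:].
out = model(input_ids=input_ids, labels=input_ids, return_dict=True)
print(out.loss)          # scalar cross-entropy
print(out.logits.shape)  # torch.Size([2, 10, 128])
```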

View File

@@ -0,0 +1,393 @@
# coding=utf-8
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for LLaMA."""
import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from modelscope.utils.logger import get_logger
if TYPE_CHECKING:
from transformers.pipelines.conversational import Conversation
logger = get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
},
"tokenizer_file": {
"hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"hf-internal-testing/llama-tokenizer": 2048,
}
SPIECE_UNDERLINE = "▁"
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# fmt: off
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your\
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not\
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on
class Llama2Tokenizer(PreTrainedTokenizer):
"""
Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
no padding token in the original model.
Args:
vocab_file (`str`):
Path to the vocabulary file.
legacy (`bool`, *optional*, defaults to `True`):
Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
which includes fixes to properly handle tokens that appear after special tokens. A simple example:
- `legacy=True`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Checkout the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
more details.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
legacy=True,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
legacy=legacy,
**kwargs,
)
if legacy:
logger.warning_once(
f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
" read the related pull request available at https://github.com/huggingface/transformers/pull/24565"
)
self.legacy = legacy
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
state["sp_model_proto"] = self.sp_model.serialized_model_proto()
return state
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text, **kwargs) -> List[str]:
# Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
# the beginning of the text
if not self.legacy:
text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
return super().tokenize(text, **kwargs)
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text):
"""
Returns a tokenized string.
Since the sentencepiece internal model always adds a SPIECE_UNDERLINE at the beginning of the provided text,
we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
function is called with special tokens: the input is split on the special tokens, and each subsequence is
passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
the extra `SPIECE_UNDERLINE` prepended.
"""
if not self.legacy:
is_first = text.startswith(SPIECE_UNDERLINE)
if is_first:
text = text[1:]
tokens = self.sp_model.encode(text, out_type=str)
if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
return tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (
bos_token_id
+ ([0] * len(token_ids_0))
+ eos_token_id
+ bos_token_id
+ ([0] * len(token_ids_1))
+ eos_token_id
)
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
"""Builds the input ids for a conversation.
This is the format used in the provided examples. System prompts should be manually added at the beginning of
the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
```
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
```
If you want to use your own system prompt, make sure to use both `B_SYS` and `E_SYS`, as in the following:
```python
>>> from transformers import Conversation
>>> Conversation(
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
... )
```
Args:
conversation (`Conversation`):
Conversation to build input ids for.
Returns:
`List[int]`:
Input ids for the conversation.
"""
dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
[not is_user for is_user, msg in dialogue[1::2]]
):
raise ValueError(
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
)
dialog_tokens: List[int] = []
if len(conversation.past_user_inputs) > 0:
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
dialog_tokens += sum(
[
[self.bos_token_id]
+ self.encode(
f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
)
+ [self.eos_token_id]
for prompt, answer in zip(dialogue[::2], dialogue[1::2])
],
[],
)
if not (dialogue[-1][0]):
raise ValueError(f"Last message must be from user, got {dialogue[-1]['role']}")
dialog_tokens += [self.bos_token_id] + self.encode(
f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
)
return dialog_tokens
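To make the `<bos>[INST] ... [/INST]` layout above concrete, a sketch of the string that `_build_conversation_input_ids` encodes for a single user turn; the tokenizer.model path is hypothetical:

```python
user_msg = 'How do I boil an egg?'
# The first user turn gets the default system prompt spliced in front of it.
text = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + user_msg
prompt = f'{B_INST} {text.strip()} {E_INST}'

tokenizer = Llama2Tokenizer(vocab_file='/path/to/tokenizer.model')  # hypothetical path
ids = [tokenizer.bos_token_id] + tokenizer.encode(prompt, add_special_tokens=False)
print(tokenizer.decode(ids))
```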

View File

@@ -0,0 +1,249 @@
# coding=utf-8
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from shutil import copyfile
from typing import TYPE_CHECKING, Optional, Tuple
from tokenizers import processors
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers.utils import is_sentencepiece_available, logging
from transformers.utils.versions import require_version
if TYPE_CHECKING:
from transformers.pipelines.conversational import Conversation
require_version("tokenizers>=0.13.3")
if is_sentencepiece_available():
from .tokenization import Llama2Tokenizer
else:
Llama2Tokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# fmt: off
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your\
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not\
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on
class Llama2TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
This uses notably ByteFallback and no normalization.
```
from transformers import LlamaTokenizerFast
tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
tokenizer.encode("Hello this is a test")
>>> [1, 15043, 445, 338, 263, 1243]
```
If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
values of the first token and final token of an encoded sequence will not be correct). For more details, check out
the [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
contains the vocabulary necessary to instantiate a tokenizer.
tokenizer_file (`str`):
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether to clean up spaces after decoding; cleanup consists in removing potential artifacts like extra
spaces.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
slow_tokenizer_class = Llama2Tokenizer
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
add_bos_token=True,
add_eos_token=False,
**kwargs,
):
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
**kwargs,
)
self._add_bos_token = add_bos_token
self._add_eos_token = add_eos_token
self.update_post_processor()
self.vocab_file = vocab_file
self.can_save_slow_tokenizer = False if not self.vocab_file else True
def update_post_processor(self):
"""
Updates the underlying post processor with the current `bos_token` and `eos_token`.
"""
bos = self.bos_token
bos_token_id = self.bos_token_id
eos = self.eos_token
eos_token_id = self.eos_token_id
single = f"{(bos+':0 ') * self.add_bos_token}$A:0{(' '+eos+':0') * self.add_eos_token}"
pair = f"{single}{(' '+bos+':1') * self.add_bos_token} $B:1{(' '+eos+':1') * self.add_eos_token}"
special_tokens = []
if self.add_bos_token:
special_tokens.append((bos, bos_token_id))
if self.add_eos_token:
special_tokens.append((eos, eos_token_id))
self._tokenizer.post_processor = processors.TemplateProcessing(
single=single, pair=pair, special_tokens=special_tokens
)
@property
def add_eos_token(self):
return self._add_eos_token
@property
def add_bos_token(self):
return self._add_bos_token
@add_eos_token.setter
def add_eos_token(self, value):
self._add_eos_token = value
self.update_post_processor()
@add_bos_token.setter
def add_bos_token(self, value):
self._add_bos_token = value
self.update_post_processor()
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
def _build_conversation_input_ids(self, conversation: "Conversation"):
"""Builds the input ids for a conversation.
This is the format used in the provided examples. System prompts should be manually added at the beginning of
the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
```
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
```
If you want to use your own system prompt, make sure to use both `B_SYS` and `E_SYS`, as in the following:
```python
>>> from transformers import Conversation
>>> Conversation(
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
... )
```
Args:
conversation (`Conversation`):
Conversation to build input ids for.
Returns:
`List[int]`:
Input ids for the conversation.
"""
dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
[not is_user for is_user, msg in dialogue[1::2]]
):
raise ValueError(
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
)
dialog_tokens = []
if len(conversation.past_user_inputs) > 0:
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])
dialog_tokens += sum(
[
[self.bos_token_id]
+ self.encode(
f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
)
+ [self.eos_token_id]
for prompt, answer in zip(dialogue[::2], dialogue[1::2])
],
[],
)
if not (dialogue[-1][0]):
raise ValueError(f"Last message must be from user, got {dialogue[-1]['role']}")
dialog_tokens += [self.bos_token_id] + self.encode(
f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
)
return dialog_tokens
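A small usage sketch of the `add_bos_token`/`add_eos_token` properties above; the file paths are hypothetical. Assigning to either property re-runs `update_post_processor`, so the change applies to the next encode:

```python
tok = Llama2TokenizerFast(
    vocab_file='/path/to/tokenizer.model',    # hypothetical paths
    tokenizer_file='/path/to/tokenizer.json')

print(tok.encode('hello'))   # [<bos id>, ...]             add_bos_token defaults to True
tok.add_eos_token = True     # the setter calls update_post_processor()
print(tok.encode('hello'))   # [<bos id>, ..., <eos id>]
```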

View File

@@ -3,6 +3,7 @@
 import os
 from modelscope.msdatasets.ms_dataset import MsDataset
+from modelscope.utils.constant import DownloadMode
 class ASRDataset(MsDataset):
@@ -29,11 +30,14 @@ class ASRDataset(MsDataset):
         return data_list
     @classmethod
-    def load(cls,
-             dataset_name,
-             namespace='speech_asr',
-             train_set='train',
-             dev_set='validation'):
+    def load(
+            cls,
+            dataset_name,
+            namespace='speech_asr',
+            train_set='train',
+            dev_set='validation',
+            download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
+    ):
         if os.path.exists(dataset_name):
             data_dir = dataset_name
             ds_dict = {}
@@ -43,6 +47,10 @@ class ASRDataset(MsDataset):
             return ds_dict
         else:
             from modelscope.msdatasets import MsDataset
-            ds_dict = MsDataset.load(
-                dataset_name=dataset_name, namespace=namespace)
+            ds_dict = MsDataset.load(
+                dataset_name=dataset_name,
+                namespace=namespace,
+                download_mode=download_mode,
+            )
             return ds_dict

View File

@@ -223,11 +223,23 @@ class CsvDatasetBuilder(csv.Csv):
             if field_name.endswith(':FILE'):
                 transform_fields.append(field_name)
-        base_extracted_dir = self.split_path_dict.get(split_name, '')
+        base_extracted_dir: Union[str, list] = self.split_path_dict.get(
+            split_name, '')
         for field_name in transform_fields:
-            if base_extracted_dir:
+            if isinstance(base_extracted_dir,
+                          list) and len(base_extracted_dir) > 0:
+                if df.shape[0] != len(base_extracted_dir):
+                    logger.error(
+                        f"Number of lines in meta-csv file for split '{split_name}' ({df.shape[0]}) "
+                        f'does not match number of data-files({len(base_extracted_dir)})!'
+                    )
+                else:
+                    df[field_name] = base_extracted_dir
+            elif isinstance(base_extracted_dir, str) and base_extracted_dir:
                 df[field_name] = df[field_name].apply(
                     lambda x: os.path.join(base_extracted_dir, x))
+            else:
+                logger.warning(f'Nothing to do for field {field_name}')
         pa_data = pa.Table.from_pandas(df)
         return Dataset(arrow_table=pa_data)
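A toy, pandas-only illustration of the two branches introduced above; the column name and paths are made up:

```python
import os
import pandas as pd

# str branch: one extracted directory, relative paths from the meta-csv are joined onto it.
df = pd.DataFrame({'wav:FILE': ['a.wav', 'b.wav']})
base_extracted_dir = '/tmp/extracted'
df['wav:FILE'] = df['wav:FILE'].apply(lambda x: os.path.join(base_extracted_dir, x))

# list branch: one already-resolved file path per meta-csv row, assigned wholesale,
# but only when the row count matches (mirroring the check above).
df2 = pd.DataFrame({'wav:FILE': ['', '']})
resolved = ['/tmp/extracted/a.wav', '/tmp/extracted/b.wav']
if df2.shape[0] == len(resolved):
    df2['wav:FILE'] = resolved
```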

View File

@@ -93,7 +93,7 @@ class TimestampPipeline(Pipeline):
     def __call__(self,
                  audio_in: Union[str, bytes],
-                 text_in: str = None,
+                 text_in: str,
                  audio_fs: int = None,
                  recog_type: str = None,
                  audio_format: str = None,

View File

@@ -15,7 +15,7 @@ class DiffusersPipeline(Pipeline):
""" """
use `model` to create a diffusers pipeline use `model` to create a diffusers pipeline
Args: Args:
model: model id on modelscope hub. model: model id on modelscope hub or local dir.
device: str = 'gpu' device: str = 'gpu'
""" """

View File

@@ -146,7 +146,8 @@ class _DiffuersChineseStableDiffusionPipeline(StableDiffusionPipeline):
                        do_classifier_free_guidance,
                        negative_prompt=None,
                        prompt_embeds: Optional[torch.FloatTensor] = None,
-                       negative_prompt_embeds: Optional[torch.FloatTensor] = None):
+                       negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+                       lora_scale: Optional[float] = None):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -169,7 +170,14 @@ class _DiffuersChineseStableDiffusionPipeline(StableDiffusionPipeline):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
+            lora_scale (`float`, *optional*):
+                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
         """
+        # set lora scale so that monkey patched LoRA
+        # function of text encoder can correctly access it
+        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
+            self._lora_scale = lora_scale
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):

View File

@@ -6,8 +6,7 @@ import cv2
 import numpy as np
 import torch
 import torchvision.transforms as transforms
-from diffusers import \
-    StableDiffusionPipeline as DiffuserStableDiffusionPipeline
+from diffusers import DiffusionPipeline
 from PIL import Image
 from modelscope.metainfo import Pipelines
@@ -35,7 +34,7 @@ class StableDiffusionPipeline(DiffusersPipeline):
         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         # load pipeline
         torch_type = torch.float16 if self.device == 'cuda' else torch.float32
-        self.pipeline = DiffuserStableDiffusionPipeline.from_pretrained(
+        self.pipeline = DiffusionPipeline.from_pretrained(
             model, torch_dtype=torch_type)
         self.pipeline = self.pipeline.to(self.device)
         # load lora moudle to unet
@@ -48,6 +47,60 @@ class StableDiffusionPipeline(DiffusersPipeline):
     def forward(self, inputs: Dict[str, Any],
                 **forward_params) -> Dict[str, Any]:
"""
Inputs Args:
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
instead.
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
less than `1`).
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
[`schedulers.DDIMScheduler`], will be ignored for others.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
to make generation deterministic.
latents (`torch.FloatTensor`, *optional*):
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor will be generated by sampling using the supplied random `generator`.
prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
provided, text embeddings will be generated from `prompt` input argument.
negative_prompt_embeds (`torch.FloatTensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple.
callback (`Callable`, *optional*):
A function that will be called every `callback_steps` steps during inference. The function will be
called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
callback_steps (`int`, *optional*, defaults to 1):
The frequency at which the `callback` function will be called. If not specified, the callback will be
called at every step.
"""
         if not isinstance(inputs, dict):
             raise ValueError(
                 f'Expected the input to be a dictionary, but got {type(input)}'
@@ -57,7 +110,20 @@ class StableDiffusionPipeline(DiffusersPipeline):
             raise ValueError('input should contain "text", but not found')
         images = self.pipeline(
-            inputs['text'], num_inference_steps=30, guidance_scale=7.5)
+            prompt=inputs.get('text'),
+            height=inputs.get('height'),
+            width=inputs.get('width'),
+            num_inference_steps=inputs.get('num_inference_steps', 50),
+            guidance_scale=inputs.get('guidance_scale', 7.5),
+            negative_prompt=inputs.get('negative_prompt'),
+            num_images_per_prompt=inputs.get('num_images_per_prompt', 1),
+            eta=inputs.get('eta', 0.0),
+            generator=inputs.get('generator'),
+            latents=inputs.get('latents'),
+            output_type=inputs.get('output_type', 'pil'),
+            return_dict=inputs.get('return_dict', True),
+            callback=inputs.get('callback'),
+            callback_steps=inputs.get('callback_steps', 1))
         return images
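With this change the extra diffusers arguments are simply read from the input dict. A hedged usage sketch; the model id below is illustrative, and any ModelScope text-to-image model served by this pipeline should accept the same keys:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Illustrative model id, not an endorsement of a specific checkpoint.
pipe = pipeline(
    Tasks.text_to_image_synthesis,
    model='AI-ModelScope/stable-diffusion-v1-5')
result = pipe({
    'text': 'a watercolor painting of a lighthouse at dawn',
    'num_inference_steps': 30,
    'guidance_scale': 7.5,
    'negative_prompt': 'blurry, low quality',
})
```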

View File

@@ -50,6 +50,7 @@ class CheckpointHook(Hook):
         hub_revision (str): Which branch to push the model to, default is `master`.
         upload_strategy (str): The action adopted when the previous uploading is not done
             and the next one is coming, can be `cancel` or `wait`.
+        save_trainer_state (bool): Save the trainer state for continue training, default True.
         kwargs:
             by_epoch (bool): Same with `save_strategy`, but has a higher priority, legacy argument.
             output_sub_dir (str): The folder under the `save_dir` to save the output checkpoint for inference.
@@ -75,6 +76,7 @@ class CheckpointHook(Hook):
                  private_hub: Optional[bool] = True,
                  hub_revision: Optional[str] = DEFAULT_REPOSITORY_REVISION,
                  upload_strategy: Optional[str] = UploadStrategy.cancel,
+                 save_trainer_state: Optional[bool] = True,
                  **kwargs):
         self.interval = interval
         self.save_dir = save_dir
@@ -97,6 +99,7 @@ class CheckpointHook(Hook):
         self.private_hub = private_hub
         self.hub_revision = hub_revision
         self.upload_strategy = upload_strategy
+        self.save_trainer_state = save_trainer_state
         self.tag = -1
         self.is_model_id = None
         self.max_checkpoint_num = None
@@ -219,7 +222,8 @@ class CheckpointHook(Hook):
         checkpoint_path_prefix = os.path.join(self.save_dir, prefix)
         meta = self._create_training_state(trainer)
         self.processor.save_checkpoints(trainer, checkpoint_path_prefix,
-                                        self.output_dir, meta)
+                                        self.output_dir, meta,
+                                        self.save_trainer_state)
         self.save_evaluate_results(trainer)
         self.history_checkpoints.append(checkpoint_path_prefix)
         self._remove_obsolete_checkpoints(trainer)
@@ -399,7 +403,8 @@ class BestCkptSaverHook(CheckpointHook):
         self._best_ckpt_file = checkpoint_path_prefix
         meta = self._create_training_state(trainer)
         self.processor.save_checkpoints(trainer, checkpoint_path_prefix,
-                                        self.output_dir, meta)
+                                        self.output_dir, meta,
+                                        self.save_trainer_state)
         self.save_evaluate_results(trainer)
         self.history_checkpoints.add(checkpoint_path_prefix)
         self._remove_obsolete_checkpoints(trainer)
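For reference, a hedged sketch of how the new flag can be passed when the checkpoint hook is declared in a trainer config; whether the hook entry lives in a `train.hooks` list or a dedicated checkpoint section depends on the model's configuration.json:

```python
# Hypothetical hook entry; only 'save_trainer_state' is the newly added key.
checkpoint_hook_cfg = {
    'type': 'CheckpointHook',
    'interval': 1,
    'save_trainer_state': False,  # drop optimizer/lr_scheduler state from *_trainer_state.pth
}
# e.g. appended to cfg.train.hooks (or merged into the existing checkpoint config)
# before the trainer is built.
```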

View File

@@ -104,7 +104,8 @@ class CheckpointProcessor:
                          trainer,
                          checkpoint_path_prefix,
                          output_dir,
-                         meta=None):
+                         meta=None,
+                         save_optimizers=True):
         """Save the state dict for trainer and model.
         This is a strategic function which can be registered by other hook's function.
@@ -115,13 +116,15 @@ class CheckpointProcessor:
                 like: /tmp/test/epoch_0
             output_dir(`str`): The output dir for inference.
             meta: (`dict`): The meta info needed to be saved into files.
+            save_optimizers: (`bool`): Do save the optimizers state
         """
         model = trainer.unwrap_module(trainer.model)
         _model_file, _train_state_file = self._get_state_file_name(
             checkpoint_path_prefix)
         # Save pth file without model state_dict
-        self.save_trainer_state(trainer, model, _train_state_file, meta)
+        self.save_trainer_state(trainer, model, _train_state_file, meta,
+                                save_optimizers)
         self.save_model_state(model, _model_file)
         self.link(model, _model_file, output_dir)
@@ -175,7 +178,8 @@ class CheckpointProcessor:
                 'changing to copy the bin file, this may use more disk space.')
             shutil.copyfile(src_file, dest_file)
-    def save_trainer_state(self, trainer, model, train_state_file, meta):
+    def save_trainer_state(self, trainer, model, train_state_file, meta,
+                           save_optimizers):
         """Save the trainer state, including optimizer/lr_scheduler's state dict, random states etc.
         Args:
@@ -183,12 +187,13 @@ class CheckpointProcessor:
             model: The model instance.
             train_state_file: The target file name for saving trainer states.
             meta: Some extra meta info.
+            save_optimizers: Save optimizers state or not.
         """
         save_checkpoint(
             model,
             train_state_file,
-            trainer.optimizer,
-            trainer.lr_scheduler,
+            trainer.optimizer if save_optimizers else None,
+            trainer.lr_scheduler if save_optimizers else None,
             meta=meta,
             with_model=False)

View File

@@ -156,7 +156,8 @@ class DeepspeedProcessor(CheckpointProcessor, LrSchedulerProcessor,
                          trainer,
                          checkpoint_path_prefix,
                          output_dir,
-                         meta=None):
+                         meta=None,
+                         save_optimizers=True):
         model = trainer.unwrap_module(trainer.model)
         _train_state_file = checkpoint_path_prefix + self.rank_name(
         ) + CheckpointProcessor.TRAINER_STATE_SUFFIX

View File

@@ -57,7 +57,8 @@ class MpuProcessor(CheckpointProcessor):
                          trainer,
                          checkpoint_path_prefix,
                          output_dir,
-                         meta=None):
+                         meta=None,
+                         save_optimizers=True):
         model = trainer.unwrap_module(trainer.model)
         _train_state_file = checkpoint_path_prefix + self.rank_name(
         ) + CheckpointProcessor.TRAINER_STATE_SUFFIX
@@ -65,8 +66,8 @@ class MpuProcessor(CheckpointProcessor):
         save_checkpoint(
             model,
             _train_state_file,
-            trainer.optimizer,
-            trainer.lr_scheduler,
+            trainer.optimizer if save_optimizers else None,
+            trainer.lr_scheduler if save_optimizers else None,
             meta=meta,
             with_model=False)

View File

@@ -41,7 +41,8 @@ class DreamboothCheckpointProcessor(CheckpointProcessor):
trainer, trainer,
checkpoint_path_prefix, checkpoint_path_prefix,
output_dir, output_dir,
meta=None): meta=None,
save_optimizers=True):
"""Save the state dict for dreambooth model. """Save the state dict for dreambooth model.
""" """
pipeline_args = {} pipeline_args = {}

View File

@@ -21,7 +21,8 @@ class LoraDiffusionCheckpointProcessor(CheckpointProcessor):
trainer, trainer,
checkpoint_path_prefix, checkpoint_path_prefix,
output_dir, output_dir,
meta=None): meta=None,
save_optimizers=True):
"""Save the state dict for lora tune model. """Save the state dict for lora tune model.
""" """
trainer.model.unet = trainer.model.unet.to(torch.float32) trainer.model.unet = trainer.model.unet.to(torch.float32)

View File

@@ -168,3 +168,9 @@ TAMING_IMPORT_ERROR = """
{0} requires the taming-transformers library but it was not found in your environment. You can install it with pip: {0} requires the taming-transformers library but it was not found in your environment. You can install it with pip:
`pip install taming-transformers-rom1504` `pip install taming-transformers-rom1504`
""" """
# docstyle-ignore
XFORMERS_IMPORT_ERROR = """
{0} requires the xformers library but it was not found in your environment. You can install it with pip:
`pip install xformers>=0.0.17`
"""

View File

@@ -306,6 +306,7 @@ REQUIREMENTS_MAAPING = OrderedDict([
('mpi4py', (is_package_available('mpi4py'), MPI4PY_IMPORT_ERROR)), ('mpi4py', (is_package_available('mpi4py'), MPI4PY_IMPORT_ERROR)),
('open_clip', (is_package_available('open_clip'), OPENCLIP_IMPORT_ERROR)), ('open_clip', (is_package_available('open_clip'), OPENCLIP_IMPORT_ERROR)),
('taming', (is_package_available('taming'), TAMING_IMPORT_ERROR)), ('taming', (is_package_available('taming'), TAMING_IMPORT_ERROR)),
('xformers', (is_package_available('xformers'), XFORMERS_IMPORT_ERROR)),
]) ])
SYSTEM_PACKAGE = set(['os', 'sys', 'typing']) SYSTEM_PACKAGE = set(['os', 'sys', 'typing'])
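With both additions in place, callers can look up xformers like any other optional dependency: the mapping pairs the availability flag with the error template added above. A rough sketch of how such an entry might be consumed, assuming a hypothetical helper that guards a memory-efficient-attention path (the function name and pipeline argument are illustrative, not part of this diff):
# Hypothetical helper, shown only to illustrate how the new mapping entry is consumed.
def enable_memory_efficient_attention(pipeline):
    available, error_template = REQUIREMENTS_MAAPING['xformers']
    if not available:
        # The template's {0} slot is filled with the caller's name.
        raise ImportError(error_template.format('enable_memory_efficient_attention'))
    # diffusers pipelines expose this switch when xformers is installed.
    pipeline.enable_xformers_memory_efficient_attention()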

View File

@@ -25,7 +25,7 @@ def get_logger(log_file: Optional[str] = None,
logger_name = __name__.split('.')[0] logger_name = __name__.split('.')[0]
logger = logging.getLogger(logger_name) logger = logging.getLogger(logger_name)
logger.propagate = False
if logger_name in init_loggers: if logger_name in init_loggers:
add_file_handler_if_needed(logger, log_file, file_mode, log_level) add_file_handler_if_needed(logger, log_file, file_mode, log_level)
return logger return logger
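Setting logger.propagate = False keeps records from bubbling up to the root logger, which otherwise re-emits every message once the root logger has its own handler (for example after a user calls logging.basicConfig). A standalone illustration of the duplicate-output problem this avoids, not modelscope code:
import logging

logging.basicConfig(level=logging.INFO)      # gives the root logger a handler
logger = logging.getLogger('modelscope')
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())   # the library's own handler

logger.info('hello')    # printed twice: once by the library handler, once via the root logger
logger.propagate = False
logger.info('world')    # printed once: propagation to the root logger is off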

View File

@@ -1,7 +1,7 @@
accelerate accelerate
cloudpickle cloudpickle
decord>=0.6.0 decord>=0.6.0
diffusers==0.15.0 diffusers==0.18.0
fairseq fairseq
ftfy>=6.0.3 ftfy>=6.0.3
librosa==0.9.2 librosa==0.9.2

View File

@@ -21,7 +21,7 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
[flake8] [flake8]
max-line-length = 120 max-line-length = 120
select = B,C,E,F,P,T4,W,B9 select = B,C,E,F,P,T4,W,B9
ignore = F401,F405,F821,W503,E251 ignore = F401,F403,F405,F821,W503,E251
exclude = docs/src,*.pyi,.git exclude = docs/src,*.pyi,.git
[darglint] [darglint]

View File

@@ -35,7 +35,7 @@ class TestLoraDiffusionTrainer(unittest.TestCase):
shutil.rmtree(self.tmp_dir) shutil.rmtree(self.tmp_dir)
super().tearDown() super().tearDown()
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_lora_diffusion_train(self): def test_lora_diffusion_train(self):
model_id = 'AI-ModelScope/stable-diffusion-v1-5' model_id = 'AI-ModelScope/stable-diffusion-v1-5'
model_revision = 'v1.0.9' model_revision = 'v1.0.9'
@@ -67,7 +67,7 @@ class TestLoraDiffusionTrainer(unittest.TestCase):
results_files = os.listdir(self.tmp_dir) results_files = os.listdir(self.tmp_dir)
self.assertIn(f'{trainer.timestamp}.log.json', results_files) self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_lora_diffusion_eval(self): def test_lora_diffusion_eval(self):
model_id = 'AI-ModelScope/stable-diffusion-v1-5' model_id = 'AI-ModelScope/stable-diffusion-v1-5'
model_revision = 'v1.0.9' model_revision = 'v1.0.9'