# examples/pytorch/llm/llm_infer.py

# ### Setting up experimental environment.
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import warnings
from dataclasses import dataclass, field
from functools import partial
from typing import List, Optional

import torch
from swift import LoRAConfig, Swift
from transformers import GenerationConfig, TextStreamer
from utils import (DATASET_MAPPING, DEFAULT_PROMPT, MODEL_MAPPING, get_dataset,
                   get_model_tokenizer, inference, parse_args, process_dataset,
                   tokenize_function)

from modelscope import get_logger

# Emitted whenever this module is imported or run: the example has moved to
# the swift repository and this copy is no longer maintained.
warnings.warn(
    'This directory has been migrated to '
    'https://github.com/modelscope/swift/tree/main/examples/pytorch/llm, '
    'and the files in this directory are no longer maintained.',
    category=DeprecationWarning)

# Module-wide logger obtained from modelscope.
logger = get_logger()


@dataclass
class InferArguments:
    """Options for ``llm_infer``, passed to ``parse_args`` by the script.

    ``__post_init__`` fills in ``lora_target_modules`` from ``MODEL_MAPPING``
    when it is left as ``None`` and rejects a ``ckpt_path`` that is not an
    existing file.
    """

    # --- model / checkpoint selection ---
    model_type: str = field(
        default='qwen-7b', metadata=dict(choices=list(MODEL_MAPPING.keys())))
    sft_type: str = field(
        default='lora', metadata=dict(choices=['lora', 'full']))
    ckpt_path: str = '/path/to/your/iter_xxx.pth'
    eval_human: bool = False  # False: eval test_dataset
    ignore_args_error: bool = False  # True: notebook compatibility

    # --- dataset used when eval_human is False ---
    dataset: str = field(
        default='alpaca-en,alpaca-zh',
        metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})
    dataset_seed: int = 42
    dataset_sample: int = 20000  # -1: all dataset
    dataset_test_size: float = 0.01
    prompt: str = DEFAULT_PROMPT
    max_length: Optional[int] = 2048

    # --- LoRA hyper-parameters (forwarded to LoRAConfig) ---
    lora_target_modules: Optional[List[str]] = None
    lora_rank: int = 8
    lora_alpha: int = 32
    lora_dropout_p: float = 0.1

    # --- generation settings (forwarded to GenerationConfig) ---
    max_new_tokens: int = 512
    temperature: float = 0.9
    top_k: int = 50
    top_p: float = 0.9

    def __post_init__(self):
        """Resolve the default LoRA targets and validate ``ckpt_path``.

        Raises:
            ValueError: if ``ckpt_path`` does not point to an existing file.
        """
        if self.lora_target_modules is None:
            # Fall back to the per-model default stored under 'lora_TM'.
            model_entry = MODEL_MAPPING[self.model_type]
            self.lora_target_modules = model_entry['lora_TM']

        ckpt_is_file = os.path.isfile(self.ckpt_path)
        if not ckpt_is_file:
            raise ValueError(
                f'Please enter a valid ckpt_path: {self.ckpt_path}')


def llm_infer(args: InferArguments) -> None:
    """Load a fine-tuned checkpoint and run generation with it.

    The base model and tokenizer come from ``get_model_tokenizer``. For
    ``sft_type == 'lora'`` the model is first wrapped with
    ``Swift.prepare_model``; in both modes the state dict stored at
    ``args.ckpt_path`` is then loaded into the model. Generation itself is
    delegated to the ``inference`` helper, which receives a ``TextStreamer``
    and a sampling ``GenerationConfig``.

    With ``args.eval_human`` set, instructions are read from stdin in a loop
    until stdin is closed (EOF). Otherwise up to 10 rows of the test split
    are generated and each row's reference ``output`` is printed after the
    generation.

    Args:
        args: Parsed inference options.

    Raises:
        ValueError: if ``args.sft_type`` is neither ``'lora'`` nor ``'full'``.
    """
    # ### Loading Model and Tokenizer
    support_bf16 = torch.cuda.is_bf16_supported()
    if not support_bf16:
        # Only a warning: the model is still requested in bfloat16 below.
        logger.warning(f'support_bf16: {support_bf16}')

    kwargs = {'low_cpu_mem_usage': True, 'device_map': 'auto'}
    model, tokenizer, _ = get_model_tokenizer(
        args.model_type, torch_dtype=torch.bfloat16, **kwargs)

    # ### Preparing lora
    if args.sft_type == 'lora':
        lora_config = LoRAConfig(
            target_modules=args.lora_target_modules,
            r=args.lora_rank,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout_p,
            pretrained_weights=args.ckpt_path)
        logger.info(f'lora_config: {lora_config}')
        model = Swift.prepare_model(model, lora_config)
    elif args.sft_type != 'full':
        raise ValueError(f'args.sft_type: {args.sft_type}')

    # Both fine-tuning modes restore weights from the same checkpoint file.
    # NOTE(security): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    state_dict = torch.load(args.ckpt_path, map_location='cpu')
    model.load_state_dict(state_dict)

    # ### Inference
    tokenize_func = partial(
        tokenize_function,
        tokenizer=tokenizer,
        prompt=args.prompt,
        max_length=args.max_length)
    streamer = TextStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_config = GenerationConfig(
        max_new_tokens=args.max_new_tokens,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id)
    logger.info(f'generation_config: {generation_config}')

    if args.eval_human:
        while True:
            try:
                instruction = input('<<< ')
            except EOFError:
                # stdin was closed (Ctrl-D or end of piped input): stop
                # prompting instead of failing with a traceback.
                break
            data = {'instruction': instruction}
            input_ids = tokenize_func(data)['input_ids']
            inference(input_ids, model, tokenizer, streamer, generation_config)
            print('-' * 80)
    else:
        dataset = get_dataset(args.dataset.split(','))
        _, test_dataset = process_dataset(dataset, args.dataset_test_size,
                                          args.dataset_sample,
                                          args.dataset_seed)
        # Clamp the preview size: selecting indices past the end of a small
        # test split would raise an out-of-range error.
        num_samples = min(10, len(test_dataset))
        mini_test_dataset = test_dataset.select(range(num_samples))
        del dataset
        for data in mini_test_dataset:
            # Keep the reference answer for display, but blank it in the row
            # before it is passed to tokenize_func.
            output = data['output']
            data['output'] = None
            input_ids = tokenize_func(data)['input_ids']
            inference(input_ids, model, tokenizer, streamer, generation_config)
            print()
            print(f'[LABELS]{output}')
            print('-' * 80)
            # input('next[ENTER]')


if __name__ == '__main__':
    # Script entry point: parse the options, then run inference.
    args, remaining_argv = parse_args(InferArguments)
    if remaining_argv:
        # Left-over arguments are fatal unless the user opted into ignoring
        # them (ignore_args_error, described as notebook compatibility).
        if not args.ignore_args_error:
            raise ValueError(f'remaining_argv: {remaining_argv}')
        logger.warning(f'remaining_argv: {remaining_argv}')
    llm_infer(args)
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00			`# ### Setting up experimental environment.`
add qwen 7b base and chat 添加QWen 7b base模型和chat模型及相关pipelines Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13482235 * add qwen 7b base and chat * fix logger * update examples, lint test * add unittest for qwen base and chat * rename qwen to qwen-7b * resolve imports and add a registry to text-generation * reset load model from pretrained * fix precheck * skip qwen test case now * remove strange file 2023-08-02 09:25:21 +08:00			`import os`
			`# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`import warnings`
add qwen 7b base and chat 添加QWen 7b base模型和chat模型及相关pipelines Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13482235 * add qwen 7b base and chat * fix logger * update examples, lint test * add unittest for qwen base and chat * rename qwen to qwen-7b * resolve imports and add a registry to text-generation * reset load model from pretrained * fix precheck * skip qwen test case now * remove strange file 2023-08-02 09:25:21 +08:00			`from dataclasses import dataclass, field`
			`from functools import partial`
			`from typing import List, Optional`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00
add qwen 7b base and chat 添加QWen 7b base模型和chat模型及相关pipelines Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13482235 * add qwen 7b base and chat * fix logger * update examples, lint test * add unittest for qwen base and chat * rename qwen to qwen-7b * resolve imports and add a registry to text-generation * reset load model from pretrained * fix precheck * skip qwen test case now * remove strange file 2023-08-02 09:25:21 +08:00			`import torch`
Replace code with swift wheel (#467) 2023-08-29 17:27:18 +08:00			`from swift import LoRAConfig, Swift`
add qwen 7b base and chat 添加QWen 7b base模型和chat模型及相关pipelines Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13482235 * add qwen 7b base and chat * fix logger * update examples, lint test * add unittest for qwen base and chat * rename qwen to qwen-7b * resolve imports and add a registry to text-generation * reset load model from pretrained * fix precheck * skip qwen test case now * remove strange file 2023-08-02 09:25:21 +08:00			`from transformers import GenerationConfig, TextStreamer`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`from utils import (DATASET_MAPPING, DEFAULT_PROMPT, MODEL_MAPPING, get_dataset,`
add qwen 7b base and chat 添加QWen 7b base模型和chat模型及相关pipelines Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13482235 * add qwen 7b base and chat * fix logger * update examples, lint test * add unittest for qwen base and chat * rename qwen to qwen-7b * resolve imports and add a registry to text-generation * reset load model from pretrained * fix precheck * skip qwen test case now * remove strange file 2023-08-02 09:25:21 +08:00			`get_model_tokenizer, inference, parse_args, process_dataset,`
			`tokenize_function)`

			`from modelscope import get_logger`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`warnings.warn(`
			`'This directory has been migrated to '`
			`'https://github.com/modelscope/swift/tree/main/examples/pytorch/llm, '`
			`'and the files in this directory are no longer maintained.',`
			`DeprecationWarning)`

add qwen 7b base and chat 添加QWen 7b base模型和chat模型及相关pipelines Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13482235 * add qwen 7b base and chat * fix logger * update examples, lint test * add unittest for qwen base and chat * rename qwen to qwen-7b * resolve imports and add a registry to text-generation * reset load model from pretrained * fix precheck * skip qwen test case now * remove strange file 2023-08-02 09:25:21 +08:00			`logger = get_logger()`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00

			`@dataclass`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`class InferArguments:`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00			`model_type: str = field(`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`default='qwen-7b', metadata={'choices': list(MODEL_MAPPING.keys())})`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`sft_type: str = field(`
			`default='lora', metadata={'choices': ['lora', 'full']})`
ckpt output directory ignore .safetensors (#410) ckpt output file ignore .safetensors update 2023-07-25 19:27:11 +08:00			`ckpt_path: str = '/path/to/your/iter_xxx.pth'`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00			`eval_human: bool = False # False: eval test_dataset`
fix checkpoint, same device bug (#427) 2023-07-29 00:06:27 +08:00			`ignore_args_error: bool = False # True: notebook compatibility`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00
			`dataset: str = field(`
			`default='alpaca-en,alpaca-zh',`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`metadata={'help': f'dataset choices: {list(DATASET_MAPPING.keys())}'})`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`dataset_seed: int = 42`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`dataset_sample: int = 20000 # -1: all dataset`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`dataset_test_size: float = 0.01`
			`prompt: str = DEFAULT_PROMPT`
			`max_length: Optional[int] = 2048`
ckpt output directory ignore .safetensors (#410) ckpt output file ignore .safetensors update 2023-07-25 19:27:11 +08:00
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00			`lora_target_modules: Optional[List[str]] = None`
			`lora_rank: int = 8`
			`lora_alpha: int = 32`
			`lora_dropout_p: float = 0.1`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00			`max_new_tokens: int = 512`
			`temperature: float = 0.9`
			`top_k: int = 50`
			`top_p: float = 0.9`

			`def __post_init__(self):`
			`if self.lora_target_modules is None:`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`self.lora_target_modules = MODEL_MAPPING[`
			`self.model_type]['lora_TM']`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00
ckpt output directory ignore .safetensors (#410) ckpt output file ignore .safetensors update 2023-07-25 19:27:11 +08:00			`if not os.path.isfile(self.ckpt_path):`
			`raise ValueError(`
			`f'Please enter a valid ckpt_path: {self.ckpt_path}')`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00

Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`def llm_infer(args: InferArguments) -> None:`
			`# ### Loading Model and Tokenizer`
			`support_bf16 = torch.cuda.is_bf16_supported()`
			`if not support_bf16:`
			`logger.warning(f'support_bf16: {support_bf16}')`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00
			`kwargs = {'low_cpu_mem_usage': True, 'device_map': 'auto'}`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`model, tokenizer, _ = get_model_tokenizer(`
add readme and warning (#462) * add readme and warning * fix bug * update * update readme 2023-08-11 14:55:24 +08:00			`args.model_type, torch_dtype=torch.bfloat16, **kwargs)`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`# ### Preparing lora`
			`if args.sft_type == 'lora':`
			`lora_config = LoRAConfig(`
Replace code with swift wheel (#467) 2023-08-29 17:27:18 +08:00			`target_modules=args.lora_target_modules,`
			`r=args.lora_rank,`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`lora_alpha=args.lora_alpha,`
			`lora_dropout=args.lora_dropout_p,`
ckpt output directory ignore .safetensors (#410) ckpt output file ignore .safetensors update 2023-07-25 19:27:11 +08:00			`pretrained_weights=args.ckpt_path)`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`logger.info(f'lora_config: {lora_config}')`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`model = Swift.prepare_model(model, lora_config)`
Replace code with swift wheel (#467) 2023-08-29 17:27:18 +08:00			`state_dict = torch.load(args.ckpt_path, map_location='cpu')`
			`model.load_state_dict(state_dict)`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`elif args.sft_type == 'full':`
ckpt output directory ignore .safetensors (#410) ckpt output file ignore .safetensors update 2023-07-25 19:27:11 +08:00			`state_dict = torch.load(args.ckpt_path, map_location='cpu')`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`model.load_state_dict(state_dict)`
			`else:`
			`raise ValueError(f'args.sft_type: {args.sft_type}')`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`# ### Inference`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`tokenize_func = partial(`
			`tokenize_function,`
			`tokenizer=tokenizer,`
			`prompt=args.prompt,`
			`max_length=args.max_length)`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`streamer = TextStreamer(`
			`tokenizer, skip_prompt=True, skip_special_tokens=True)`
			`generation_config = GenerationConfig(`
			`max_new_tokens=args.max_new_tokens,`
			`temperature=args.temperature,`
			`top_k=args.top_k,`
			`top_p=args.top_p,`
			`do_sample=True,`
fix copytree python37 bug (#464) * fix copytree python37 bug * add copytree_py37 function 2023-08-14 11:45:33 +08:00			`pad_token_id=tokenizer.pad_token_id,`
			`eos_token_id=tokenizer.eos_token_id)`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`logger.info(f'generation_config: {generation_config}')`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`if args.eval_human:`
			`while True:`
			`instruction = input('<<< ')`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`data = {'instruction': instruction}`
			`input_ids = tokenize_func(data)['input_ids']`
			`inference(input_ids, model, tokenizer, streamer, generation_config)`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`print('-' * 80)`
			`else:`
fix checkpoint, same device bug (#427) 2023-07-29 00:06:27 +08:00			`dataset = get_dataset(args.dataset.split(','))`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`_, test_dataset = process_dataset(dataset, args.dataset_test_size,`
			`args.dataset_sample,`
			`args.dataset_seed)`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`mini_test_dataset = test_dataset.select(range(10))`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`del dataset`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`for data in mini_test_dataset:`
			`output = data['output']`
			`data['output'] = None`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`input_ids = tokenize_func(data)['input_ids']`
			`inference(input_ids, model, tokenizer, streamer, generation_config)`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`print()`
			`print(f'[LABELS]{output}')`
			`print('-' * 80)`
			`# input('next[ENTER]')`
support llama2 (#393) * Unify sft and infer code into a single file * update llama2 sft infer 2023-07-19 17:34:27 +08:00

Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`if __name__ == '__main__':`
add qwen 7b base and chat 添加QWen 7b base模型和chat模型及相关pipelines Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/13482235 * add qwen 7b base and chat * fix logger * update examples, lint test * add unittest for qwen base and chat * rename qwen to qwen-7b * resolve imports and add a registry to text-generation * reset load model from pretrained * fix precheck * skip qwen test case now * remove strange file 2023-08-02 09:25:21 +08:00			`args, remaining_argv = parse_args(InferArguments)`
support openbuddy-llama2-13b (#416) 2023-07-26 18:12:55 +08:00			`if len(remaining_argv) > 0:`
			`if args.ignore_args_error:`
			`logger.warning(f'remaining_argv: {remaining_argv}')`
			`else:`
			`raise ValueError(f'remaining_argv: {remaining_argv}')`
Added full parameter sft to llm (#402) * Optimized code * update parse_args * fix get_logger bug * update parse_args * Added full parameter fine-tuning * Add support_bf16 warning * Modify the code format and fix bugs 2023-07-24 15:52:09 +08:00			`llm_infer(args)`