diff --git a/modelscope/models/nlp/llama/text_generation.py b/modelscope/models/nlp/llama/text_generation.py
index 0a325df2..dab0f757 100644
--- a/modelscope/models/nlp/llama/text_generation.py
+++ b/modelscope/models/nlp/llama/text_generation.py
@@ -33,10 +33,12 @@ from .backbone import MsModelMixin
def get_chat_prompt(system: str, text: str, history: List[Tuple[str, str]],
max_length: int, tokenizer):
system_prompt = f'[INST] <<SYS>>\n{system}\n<</SYS>>\n\n'
- system_ids = tokenizer(system_prompt, return_tensors='pt').input_ids
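+ # add_special_tokens=False keeps BOS/EOS out of each segment; the pieces are concatenated into one prompt later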
+ system_ids = tokenizer(
+ system_prompt, add_special_tokens=False, return_tensors='pt').input_ids
text_prompt = f'{text.strip()} [/INST]'
- text_ids = tokenizer(text_prompt, return_tensors='pt').input_ids
+ text_ids = tokenizer(
+ text_prompt, add_special_tokens=False, return_tensors='pt').input_ids
prompt_length = system_ids.shape[-1] + text_ids.shape[-1]
if prompt_length > max_length:
@@ -51,7 +53,9 @@ def get_chat_prompt(system: str, text: str, history: List[Tuple[str, str]],
assert isinstance(user, str)
assert isinstance(bot, str)
round_prompt = f'{user.strip()} [/INST] {bot.strip()} </s><s>[INST] '
- round_ids = tokenizer(round_prompt, return_tensors='pt').input_ids
+ round_ids = tokenizer(
+ round_prompt, add_special_tokens=False,
+ return_tensors='pt').input_ids
if prompt_length + round_ids.shape[-1] > max_length:
# excess history should not be appended to the prompt
break
diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py
index 8fa9fc24..23473007 100644
--- a/modelscope/pipelines/nlp/__init__.py
+++ b/modelscope/pipelines/nlp/__init__.py
@@ -27,7 +27,9 @@ if TYPE_CHECKING:
from .translation_quality_estimation_pipeline import TranslationQualityEstimationPipeline
from .text_error_correction_pipeline import TextErrorCorrectionPipeline
from .word_alignment_pipeline import WordAlignmentPipeline
- from .text_generation_pipeline import TextGenerationPipeline, TextGenerationT5Pipeline, SeqGPTPipeline
+ from .text_generation_pipeline import TextGenerationPipeline, TextGenerationT5Pipeline, \
+ SeqGPTPipeline, ChatGLM6bTextGenerationPipeline, ChatGLM6bV2TextGenerationPipeline, \
+ QWenChatPipeline, QWenTextGenerationPipeline, Llama2TaskPipeline
from .fid_dialogue_pipeline import FidDialoguePipeline
from .token_classification_pipeline import TokenClassificationPipeline
from .translation_pipeline import TranslationPipeline
@@ -80,7 +82,10 @@ else:
'word_alignment_pipeline': ['WordAlignmentPipeline'],
'text_generation_pipeline': [
'TextGenerationPipeline', 'TextGenerationT5Pipeline',
- 'SeqGPTPipeline'
+ 'ChatGLM6bTextGenerationPipeline',
+ 'ChatGLM6bV2TextGenerationPipeline', 'QWenChatPipeline',
+ 'QWenTextGenerationPipeline', 'SeqGPTPipeline',
+ 'Llama2TaskPipeline'
],
'fid_dialogue_pipeline': ['FidDialoguePipeline'],
'token_classification_pipeline': ['TokenClassificationPipeline'],
diff --git a/modelscope/pipelines/nlp/llama2_text_generation_pipeline.py b/modelscope/pipelines/nlp/llama2_text_generation_pipeline.py
deleted file mode 100644
index d366ec9c..00000000
--- a/modelscope/pipelines/nlp/llama2_text_generation_pipeline.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# Copyright (c) 2022 Zhipu.AI
-from typing import Any, Dict, Union
-
-import torch
-
-from modelscope import Model, snapshot_download
-from modelscope.metainfo import Pipelines, Preprocessors
-from modelscope.models.nlp.llama2 import Llama2Tokenizer
-from modelscope.pipelines.base import Pipeline
-from modelscope.pipelines.builder import PIPELINES
-from modelscope.pipelines.nlp.text_generation_pipeline import \
- TextGenerationPipeline
-from modelscope.preprocessors import Preprocessor
-from modelscope.utils.constant import Fields, Tasks
-
-
-@PIPELINES.register_module(
- Tasks.text_generation,
- module_name=Pipelines.llama2_text_generation_pipeline)
-class Llama2TaskPipeline(TextGenerationPipeline):
-
- def __init__(self,
- model: Union[Model, str],
- preprocessor: Preprocessor = None,
- config_file: str = None,
- device: str = 'gpu',
- auto_collate=True,
- **kwargs):
- """Use `model` and `preprocessor` to create a generation pipeline for prediction.
-
- Args:
- model (str or Model): Supply either a local model dir which supported the text generation task,
- or a model id from the model hub, or a torch model instance.
- preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for
- the model if supplied.
- kwargs (dict, `optional`):
- Extra kwargs passed into the preprocessor's constructor.
- Examples:
- >>> from modelscope.utils.constant import Tasks
- >>> import torch
- >>> from modelscope.pipelines import pipeline
- >>> from modelscope import snapshot_download, Model
- >>> model_dir = snapshot_download("modelscope/Llama-2-13b-chat-ms",
- >>> ignore_file_pattern = [r'\\w+\\.safetensors'])
- >>> pipe = pipeline(task=Tasks.text_generation, model=model_dir, device_map='auto',
- >>> torch_dtype=torch.float16)
- >>> inputs="咖啡的作用是什么?"
- >>> result = pipe(inputs,max_length=200, do_sample=True, top_p=0.85,
- >>> temperature=1.0, repetition_penalty=1., eos_token_id=2, bos_token_id=1, pad_token_id=0)
- >>> print(result['text'])
-
- To view other examples plese check tests/pipelines/test_llama2_text_generation_pipeline.py.
- """
- self.model = Model.from_pretrained(
- model, device_map='auto', torch_dtype=torch.float16)
- self.tokenizer = Llama2Tokenizer.from_pretrained(model)
- super().__init__(model=self.model, **kwargs)
-
- def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]:
- return inputs
-
- def _sanitize_parameters(self, **pipeline_parameters):
- return {}, pipeline_parameters, {}
-
- def forward(self,
- inputs,
- max_length=2048,
- do_sample=True,
- top_p=0.85,
- temperature=1.0,
- repetition_penalty=1.,
- eos_token_id=2,
- bos_token_id=1,
- pad_token_id=0,
- **forward_params) -> Dict[str, Any]:
- output = {}
- inputs = self.tokenizer(inputs, return_tensors='pt')
- generate_ids = self.model.generate(
- inputs.input_ids.to('cuda'),
- max_length=max_length,
- do_sample=do_sample,
- top_p=top_p,
- temperature=temperature,
- repetition_penalty=repetition_penalty,
- eos_token_id=eos_token_id,
- bos_token_id=bos_token_id,
- pad_token_id=pad_token_id,
- **forward_params)
- out = self.tokenizer.batch_decode(
- generate_ids,
- skip_special_tokens=True,
- clean_up_tokenization_spaces=False)[0]
- output['text'] = out
- return output
-
- # format the outputs from pipeline
- def postprocess(self, input, **kwargs) -> Dict[str, Any]:
- return input
diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py
index 1c37a5b4..2a5b4f7e 100644
--- a/modelscope/pipelines/nlp/text_generation_pipeline.py
+++ b/modelscope/pipelines/nlp/text_generation_pipeline.py
@@ -2,7 +2,6 @@
# Copyright (c) 2022 Zhipu.AI
import os
-import re
from typing import Any, Dict, Optional, Union
import torch
@@ -25,7 +24,8 @@ from modelscope.utils.torch_utils import is_on_same_device
__all__ = [
'TextGenerationPipeline', 'TextGenerationT5Pipeline',
'ChatGLM6bTextGenerationPipeline', 'ChatGLM6bV2TextGenerationPipeline',
- 'QWenChatPipeline', 'QWenTextGenerationPipeline', 'SeqGPTPipeline'
+ 'QWenChatPipeline', 'QWenTextGenerationPipeline', 'SeqGPTPipeline',
+ 'Llama2TaskPipeline'
]
@@ -199,7 +199,7 @@ class ChatGLM6bTextGenerationPipeline(Pipeline):
use_bf16=False,
**kwargs):
from modelscope.models.nlp.chatglm.text_generation import (
- ChatGLMConfig, ChatGLMForConditionalGeneration)
+ ChatGLMForConditionalGeneration)
if isinstance(model, str):
model_dir = snapshot_download(
model) if not os.path.exists(model) else model
@@ -427,7 +427,6 @@ class QWenTextGenerationPipeline(Pipeline):
class SeqGPTPipeline(Pipeline):
def __init__(self, model: Union[Model, str], **kwargs):
- from modelscope.models.nlp import BloomForTextGeneration
from modelscope.utils.hf_util import AutoTokenizer
if isinstance(model, str):
@@ -468,3 +467,89 @@ class SeqGPTPipeline(Pipeline):
# format the outputs from pipeline
def postprocess(self, input, **kwargs) -> Dict[str, Any]:
return input
+
+
+@PIPELINES.register_module(
+ Tasks.text_generation,
+ module_name=Pipelines.llama2_text_generation_pipeline)
+class Llama2TaskPipeline(TextGenerationPipeline):
+
+ def __init__(self,
+ model: Union[Model, str],
+ preprocessor: Preprocessor = None,
+ config_file: str = None,
+ device: str = 'gpu',
+ auto_collate=True,
+ **kwargs):
+ """Use `model` and `preprocessor` to create a generation pipeline for prediction.
+
+ Args:
+ model (str or Model): Supply either a local model dir that supports the text generation task,
+ a model id from the model hub, or a torch model instance.
+ preprocessor (Preprocessor): An optional preprocessor instance; please make sure the preprocessor fits
+ the model if supplied.
+ kwargs (dict, `optional`):
+ Extra kwargs passed into the preprocessor's constructor.
+ Examples:
+ >>> from modelscope.utils.constant import Tasks
+ >>> import torch
+ >>> from modelscope.pipelines import pipeline
+ >>> from modelscope import snapshot_download, Model
+ >>> model_dir = snapshot_download("modelscope/Llama-2-13b-chat-ms",
+ >>> ignore_file_pattern = [r'\w+\.safetensors'])
+ >>> pipe = pipeline(task=Tasks.text_generation, model=model_dir, device_map='auto',
+ >>> torch_dtype=torch.float16)
+ >>> inputs="咖啡的作用是什么?"
+ >>> result = pipe(inputs,max_length=200, do_sample=True, top_p=0.85,
+ >>> temperature=1.0, repetition_penalty=1., eos_token_id=2, bos_token_id=1, pad_token_id=0)
+ >>> print(result['text'])
+
+ To view other examples, please check tests/pipelines/test_llama2_text_generation_pipeline.py.
+ """
+ self.model = Model.from_pretrained(
+ model, device_map='auto', torch_dtype=torch.float16)
+ from modelscope.models.nlp.llama2 import Llama2Tokenizer
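+ # local import: llama2 components are only loaded when this pipeline is actually constructed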
+ self.tokenizer = Llama2Tokenizer.from_pretrained(model)
+ super().__init__(model=self.model, **kwargs)
+
+ def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]:
+ return inputs
+
+ def _sanitize_parameters(self, **pipeline_parameters):
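+ # pass every user kwarg straight through to forward(); no preprocess/postprocess params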
+ return {}, pipeline_parameters, {}
+
+ def forward(self,
+ inputs,
+ max_length=2048,
+ do_sample=True,
+ top_p=0.85,
+ temperature=1.0,
+ repetition_penalty=1.,
+ eos_token_id=2,
+ bos_token_id=1,
+ pad_token_id=0,
+ **forward_params) -> Dict[str, Any]:
+ output = {}
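+ # encode the prompt without the tokenizer's special tokens and run generation on the GPU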
+ inputs = self.tokenizer(
+ inputs, add_special_tokens=False, return_tensors='pt')
+ generate_ids = self.model.generate(
+ inputs.input_ids.to('cuda'),
+ max_length=max_length,
+ do_sample=do_sample,
+ top_p=top_p,
+ temperature=temperature,
+ repetition_penalty=repetition_penalty,
+ eos_token_id=eos_token_id,
+ bos_token_id=bos_token_id,
+ pad_token_id=pad_token_id,
+ **forward_params)
+ out = self.tokenizer.batch_decode(
+ generate_ids,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=False)[0]
+ output['text'] = out
+ return output
+
+ # format the outputs from pipeline
+ def postprocess(self, input, **kwargs) -> Dict[str, Any]:
+ return input