# Copyright (c) Alibaba, Inc. and its affiliates.

import uuid
from typing import Any, Dict, Union

from transformers import AutoTokenizer

from ..metainfo import Models, Preprocessors
from ..utils.constant import Fields, InputFields
from ..utils.type_assert import type_assert
from .base import Preprocessor
from .builder import PREPROCESSORS

__all__ = [
    'Tokenize', 'SequenceClassificationPreprocessor',
    'TextGenerationPreprocessor', 'ZeroShotClassificationPreprocessor',
    'TokenClassifcationPreprocessor', 'NLIPreprocessor',
    'SentimentClassificationPreprocessor', 'FillMaskPreprocessor'
]


@PREPROCESSORS.register_module(Fields.nlp)
class Tokenize(Preprocessor):

    def __init__(self, tokenizer_name) -> None:
        self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
        if isinstance(data, str):
            data = {InputFields.text: data}
        token_dict = self._tokenizer(data[InputFields.text])
        data.update(token_dict)
        return data
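
# Illustrative usage sketch (not part of the module). The tokenizer name
# 'bert-base-uncased' below is only an assumption; any name accepted by
# transformers' AutoTokenizer.from_pretrained works the same way:
#
#     tokenize = Tokenize('bert-base-uncased')
#     result = tokenize('you are so handsome.')
#     # result keeps the original text and gains the tokenizer outputs such as
#     # 'input_ids', 'token_type_ids' and 'attention_mask'.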


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.nli_tokenizer)
class NLIPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data using the vocab.txt file under the `model_dir` path.

        Args:
            model_dir (str): path to the model directory
        """
        super().__init__(*args, **kwargs)

        from sofa import SbertTokenizer
        self.model_dir: str = model_dir
        self.first_sequence: str = kwargs.pop('first_sequence',
                                              'first_sequence')
        self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
        self.sequence_length = kwargs.pop('sequence_length', 128)

        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, tuple)
    def __call__(self, data: tuple) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (tuple): a (sentence1, sentence2) pair
                sentence1 (str): a sentence
                    Example:
                        'you are so handsome.'
                sentence2 (str): a sentence
                    Example:
                        'you are so beautiful.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        sentence1, sentence2 = data
        new_data = {
            self.first_sequence: sentence1,
            self.second_sequence: sentence2
        }

        # preprocess the data for the model input
        rst = {
            'id': [],
            'input_ids': [],
            'attention_mask': [],
            'token_type_ids': []
        }

        max_seq_length = self.sequence_length

        text_a = new_data[self.first_sequence]
        text_b = new_data[self.second_sequence]
        feature = self.tokenizer(
            text_a,
            text_b,
            padding=False,
            truncation=True,
            max_length=max_seq_length)

        rst['id'].append(new_data.get('id', str(uuid.uuid4())))
        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
        rst['token_type_ids'].append(feature['token_type_ids'])

        return rst
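
# Illustrative usage sketch (not part of the module). The directory below is a
# placeholder for a model directory containing an SbertTokenizer vocabulary:
#
#     preprocessor = NLIPreprocessor('/path/to/nli_model')
#     inputs = preprocessor(('you are so handsome.', 'you are so beautiful.'))
#     # inputs is a dict of single-element lists keyed by 'id', 'input_ids',
#     # 'attention_mask' and 'token_type_ids'.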


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.sen_cls_tokenizer)
class SentimentClassificationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data using the vocab.txt file under the `model_dir` path.

        Args:
            model_dir (str): path to the model directory
        """
        super().__init__(*args, **kwargs)

        from sofa import SbertTokenizer
        self.model_dir: str = model_dir
        self.first_sequence: str = kwargs.pop('first_sequence',
                                              'first_sequence')
        self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
        self.sequence_length = kwargs.pop('sequence_length', 128)

        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        new_data = {self.first_sequence: data}

        # preprocess the data for the model input
        rst = {
            'id': [],
            'input_ids': [],
            'attention_mask': [],
            'token_type_ids': []
        }

        max_seq_length = self.sequence_length

        text_a = new_data[self.first_sequence]
        text_b = new_data.get(self.second_sequence, None)
        feature = self.tokenizer(
            text_a,
            text_b,
            padding='max_length',
            truncation=True,
            max_length=max_seq_length)

        rst['id'].append(new_data.get('id', str(uuid.uuid4())))
        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
        rst['token_type_ids'].append(feature['token_type_ids'])

        return rst


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.bert_seq_cls_tokenizer)
class SequenceClassificationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data using the vocab.txt file under the `model_dir` path.

        Args:
            model_dir (str): path to the model directory
        """
        super().__init__(*args, **kwargs)

        from easynlp.modelzoo import AutoTokenizer
        self.model_dir: str = model_dir
        self.first_sequence: str = kwargs.pop('first_sequence',
                                              'first_sequence')
        self.second_sequence = kwargs.pop('second_sequence', 'second_sequence')
        self.sequence_length = kwargs.pop('sequence_length', 128)

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, (str, tuple, Dict))
    def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]:
        """Process the raw input data, given in one of three forms.

        Args:
            data (str, tuple or Dict):
                sentence1 (str): a sentence
                    Example:
                        'you are so handsome.'
                or
                (sentence1, sentence2)
                    sentence1 (str): a sentence
                        Example:
                            'you are so handsome.'
                    sentence2 (str): a sentence
                        Example:
                            'you are so beautiful.'
                or
                {field1: field_value1, field2: field_value2}
                    field1 (str): field name, defaults to 'first_sequence'
                    field_value1 (str): a sentence
                        Example:
                            'you are so handsome.'
                    field2 (str): field name, defaults to 'second_sequence'
                    field_value2 (str): a sentence
                        Example:
                            'you are so beautiful.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        if isinstance(data, str):
            new_data = {self.first_sequence: data}
        elif isinstance(data, tuple):
            sentence1, sentence2 = data
            new_data = {
                self.first_sequence: sentence1,
                self.second_sequence: sentence2
            }
        else:
            new_data = data

        # preprocess the data for the model input
        rst = {
            'id': [],
            'input_ids': [],
            'attention_mask': [],
            'token_type_ids': []
        }

        max_seq_length = self.sequence_length

        text_a = new_data[self.first_sequence]
        text_b = new_data.get(self.second_sequence, None)
        feature = self.tokenizer(
            text_a,
            text_b,
            padding='max_length',
            truncation=True,
            max_length=max_seq_length)

        rst['id'].append(new_data.get('id', str(uuid.uuid4())))
        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
        rst['token_type_ids'].append(feature['token_type_ids'])

        return rst
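
# Illustrative usage sketch (not part of the module). The directory below is a
# placeholder; all three input forms yield the same output layout:
#
#     preprocessor = SequenceClassificationPreprocessor('/path/to/model')
#     preprocessor('you are so handsome.')                              # str
#     preprocessor(('you are so handsome.', 'you are so beautiful.'))   # tuple
#     preprocessor({'first_sequence': 'you are so handsome.'})          # dict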


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.palm_text_gen_tokenizer)
class TextGenerationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, tokenizer, *args, **kwargs):
        """Preprocess the data using the vocab.txt file under the `model_dir` path.

        Args:
            model_dir (str): path to the model directory
            tokenizer: the tokenizer instance used to encode the input text
        """
        super().__init__(*args, **kwargs)

        self.model_dir: str = model_dir
        self.first_sequence: str = kwargs.pop('first_sequence',
                                              'first_sequence')
        self.second_sequence: str = kwargs.pop('second_sequence',
                                               'second_sequence')
        self.sequence_length: int = kwargs.pop('sequence_length', 128)
        self.tokenizer = tokenizer

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        import torch

        new_data = {self.first_sequence: data}

        # preprocess the data for the model input
        rst = {'input_ids': [], 'attention_mask': []}

        max_seq_length = self.sequence_length

        text_a = new_data.get(self.first_sequence, None)
        text_b = new_data.get(self.second_sequence, None)
        feature = self.tokenizer(
            text_a,
            text_b,
            padding='max_length',
            truncation=True,
            max_length=max_seq_length)

        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
        # rst['token_type_ids'].append(feature['token_type_ids'])
        return {k: torch.tensor(v) for k, v in rst.items()}
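
# Illustrative usage sketch (not part of the module). The tokenizer is supplied
# by the caller; the path below is a placeholder:
#
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained('/path/to/palm_model')
#     preprocessor = TextGenerationPreprocessor('/path/to/palm_model', tokenizer)
#     batch = preprocessor('you are so handsome.')
#     # batch['input_ids'] and batch['attention_mask'] are torch tensors of
#     # shape (1, sequence_length).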


@PREPROCESSORS.register_module(Fields.nlp)
class FillMaskPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data using the vocab.txt file under the `model_dir` path.

        Args:
            model_dir (str): path to the model directory
        """
        super().__init__(*args, **kwargs)
        from sofa.utils.backend import AutoTokenizer
        self.model_dir = model_dir
        self.first_sequence: str = kwargs.pop('first_sequence',
                                              'first_sequence')
        self.sequence_length = kwargs.pop('sequence_length', 128)

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir, use_fast=False)

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        import torch

        new_data = {self.first_sequence: data}

        # preprocess the data for the model input
        rst = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}

        max_seq_length = self.sequence_length

        text_a = new_data[self.first_sequence]
        feature = self.tokenizer(
            text_a,
            padding='max_length',
            truncation=True,
            max_length=max_seq_length,
            return_token_type_ids=True)

        rst['input_ids'].append(feature['input_ids'])
        rst['attention_mask'].append(feature['attention_mask'])
        rst['token_type_ids'].append(feature['token_type_ids'])

        return {k: torch.tensor(v) for k, v in rst.items()}
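
# Illustrative usage sketch (not part of the module). The directory is a
# placeholder, and the '[MASK]' token shown is an assumption that depends on
# the tokenizer loaded from that directory:
#
#     preprocessor = FillMaskPreprocessor('/path/to/fill_mask_model')
#     batch = preprocessor('you are so [MASK].')
#     # batch holds torch tensors 'input_ids', 'attention_mask' and
#     # 'token_type_ids', each of shape (1, sequence_length).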


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.zero_shot_cls_tokenizer)
class ZeroShotClassificationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data using the vocab.txt file under the `model_dir` path.

        Args:
            model_dir (str): path to the model directory
        """
        super().__init__(*args, **kwargs)

        from sofa import SbertTokenizer
        self.model_dir: str = model_dir
        self.sequence_length = kwargs.pop('sequence_length', 512)
        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, str)
    def __call__(self, data: str, hypothesis_template: str,
                 candidate_labels: list) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'
            hypothesis_template (str): a format string used to turn each
                candidate label into a hypothesis sentence
            candidate_labels (list): the candidate labels to classify against

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        pairs = [[data, hypothesis_template.format(label)]
                 for label in candidate_labels]

        features = self.tokenizer(
            pairs,
            padding=True,
            truncation=True,
            max_length=self.sequence_length,
            return_tensors='pt',
            truncation_strategy='only_first')
        return features
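
# Illustrative usage sketch (not part of the module). The directory, template
# and labels below are placeholders:
#
#     preprocessor = ZeroShotClassificationPreprocessor('/path/to/model')
#     features = preprocessor(
#         'you are so handsome.',
#         hypothesis_template='This example is {}.',
#         candidate_labels=['compliment', 'insult'])
#     # features is a batch encoding of 'pt' tensors with one row per label.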


@PREPROCESSORS.register_module(
    Fields.nlp, module_name=Preprocessors.token_cls_tokenizer)
class TokenClassifcationPreprocessor(Preprocessor):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Preprocess the data using the vocab.txt file under the `model_dir` path.

        Args:
            model_dir (str): path to the model directory
        """
        super().__init__(*args, **kwargs)

        from sofa import SbertTokenizer
        self.model_dir: str = model_dir
        self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir)

    @type_assert(object, str)
    def __call__(self, data: str) -> Dict[str, Any]:
        """Process the raw input data.

        Args:
            data (str): a sentence
                Example:
                    'you are so handsome.'

        Returns:
            Dict[str, Any]: the preprocessed data
        """
        # preprocess the data for the model input:
        # strip spaces and tokenize the text character by character
        text = data.replace(' ', '').strip()
        tokens = []
        for token in text:
            token = self.tokenizer.tokenize(token)
            tokens.extend(token)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = self.tokenizer.build_inputs_with_special_tokens(input_ids)
        attention_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)
        return {
            'text': text,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }
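
# Illustrative usage sketch (not part of the module). The directory below is a
# placeholder; because the preprocessor strips spaces and tokenizes character
# by character, it is geared towards Chinese-style input:
#
#     preprocessor = TokenClassifcationPreprocessor('/path/to/ws_model')
#     inputs = preprocessor('今天天气不错，适合出去游玩')
#     # inputs holds plain Python lists: 'text', 'input_ids',
#     # 'attention_mask' and 'token_type_ids'.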