From 414c0c1b3c5bb083524b321b83b5f2b1dc3ea443 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=80=9D=E5=AE=8F?= Date: Mon, 13 Jun 2022 16:56:30 +0800 Subject: [PATCH 1/4] init --- modelscope/models/__init__.py | 2 +- modelscope/models/nlp/__init__.py | 1 + modelscope/models/nlp/nli_model.py | 83 ++++++++++++++++++++++ modelscope/pipelines/nlp/__init__.py | 1 + modelscope/pipelines/nlp/nli_pipeline.py | 88 ++++++++++++++++++++++++ modelscope/preprocessors/__init__.py | 2 +- modelscope/preprocessors/nlp.py | 73 +++++++++++++++++++- modelscope/utils/constant.py | 1 + test.py | 12 ++++ 9 files changed, 260 insertions(+), 3 deletions(-) create mode 100644 modelscope/models/nlp/nli_model.py create mode 100644 modelscope/pipelines/nlp/nli_pipeline.py create mode 100644 test.py diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py index 170e525e..2d852970 100644 --- a/modelscope/models/__init__.py +++ b/modelscope/models/__init__.py @@ -2,4 +2,4 @@ from .base import Model from .builder import MODELS, build_model -from .nlp import BertForSequenceClassification +from .nlp import BertForSequenceClassification, SbertForNLI diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index b2a1d43b..114295fc 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -1,2 +1,3 @@ +from .nli_model import * # noqa F403 from .sequence_classification_model import * # noqa F403 from .text_generation_model import * # noqa F403 diff --git a/modelscope/models/nlp/nli_model.py b/modelscope/models/nlp/nli_model.py new file mode 100644 index 00000000..05166bd0 --- /dev/null +++ b/modelscope/models/nlp/nli_model.py @@ -0,0 +1,83 @@ +import os +from typing import Any, Dict + +import numpy as np +import torch +from sofa import SbertConfig, SbertModel +from sofa.models.sbert.modeling_sbert import SbertPreTrainedModel +from torch import nn +from transformers.activations import ACT2FN, get_activation +from transformers.models.bert.modeling_bert import SequenceClassifierOutput + +from modelscope.utils.constant import Tasks +from ..base import Model, Tensor +from ..builder import MODELS + +__all__ = ['SbertForNLI'] + + +class TextClassifier(SbertPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.encoder = SbertModel(config, add_pooling_layer=True) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids=None, token_type_ids=None): + outputs = self.encoder( + input_ids, + token_type_ids=token_type_ids, + return_dict=None, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + return logits + + +@MODELS.register_module( + Tasks.nli, module_name=r'nlp_structbert_nli_chinese-base') +class SbertForNLI(Model): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text generation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + model_cls (Optional[Any], optional): model loader, if None, use the + default loader to load model weights, by default None. + """ + super().__init__(model_dir, *args, **kwargs) + self.model_dir = model_dir + + self.model = TextClassifier.from_pretrained(model_dir, num_labels=3) + self.model.eval() + + def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: + """return the result by the model + + Args: + input (Dict[str, Any]): the preprocessed data + + Returns: + Dict[str, np.ndarray]: results + Example: + { + 'predictions': array([1]), # lable 0-negative 1-positive + 'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32), + 'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value + } + """ + input_ids = torch.tensor(input['input_ids'], dtype=torch.long) + token_type_ids = torch.tensor( + input['token_type_ids'], dtype=torch.long) + with torch.no_grad(): + logits = self.model(input_ids, token_type_ids) + probs = logits.softmax(-1).numpy() + pred = logits.argmax(-1).numpy() + logits = logits.numpy() + res = {'predictions': pred, 'probabilities': probs, 'logits': logits} + return res diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 3dbbc1bb..e9c5ab98 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -1,2 +1,3 @@ +from .nli_pipeline import * # noqa F403 from .sequence_classification_pipeline import * # noqa F403 from .text_generation_pipeline import * # noqa F403 diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py new file mode 100644 index 00000000..fe658c77 --- /dev/null +++ b/modelscope/pipelines/nlp/nli_pipeline.py @@ -0,0 +1,88 @@ +import os +import uuid +from typing import Any, Dict, Union + +import json +import numpy as np + +from modelscope.models.nlp import SbertForNLI +from modelscope.preprocessors import NLIPreprocessor +from modelscope.utils.constant import Tasks +from ...models import Model +from ..base import Input, Pipeline +from ..builder import PIPELINES + +__all__ = ['NLIPipeline'] + + +@PIPELINES.register_module( + Tasks.nli, module_name=r'nlp_structbert_nli_chinese-base') +class NLIPipeline(Pipeline): + + def __init__(self, + model: Union[SbertForNLI, str], + preprocessor: NLIPreprocessor = None, + **kwargs): + """use `model` and `preprocessor` to create a nlp text classification pipeline for prediction + + Args: + model (SbertForNLI): a model instance + preprocessor (NLIPreprocessor): a preprocessor instance + """ + assert isinstance(model, str) or isinstance(model, SbertForNLI), \ + 'model must be a single str or SbertForNLI' + sc_model = model if isinstance(model, + SbertForNLI) else SbertForNLI(model) + if preprocessor is None: + preprocessor = NLIPreprocessor( + sc_model.model_dir, + first_sequence='first_sequence', + second_sequence='second_sequence') + super().__init__(model=sc_model, preprocessor=preprocessor, **kwargs) + + self.label_path = os.path.join(sc_model.model_dir, + 'label_mapping.json') + with open(self.label_path) as f: + self.label_mapping = json.load(f) + self.label_id_to_name = { + idx: name + for name, idx in self.label_mapping.items() + } + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + + probs = inputs['probabilities'] + logits = inputs['logits'] + predictions = np.argsort(-probs, axis=-1) + preds = predictions[0] + b = 0 + new_result = list() + for pred in preds: + new_result.append({ + 'pred': self.label_id_to_name[pred], + 'prob': float(probs[b][pred]), + 'logit': float(logits[b][pred]) + }) + new_results = list() + new_results.append({ + 'id': + inputs['id'][b] if 'id' in inputs else str(uuid.uuid4()), + 'output': + new_result, + 'predictions': + new_result[0]['pred'], + 'probabilities': + ','.join([str(t) for t in inputs['probabilities'][b]]), + 'logits': + ','.join([str(t) for t in inputs['logits'][b]]) + }) + + return new_results[0] diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 518ea977..47a713ff 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -5,4 +5,4 @@ from .builder import PREPROCESSORS, build_preprocessor from .common import Compose from .image import LoadImage, load_image from .nlp import * # noqa F403 -from .nlp import TextGenerationPreprocessor +from .nlp import NLIPreprocessor, TextGenerationPreprocessor diff --git a/modelscope/preprocessors/nlp.py b/modelscope/preprocessors/nlp.py index 0de41bfc..37442dcb 100644 --- a/modelscope/preprocessors/nlp.py +++ b/modelscope/preprocessors/nlp.py @@ -10,7 +10,7 @@ from modelscope.utils.type_assert import type_assert from .base import Preprocessor from .builder import PREPROCESSORS -__all__ = ['Tokenize', 'SequenceClassificationPreprocessor'] +__all__ = ['Tokenize', 'SequenceClassificationPreprocessor', 'NLIPreprocessor'] @PREPROCESSORS.register_module(Fields.nlp) @@ -27,6 +27,77 @@ class Tokenize(Preprocessor): return data +@PREPROCESSORS.register_module( + Fields.nlp, module_name=r'nlp_structbert_nli_chinese-base') +class NLIPreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + """preprocess the data via the vocab.txt from the `model_dir` path + + Args: + model_dir (str): model path + """ + + super().__init__(*args, **kwargs) + + from sofa import SbertTokenizer + self.model_dir: str = model_dir + self.first_sequence: str = kwargs.pop('first_sequence', + 'first_sequence') + self.second_sequence = kwargs.pop('second_sequence', 'second_sequence') + self.sequence_length = kwargs.pop('sequence_length', 128) + + self.tokenizer = SbertTokenizer.from_pretrained(self.model_dir) + + @type_assert(object, tuple) + def __call__(self, data: tuple) -> Dict[str, Any]: + """process the raw input data + + Args: + data (tuple): [sentence1, sentence2] + sentence1 (str): a sentence + Example: + 'you are so handsome.' + sentence2 (str): a sentence + Example: + 'you are so beautiful.' + + Returns: + Dict[str, Any]: the preprocessed data + """ + sentence1, sentence2 = data + new_data = { + self.first_sequence: sentence1, + self.second_sequence: sentence2 + } + # preprocess the data for the model input + + rst = { + 'id': [], + 'input_ids': [], + 'attention_mask': [], + 'token_type_ids': [] + } + + max_seq_length = self.sequence_length + + text_a = new_data[self.first_sequence] + text_b = new_data[self.second_sequence] + feature = self.tokenizer( + text_a, + text_b, + padding=False, + truncation=True, + max_length=max_seq_length) + + rst['id'].append(new_data.get('id', str(uuid.uuid4()))) + rst['input_ids'].append(feature['input_ids']) + rst['attention_mask'].append(feature['attention_mask']) + rst['token_type_ids'].append(feature['token_type_ids']) + + return rst + + @PREPROCESSORS.register_module( Fields.nlp, module_name=r'bert-sentiment-analysis') class SequenceClassificationPreprocessor(Preprocessor): diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index c51e2445..a955574b 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -30,6 +30,7 @@ class Tasks(object): image_matting = 'image-matting' # nlp tasks + nli = 'nli' sentiment_analysis = 'sentiment-analysis' text_classification = 'text-classification' relation_extraction = 'relation-extraction' diff --git a/test.py b/test.py new file mode 100644 index 00000000..d0cd093b --- /dev/null +++ b/test.py @@ -0,0 +1,12 @@ +from modelscope.models import SbertForNLI +from modelscope.pipelines import pipeline +from modelscope.preprocessors import NLIPreprocessor + +model = SbertForNLI('../nlp_structbert_nli_chinese-base') +print(model) +tokenizer = NLIPreprocessor(model.model_dir) + +semantic_cls = pipeline('nli', model=model, preprocessor=tokenizer) +print(type(semantic_cls)) + +print(semantic_cls(input=('相反,这表明克林顿的敌人是疯子。', '四川商务职业学院商务管理在哪个校区?'))) From 62d98c02732dbdd99ea46e0aba6301a7f3be59a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=80=9D=E5=AE=8F?= Date: Mon, 13 Jun 2022 17:30:48 +0800 Subject: [PATCH 2/4] [to #42322933] init --- modelscope/pipelines/nlp/nli_pipeline.py | 4 ++-- test.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/modelscope/pipelines/nlp/nli_pipeline.py b/modelscope/pipelines/nlp/nli_pipeline.py index fe658c77..135f826a 100644 --- a/modelscope/pipelines/nlp/nli_pipeline.py +++ b/modelscope/pipelines/nlp/nli_pipeline.py @@ -31,8 +31,8 @@ class NLIPipeline(Pipeline): """ assert isinstance(model, str) or isinstance(model, SbertForNLI), \ 'model must be a single str or SbertForNLI' - sc_model = model if isinstance(model, - SbertForNLI) else SbertForNLI(model) + sc_model = model if isinstance( + model, SbertForNLI) else Model.from_pretrained(model) if preprocessor is None: preprocessor = NLIPreprocessor( sc_model.model_dir, diff --git a/test.py b/test.py index d0cd093b..b10a7d0b 100644 --- a/test.py +++ b/test.py @@ -9,4 +9,7 @@ tokenizer = NLIPreprocessor(model.model_dir) semantic_cls = pipeline('nli', model=model, preprocessor=tokenizer) print(type(semantic_cls)) -print(semantic_cls(input=('相反,这表明克林顿的敌人是疯子。', '四川商务职业学院商务管理在哪个校区?'))) +print( + semantic_cls( + input=('我想还有一件事也伤害到了老师的招聘,那就是他们在课堂上失去了很多的权威', + '教师在课堂上失去权威,导致想要进入这一职业的人减少了。'))) From 314082810e4e00512faf2ccd827b8a16125b5ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=80=9D=E5=AE=8F?= Date: Mon, 13 Jun 2022 20:24:31 +0800 Subject: [PATCH 3/4] [to #42322933] init --- modelscope/pipelines/builder.py | 1 + tests/pipelines/test_nli.py | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 tests/pipelines/test_nli.py diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 6495a5db..3c97c2be 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -18,6 +18,7 @@ PIPELINES = Registry('pipelines') DEFAULT_MODEL_FOR_PIPELINE = { # TaskName: (pipeline_module_name, model_repo) Tasks.image_matting: ('image-matting', 'damo/image-matting-person'), + Tasks.nli: ('nli', 'damo/nlp_structbert_nli_chinese-base'), Tasks.text_classification: ('bert-sentiment-analysis', 'damo/bert-base-sst2'), Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'), diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py new file mode 100644 index 00000000..9167b897 --- /dev/null +++ b/tests/pipelines/test_nli.py @@ -0,0 +1,48 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from maas_hub.snapshot_download import snapshot_download + +from modelscope.models import Model +from modelscope.models.nlp import SbertForNLI +from modelscope.pipelines import NLIPipeline, pipeline +from modelscope.preprocessors import NLIPreprocessor +from modelscope.utils.constant import Tasks + + +class NLITest(unittest.TestCase): + model_id = 'damo/nlp_structbert_nli_chinese-base' + sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' + sentence2 = '四川商务职业学院商务管理在哪个校区?' + + def test_run_from_local(self): + cache_path = snapshot_download(self.model_id) + tokenizer = NLIPreprocessor(cache_path) + model = SbertForNLI(cache_path, tokenizer=tokenizer) + pipeline1 = NLIPipeline(model, preprocessor=tokenizer) + pipeline2 = pipeline(Tasks.nli, model=model, preprocessor=tokenizer) + print(f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' + f'pipeline1:{pipeline1(input=(self.sentence1, self.sentence2))}') + print() + print( + f'sentence1: {self.sentence1}\nsentence2: {self.sentence2}\n' + f'pipeline1: {pipeline2(input=(self.sentence1, self.sentence2))}') + + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id) + tokenizer = NLIPreprocessor(model.model_dir) + pipeline_ins = pipeline( + task=Tasks.nli, model=model, preprocessor=tokenizer) + print(pipeline_ins(input=(self.sentence1, self.sentence2))) + + def test_run_with_model_name(self): + pipeline_ins = pipeline(task=Tasks.nli, model=self.model_id) + print(pipeline_ins(input=(self.sentence1, self.sentence2))) + + def test_run_with_default_model(self): + pipeline_ins = pipeline(task=Tasks.nli) + print(pipeline_ins(input=(self.sentence1, self.sentence2))) + + +if __name__ == '__main__': + unittest.main() From 753b98f5267841795d0cad3043d468623cd3c4c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=BA=E4=B8=9E?= Date: Tue, 14 Jun 2022 09:27:38 +0800 Subject: [PATCH 4/4] update pipeline registry info --- modelscope/models/nlp/nli_model.py | 5 +++-- modelscope/pipelines/builder.py | 3 ++- tests/pipelines/test_nli.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/modelscope/models/nlp/nli_model.py b/modelscope/models/nlp/nli_model.py index 05166bd0..91972a62 100644 --- a/modelscope/models/nlp/nli_model.py +++ b/modelscope/models/nlp/nli_model.py @@ -16,7 +16,7 @@ from ..builder import MODELS __all__ = ['SbertForNLI'] -class TextClassifier(SbertPreTrainedModel): +class SbertTextClassifier(SbertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -53,7 +53,8 @@ class SbertForNLI(Model): super().__init__(model_dir, *args, **kwargs) self.model_dir = model_dir - self.model = TextClassifier.from_pretrained(model_dir, num_labels=3) + self.model = SbertTextClassifier.from_pretrained( + model_dir, num_labels=3) self.model.eval() def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]: diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 3c97c2be..8afbd041 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -18,7 +18,8 @@ PIPELINES = Registry('pipelines') DEFAULT_MODEL_FOR_PIPELINE = { # TaskName: (pipeline_module_name, model_repo) Tasks.image_matting: ('image-matting', 'damo/image-matting-person'), - Tasks.nli: ('nli', 'damo/nlp_structbert_nli_chinese-base'), + Tasks.nli: ('nlp_structbert_nli_chinese-base', + 'damo/nlp_structbert_nli_chinese-base'), Tasks.text_classification: ('bert-sentiment-analysis', 'damo/bert-base-sst2'), Tasks.text_generation: ('palm', 'damo/nlp_palm_text-generation_chinese'), diff --git a/tests/pipelines/test_nli.py b/tests/pipelines/test_nli.py index 9167b897..ad94697a 100644 --- a/tests/pipelines/test_nli.py +++ b/tests/pipelines/test_nli.py @@ -15,6 +15,7 @@ class NLITest(unittest.TestCase): sentence1 = '四川商务职业学院和四川财经职业学院哪个好?' sentence2 = '四川商务职业学院商务管理在哪个校区?' + @unittest.skip('skip temporarily to save test time') def test_run_from_local(self): cache_path = snapshot_download(self.model_id) tokenizer = NLIPreprocessor(cache_path)