tests/pipelines/test_named_entity_recognition.py

# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
                                   TransformerCRFForNamedEntityRecognition)
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
from modelscope.preprocessors import \
    TokenClassificationTransformersPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.demo_utils import DemoCompatibilityCheck
from modelscope.utils.test_utils import test_level


class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
    # One sample sentence per language code; all_modelcards_info entries pick
    # their input sentence from here through their 'language' value.
    language_examples = {
        'zh': '新华社北京二月十一日电（记者唐虹）',
        'en': 'Italy recalled Marcello Cuttitta',
        'ru': 'важным традиционным промыслом является производство пальмового масла .',
        'fr': 'fer à souder électronique',
        'es': ('el primer avistamiento por europeos de esta zona fue en 1606 , '
               'en la expedición española mandada por luis váez de torres .'),
        'nl': 'in het vorige seizoen promoveerden sc cambuur , dat kampioen werd en go ahead eagles via de play offs .',
        'tr': ('köyün pırasa kavurması ve içi yağlama ve akıtma adındaki hamur işleri meşhurdur . ; çörek ekmeği ; '
               'diye adlandırdıkları mayasız ekmeği unutmamaklazım .'),
        'ko': '국립진주박물관은 1984년 11월 2일 개관하였으며 한국 전통목조탑을 석조 건물로 형상화한 것으로 건축가 김수근 선생의 대표적 작품이다 .',
        'fa': 'ﺞﻤﻋیﺕ ﺍیﻥ ﺎﺴﺗﺎﻧ ۳۰ ﻩﺯﺍﺭ ﻦﻓﺭ ﺎﺴﺗ ﻭ ﻢﻧﺎﺒﻋ ﻢﻬﻣی ﺍﺯ ﺲﻧگ ﺂﻬﻧ ﺩﺍﺭﺩ .',
        'de': 'die szene beinhaltete lenny baker und christopher walken .',
        'hi': '१४९२ में एक चार्टर के आधार पर, उसके पिता ने उसे वाडोविस के उत्तराधिकारी के रूप में छोड़ दिया।',
        'bn': 'যদিও গির্জার সবসময় রাজকীয় পিউ থাকত, তবে গির্জায় রাজকীয়ভাবে এটিই ছিল প্রথম দেখা।',
        # The multilingual entry reuses the Chinese sample sentence.
        'multi': '新华社北京二月十一日电（记者唐虹）',
    }

    # Model cards exercised by test_run_with_all_modelcards. Each entry is a
    # dict with the hub 'model_id' and the 'language' key that selects the
    # sample sentence from language_examples.
    all_modelcards_info = [
        {
            'model_id': model_id,
            'language': language
        } for model_id, language in (
            ('damo/nlp_raner_named-entity-recognition_chinese-base-news', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-social_media', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-generic', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-resume', 'zh'),
            ('damo/nlp_lstm_named-entity-recognition_chinese-news', 'zh'),
            ('damo/nlp_lstm_named-entity-recognition_chinese-social_media', 'zh'),
            ('damo/nlp_lstm_named-entity-recognition_chinese-generic', 'zh'),
            ('damo/nlp_lstm_named-entity-recognition_chinese-resume', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-book', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-finance', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-game', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-bank', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-literature', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-cmeee', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_english-large-news', 'en'),
            ('damo/nlp_raner_named-entity-recognition_english-large-social_media', 'en'),
            ('damo/nlp_raner_named-entity-recognition_english-large-literature', 'en'),
            ('damo/nlp_raner_named-entity-recognition_english-large-politics', 'en'),
            ('damo/nlp_raner_named-entity-recognition_english-large-music', 'en'),
            ('damo/nlp_raner_named-entity-recognition_english-large-science', 'en'),
            ('damo/nlp_raner_named-entity-recognition_english-large-ai', 'en'),
            ('damo/nlp_raner_named-entity-recognition_english-large-wiki', 'en'),
            ('damo/nlp_raner_named-entity-recognition_chinese-large-generic', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_english-large-generic', 'en'),
            ('damo/nlp_raner_named-entity-recognition_multilingual-large-generic', 'multi'),
            ('damo/nlp_raner_named-entity-recognition_russian-large-generic', 'ru'),
            ('damo/nlp_raner_named-entity-recognition_spanish-large-generic', 'es'),
            ('damo/nlp_raner_named-entity-recognition_dutch-large-generic', 'nl'),
            ('damo/nlp_raner_named-entity-recognition_turkish-large-generic', 'tr'),
            ('damo/nlp_raner_named-entity-recognition_korean-large-generic', 'ko'),
            ('damo/nlp_raner_named-entity-recognition_farsi-large-generic', 'fa'),
            ('damo/nlp_raner_named-entity-recognition_german-large-generic', 'de'),
            ('damo/nlp_raner_named-entity-recognition_hindi-large-generic', 'hi'),
            ('damo/nlp_raner_named-entity-recognition_bangla-large-generic', 'bn'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-ecom', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_chinese-base-ecom-50cls', 'zh'),
            ('damo/nlp_raner_named-entity-recognition_english-large-ecom', 'en'),
            ('damo/nlp_raner_named-entity-recognition_russian-large-ecom', 'ru'),
            ('damo/nlp_raner_named-entity-recognition_french-large-ecom', 'fr'),
            ('damo/nlp_raner_named-entity-recognition_spanish-large-ecom', 'es'),
            ('damo/nlp_structbert_keyphrase-extraction_base-icassp2023-mug-track4-baseline', 'zh'),
            ('damo/nlp_raner_chunking_english-large', 'en'),
        )
    ]

    def setUp(self) -> None:
        # NOTE(review): self.task and self.model_id are not read by any test
        # in this class; presumably DemoCompatibilityCheck consumes them —
        # confirm before removing.
        self.task = Tasks.named_entity_recognition

        # Hub model ids. model_id and tcrf_model_id hold the same id.
        self.model_id = self.tcrf_model_id = (
            'damo/nlp_raner_named-entity-recognition_chinese-base-news')
        self.lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
        self.lstm_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-generic'
        self.chinese_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-large-generic'
        self.english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom'
        self.addr_model_id = 'damo/nlp_structbert_address-parsing_chinese_base'

        # Sentence fixtures.
        self.sentence = '这与温岭市新河镇的一个神秘的传说有关。[SEP]地名'
        self.sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。'
        self.sentence_en = 'pizza shovel'
        self.ecom = '欧美单 秋季女装时尚百搭休闲修身 亚麻混纺短款 外套西装'

        # Address fixtures for the address-parsing model.
        self.addr = '浙江省杭州市余杭区文一西路969号亲橙里'
        self.addr1 = '浙江省西湖区灵隐隧道'
        self.addr2 = '内蒙古自治区巴彦淖尔市'

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_tcrf_by_direct_model_download(self):
        # Download the snapshot, then build the preprocessor and the
        # transformer-CRF model directly from the downloaded directory.
        model_dir = snapshot_download(self.tcrf_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(model_dir)
        ner_model = TransformerCRFForNamedEntityRecognition(
            model_dir, tokenizer=preprocessor)

        # Build the pipeline twice: once through the pipeline class and once
        # through the pipeline() factory, both around the same model object.
        direct_pipeline = NamedEntityRecognitionPipeline(
            ner_model, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)

        direct_output = direct_pipeline(input=self.sentence)
        print(f'sentence: {self.sentence}\n' f'pipeline1:{direct_output}')
        print()
        factory_output = factory_pipeline(input=self.sentence)
        print(f'pipeline2: {factory_output}')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lcrf_by_direct_model_download(self):
        # Download the snapshot, then build the preprocessor and the LSTM-CRF
        # model directly from the downloaded directory.
        model_dir = snapshot_download(self.lcrf_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(model_dir)
        ner_model = LSTMCRFForNamedEntityRecognition(
            model_dir, tokenizer=preprocessor)

        # Build the pipeline twice: once through the pipeline class and once
        # through the pipeline() factory, both around the same model object.
        direct_pipeline = NamedEntityRecognitionPipeline(
            ner_model, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)

        direct_output = direct_pipeline(input=self.sentence)
        print(f'sentence: {self.sentence}\n' f'pipeline1:{direct_output}')
        print()
        factory_output = factory_pipeline(input=self.sentence)
        print(f'pipeline2: {factory_output}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_tcrf_with_model_from_modelhub(self):
        # Load the model object by id, build the preprocessor from the
        # model's own directory, and pass both to the pipeline factory.
        ner_model = Model.from_pretrained(self.tcrf_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(
            ner_model.model_dir)
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)
        result = ner(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_addrst_with_model_from_modelhub(self):
        # Load the address-parsing model object, build the preprocessor from
        # the model's own directory, and run the pipeline on one address.
        # Use the id configured in setUp rather than a second hard-coded copy
        # of the same string, so the address tests cannot drift apart.
        model = Model.from_pretrained(self.addr_model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(
            model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=model,
            preprocessor=tokenizer)
        print(pipeline_ins(input=self.addr))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_addrst_with_model_name(self):
        # Build the pipeline from the address-parsing model id alone and run
        # it on a single address string.
        address_parser = pipeline(
            model=self.addr_model_id, task=Tasks.named_entity_recognition)
        parsed = address_parser(input=self.addr)
        print(parsed)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_addrst_with_model_name_batch(self):
        # Run three addresses through the pipeline with batch_size=2.
        address_parser = pipeline(
            model=self.addr_model_id, task=Tasks.named_entity_recognition)
        addresses = [self.addr, self.addr1, self.addr2]
        parsed = address_parser(input=addresses, batch_size=2)
        print(parsed)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_addrst_with_model_name_batch_iter(self):
        # List input with no batch_size; the pipeline is built with
        # padding=False.
        # NOTE(review): presumably this exercises the item-by-item path for
        # list inputs — confirm against the pipeline implementation.
        address_parser = pipeline(
            model=self.addr_model_id,
            task=Tasks.named_entity_recognition,
            padding=False)
        addresses = [self.addr, self.addr1, self.addr2]
        print(address_parser(input=addresses))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lcrf_with_model_from_modelhub(self):
        # Load the LSTM-CRF model object by id, build the preprocessor from
        # the model's own directory, and pass both to the pipeline factory.
        ner_model = Model.from_pretrained(self.lcrf_model_id)
        preprocessor = TokenClassificationTransformersPreprocessor(
            ner_model.model_dir)
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)
        result = ner(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_tcrf_with_model_name(self):
        # Let the pipeline factory resolve everything from the model id.
        ner = pipeline(
            model=self.tcrf_model_id, task=Tasks.named_entity_recognition)
        result = ner(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lcrf_with_model_name(self):
        # Let the pipeline factory resolve everything from the LSTM-CRF
        # model id.
        ner = pipeline(
            model=self.lcrf_model_id, task=Tasks.named_entity_recognition)
        result = ner(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lcrf_with_chinese_model_name(self):
        # NOTE(review): the test name says "lcrf", but chinese_model_id is set
        # in setUp to a 'raner' model id rather than an 'lstm' one — confirm
        # the name is intentional.
        ner = pipeline(
            model=self.chinese_model_id, task=Tasks.named_entity_recognition)
        result = ner(input=self.sentence_zh)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lcrf_with_chinese_model_name_batch_iter(self):
        # List input with no batch_size; the pipeline is built with
        # padding=False.
        ner = pipeline(
            model=self.chinese_model_id,
            task=Tasks.named_entity_recognition,
            padding=False)
        # The full sentence plus a prefix slice and a suffix slice of it, so
        # the three inputs differ in length.
        full_text = self.sentence_zh
        samples = [full_text, full_text[:20], full_text[10:]]
        print(ner(input=samples))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lcrf_with_chinese_model_name_batch(self):
        # Three inputs of different lengths, run with batch_size=2.
        ner = pipeline(
            model=self.chinese_model_id, task=Tasks.named_entity_recognition)
        full_text = self.sentence_zh
        samples = [full_text, full_text[:20], full_text[10:]]
        result = ner(input=samples, batch_size=2)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstm_with_chinese_model_name(self):
        # Build the pipeline from the LSTM model id alone and run it on the
        # space-separated Chinese sentence.
        ner = pipeline(
            model=self.lstm_model_id, task=Tasks.named_entity_recognition)
        result = ner(input=self.sentence_zh)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstm_with_chinese_model_name_batch_iter(self):
        # List input with no batch_size; the pipeline is built with
        # padding=False.
        ner = pipeline(
            model=self.lstm_model_id,
            task=Tasks.named_entity_recognition,
            padding=False)
        # The full sentence plus a prefix slice and a suffix slice of it, so
        # the three inputs differ in length.
        full_text = self.sentence_zh
        samples = [full_text, full_text[:20], full_text[10:]]
        print(ner(input=samples))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstm_with_chinese_model_name_batch(self):
        # Three inputs of different lengths, run with batch_size=2.
        ner = pipeline(
            model=self.lstm_model_id, task=Tasks.named_entity_recognition)
        full_text = self.sentence_zh
        samples = [full_text, full_text[:20], full_text[10:]]
        result = ner(input=samples, batch_size=2)
        print(result)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_english_with_model_name(self):
        # Build the pipeline from the English model id alone and run it on
        # the English sample.
        ner = pipeline(
            model=self.english_model_id, task=Tasks.named_entity_recognition)
        result = ner(input=self.sentence_en)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_english_with_model_name_batch(self):
        # Three inputs run through the English model with batch_size=2.
        # NOTE(review): all three inputs are the Chinese fixtures from setUp
        # (ecom, sentence_zh, sentence), not sentence_en — confirm this is
        # intentional for an English model id.
        ner = pipeline(
            model=self.english_model_id, task=Tasks.named_entity_recognition)
        samples = [self.ecom, self.sentence_zh, self.sentence]
        result = ner(input=samples, batch_size=2)
        print(result)

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_english_with_model_name_batch_iter(self):
        # List input with no batch_size; the pipeline is built with
        # padding=False.
        # NOTE(review): all three inputs are the Chinese fixtures from setUp
        # (ecom, sentence_zh, sentence), not sentence_en — confirm this is
        # intentional for an English model id.
        ner = pipeline(
            model=self.english_model_id,
            task=Tasks.named_entity_recognition,
            padding=False)
        samples = [self.ecom, self.sentence_zh, self.sentence]
        print(ner(input=samples))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_with_default_model(self):
        # No model argument: the factory is given only the task.
        default_ner = pipeline(task=Tasks.named_entity_recognition)
        result = default_ner(input=self.sentence)
        print(result)

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_with_all_modelcards(self):
        # Run every entry of all_modelcards_info on the sample sentence that
        # language_examples holds for the entry's language.
        for item in self.all_modelcards_info:
            model_id = item['model_id']
            # Keep all per-model work, including the sample lookup, inside
            # the subTest so that an error for one model card is reported
            # against its model_id and the remaining cards still run.
            with self.subTest(model_id=model_id):
                sentence = self.language_examples[item['language']]
                pipeline_ins = pipeline(Tasks.named_entity_recognition,
                                        model_id)
                print(pipeline_ins(input=sentence))

    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        # Always skipped by the decorator above; when enabled it delegates to
        # the compatibility_check method, which is not defined in this class
        # (presumably inherited from DemoCompatibilityCheck).
        return self.compatibility_check()


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								# Copyright (c) Alibaba, Inc. and its affiliates.
 								import unittest
 								from modelscope.hub.snapshot_download import snapshot_download
 								from modelscope.models import Model
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
 								                                   TransformerCRFForNamedEntityRecognition)
-												[to #43112771] requirements check and lazy import support 

											
										
										
											2022-07-27 17:29:16 +08:00
+								from modelscope.pipelines import pipeline
 								from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes it easier for users to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								from modelscope.preprocessors import \
 								    TokenClassificationTransformersPreprocessor
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								from modelscope.utils.constant import Tasks
-												[to #44657982] add unittest for demo and demotest utils 

unittest for demo service
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006180

											
										
										
											2022-09-08 14:08:51 +08:00
+								from modelscope.utils.demo_utils import DemoCompatibilityCheck
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								from modelscope.utils.test_utils import test_level
-												[to #44657982] add unittest for demo and demotest utils 

unittest for demo service
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006180

											
										
										
											2022-09-08 14:08:51 +08:00
+								class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
-												[to #42322933] add UT for NER&EL models 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10897188

   
											
										
										
											2022-12-04 15:53:32 +08:00
+								    language_examples = {
 								        'zh':
 								        '新华社北京二月十一日电（记者唐虹）',
 								        'en':
 								        'Italy recalled Marcello Cuttitta',
 								        'ru':
 								        'важным традиционным промыслом является производство пальмового масла .',
 								        'fr':
 								        'fer à souder électronique',
 								        'es':
 								        'el primer avistamiento por europeos de esta zona fue en 1606 , '
 								        'en la expedición española mandada por luis váez de torres .',
 								        'nl':
 								        'in het vorige seizoen promoveerden sc cambuur , dat kampioen werd en go ahead eagles via de play offs .',
 								        'tr':
 								        'köyün pırasa kavurması ve içi yağlama ve akıtma adındaki hamur işleri meşhurdur . ; çörek ekmeği ; '
 								        'diye adlandırdıkları mayasız ekmeği unutmamaklazım .',
 								        'ko':
 								        '국립진주박물관은 1984년 11월 2일 개관하였으며 한국 전통목조탑을 석조 건물로 형상화한 것으로 건축가 김수근 선생의 대표적 작품이다 .',
 								        'fa':
 								        'ﺞﻤﻋیﺕ ﺍیﻥ ﺎﺴﺗﺎﻧ ۳۰ ﻩﺯﺍﺭ ﻦﻓﺭ ﺎﺴﺗ ﻭ ﻢﻧﺎﺒﻋ ﻢﻬﻣی ﺍﺯ ﺲﻧگ ﺂﻬﻧ ﺩﺍﺭﺩ .',
 								        'de':
 								        'die szene beinhaltete lenny baker und christopher walken .',
 								        'hi':
 								        '१४९२ में एक चार्टर के आधार पर, उसके पिता ने उसे वाडोविस के उत्तराधिकारी के रूप में छोड़ दिया।',
 								        'bn':
 								        'যদিও গির্জার সবসময় রাজকীয় পিউ থাকত, তবে গির্জায় রাজকীয়ভাবে এটিই ছিল প্রথম দেখা।',
 								        'multi':
 								        '新华社北京二月十一日电（记者唐虹）',
 								    }
 								    all_modelcards_info = [
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-news',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-social_media',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-generic',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-resume',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id': 'damo/nlp_lstm_named-entity-recognition_chinese-news',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_lstm_named-entity-recognition_chinese-social_media',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_lstm_named-entity-recognition_chinese-generic',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_lstm_named-entity-recognition_chinese-resume',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-book',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-finance',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-game',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-bank',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-literature',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-cmeee',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-news',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-social_media',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-literature',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-politics',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-music',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-science',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-ai',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-wiki',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-large-generic',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-generic',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_multilingual-large-generic',
 								            'language': 'multi'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_russian-large-generic',
 								            'language': 'ru'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_spanish-large-generic',
 								            'language': 'es'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_dutch-large-generic',
 								            'language': 'nl'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_turkish-large-generic',
 								            'language': 'tr'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_korean-large-generic',
 								            'language': 'ko'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_farsi-large-generic',
 								            'language': 'fa'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_german-large-generic',
 								            'language': 'de'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_hindi-large-generic',
 								            'language': 'hi'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_bangla-large-generic',
 								            'language': 'bn'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-ecom',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_chinese-base-ecom-50cls',
 								            'language': 'zh'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_english-large-ecom',
 								            'language': 'en'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_russian-large-ecom',
 								            'language': 'ru'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_french-large-ecom',
 								            'language': 'fr'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_raner_named-entity-recognition_spanish-large-ecom',
 								            'language': 'es'
 								        },
 								        {
 								            'model_id':
 								            'damo/nlp_structbert_keyphrase-extraction_base-icassp2023-mug-track4-baseline',
 								            'language': 'zh'
 								        },
-												[to #42322933] add UT for chunking model 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11289061


											
										
										
											2023-01-04 04:41:00 +08:00
+								        {
 								            'model_id': 'damo/nlp_raner_chunking_english-large',
 								            'language': 'en'
 								        },
-												[to #42322933] add UT for NER&EL models 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10897188

   
											
										
										
											2022-12-04 15:53:32 +08:00
+								    ]
-												[to #44657982] add unittest for demo and demotest utils 

unittest for demo service
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006180

											
										
										
											2022-09-08 14:08:51 +08:00
 								    def setUp(self) -> None:
 								        """Set the task, model ids and sample inputs used by the tests."""
 								        self.task = Tasks.named_entity_recognition
 								        self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
-												[to #42322933] add UT for NER&EL models 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10897188

   
											
										
										
											2022-12-04 15:53:32 +08:00
+								        self.english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom'
 								        self.chinese_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-large-generic'
 								        self.tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
 								        self.lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
 								        self.addr_model_id = 'damo/nlp_structbert_address-parsing_chinese_base'
 								        self.lstm_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-generic'
-												support prompt ner 

修改preprocessor增加对prompt模型的支持。
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10972542

   
											
										
										
											2022-12-06 10:39:37 +08:00
+								        self.sentence = '这与温岭市新河镇的一个神秘的传说有关。[SEP]地名'
-												[to #42322933] add UT for NER&EL models 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10897188

   
											
										
										
											2022-12-04 15:53:32 +08:00
+								        self.sentence_en = 'pizza shovel'
 								        self.sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。'
 								        self.addr = '浙江省杭州市余杭区文一西路969号亲橙里'
 								        self.addr1 = '浙江省西湖区灵隐隧道'
 								        self.addr2 = '内蒙古自治区巴彦淖尔市'
 								        self.ecom = '欧美单 秋季女装时尚百搭休闲修身 亚麻混纺短款 外套西装'
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
 								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								    def test_run_tcrf_by_direct_model_download(self):
 								        cache_path = snapshot_download(self.tcrf_model_id)
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes user easier to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								        tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								        model = TransformerCRFForNamedEntityRecognition(
 								            cache_path, tokenizer=tokenizer)
 								        # Build the pipeline twice from the same model/preprocessor pair:
 								        # once by instantiating the class, once via the pipeline() helper.
 								        pipeline1 = NamedEntityRecognitionPipeline(
 								            model, preprocessor=tokenizer)
 								        pipeline2 = pipeline(
 								            Tasks.named_entity_recognition,
 								            model=model,
 								            preprocessor=tokenizer)
 								        # Outputs are only printed for inspection; nothing is asserted.
 								        print(f'sentence: {self.sentence}\n'
 								              f'pipeline1:{pipeline1(input=self.sentence)}')
 								        print()
 								        print(f'pipeline2: {pipeline2(input=self.sentence)}')
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
 								    def test_run_lcrf_by_direct_model_download(self):
 								        """Run the LSTM-CRF NER model built from a downloaded model path.

 								        The path returned by ``snapshot_download`` is passed to both the
 								        preprocessor and the model; that pair is then run through a
 								        directly constructed ``NamedEntityRecognitionPipeline`` and
 								        through ``pipeline(...)``. Results are printed, not asserted.
 								        """
 								        cache_path = snapshot_download(self.lcrf_model_id)
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes user easier to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								        tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								        model = LSTMCRFForNamedEntityRecognition(
 								            cache_path, tokenizer=tokenizer)
 								        pipeline1 = NamedEntityRecognitionPipeline(
 								            model, preprocessor=tokenizer)
 								        pipeline2 = pipeline(
 								            Tasks.named_entity_recognition,
 								            model=model,
 								            preprocessor=tokenizer)
 								        print(f'sentence: {self.sentence}\n'
 								              f'pipeline1:{pipeline1(input=self.sentence)}')
 								        print()
 								        print(f'pipeline2: {pipeline2(input=self.sentence)}')
-												[to #42322933]clean up test level 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9662182

    * clean up test level

											
										
										
											2022-08-06 12:22:17 +08:00
+								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								    def test_run_tcrf_with_model_from_modelhub(self):
 								        model = Model.from_pretrained(self.tcrf_model_id)
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes user easier to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								        tokenizer = TokenClassificationTransformersPreprocessor(
 								            model.model_dir)
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition,
 								            model=model,
 								            preprocessor=tokenizer)
 								        print(pipeline_ins(input=self.sentence))
 								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-												[to #42322933] add nlp/addr/structure and update token classification related method 
      
											
										
										
											2022-11-21 16:14:53 +08:00
+								    def test_run_addrst_with_model_from_modelhub(self):
 								        model = Model.from_pretrained(
 								            'damo/nlp_structbert_address-parsing_chinese_base')
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes it easier for users to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								        tokenizer = TokenClassificationTransformersPreprocessor(
 								            model.model_dir)
-												[to #42322933] add nlp/addr/structure and update token classification related method 
      
											
										
										
											2022-11-21 16:14:53 +08:00
+								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition,
 								            model=model,
 								            preprocessor=tokenizer)
 								        print(pipeline_ins(input=self.addr))
 								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_addrst_with_model_name(self):
 								        # Build the named-entity-recognition pipeline from a model id only
 								        # (self.addr_model_id, defined outside this view), so no model or
 								        # preprocessor object is passed in explicitly.
 								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.addr_model_id)
 								        # Run on the single input self.addr and print the result; the test
 								        # makes no assertion on the output.
 								        print(pipeline_ins(input=self.addr))
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes it easier for users to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_addrst_with_model_name_batch(self):
 								        # Build the named-entity-recognition pipeline from a model id only
 								        # (self.addr_model_id, defined outside this view).
 								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.addr_model_id)
 								        # Pass a list of three inputs together with batch_size=2 and print
 								        # the result; the test makes no assertion on the output.
 								        print(
 								            pipeline_ins(
 								                input=[self.addr, self.addr1, self.addr2], batch_size=2))
 								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_addrst_with_model_name_batch_iter(self):
 								        # Build the pipeline from a model id, forwarding padding=False as an
 								        # extra keyword argument to pipeline().
 								        # NOTE(review): presumably padding=False reaches the preprocessor
 								        # and disables padding -- confirm against the pipeline builder.
 								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition,
 								            model=self.addr_model_id,
 								            padding=False)
 								        # Pass a list of three inputs without batch_size and print the
 								        # result; the test makes no assertion on the output.
 								        print(pipeline_ins(input=[self.addr, self.addr1, self.addr2]))
-												[to #42322933] add nlp/addr/structure and update token classification related method 
      
											
										
										
											2022-11-21 16:14:53 +08:00
+								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								    def test_run_lcrf_with_model_from_modelhub(self):
 								        model = Model.from_pretrained(self.lcrf_model_id)
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes it easier for users to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								        tokenizer = TokenClassificationTransformersPreprocessor(
 								            model.model_dir)
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition,
 								            model=model,
 								            preprocessor=tokenizer)
 								        print(pipeline_ins(input=self.sentence))
-												[to #42322933] update ner default model & fix tokenizer bug 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9627744

 
											
										
										
											2022-08-04 16:11:22 +08:00
+								    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								    def test_run_tcrf_with_model_name(self):
 								        # Build the named-entity-recognition pipeline from a model id only
 								        # (self.tcrf_model_id, defined outside this view).
 								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.tcrf_model_id)
 								        # Run on the single input self.sentence and print the result; the
 								        # test makes no assertion on the output.
 								        print(pipeline_ins(input=self.sentence))
 								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
 								    def test_run_lcrf_with_model_name(self):
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								        pipeline_ins = pipeline(
-												[to #42322933] add lstm-crf ner model code 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9901220

    * add lstm-crf ner model code


											
										
										
											2022-09-01 09:19:59 +08:00
+								            task=Tasks.named_entity_recognition, model=self.lcrf_model_id)
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								        print(pipeline_ins(input=self.sentence))
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes it easier for users to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-												[to #42322933]token preprocess bug fix 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10608664



											
										
										
											2022-11-08 15:42:08 +08:00
+								    def test_run_lcrf_with_chinese_model_name(self):
 								        """Run a named-entity-recognition pipeline on ``self.sentence_zh``.

 								        The pipeline is built from ``self.chinese_model_id``; its output is
 								        printed rather than asserted.
 								        """
 								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.chinese_model_id)
 								        print(pipeline_ins(input=self.sentence_zh))
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoint saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes user easier to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_lcrf_with_chinese_model_name_batch_iter(self):
 								        """Feed a list of three inputs to a pipeline built with ``padding=False``.

 								        The inputs are the full ``self.sentence_zh`` and two slices of it; the
 								        pipeline (model ``self.chinese_model_id``) output is printed, not
 								        asserted.
 								        """
 								        ner_pipeline = pipeline(
 								            task=Tasks.named_entity_recognition,
 								            model=self.chinese_model_id,
 								            padding=False)
 								        sentences = [
 								            self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:]
 								        ]
 								        predictions = ner_pipeline(input=sentences)
 								        print(predictions)
 								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_lcrf_with_chinese_model_name_batch(self):
 								        """Call the pipeline on three inputs with ``batch_size=2``.

 								        The inputs are the full ``self.sentence_zh`` and two slices of it; the
 								        pipeline (model ``self.chinese_model_id``) output is printed, not
 								        asserted.
 								        """
 								        ner_pipeline = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.chinese_model_id)
 								        sentences = [
 								            self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:]
 								        ]
 								        predictions = ner_pipeline(input=sentences, batch_size=2)
 								        print(predictions)
 								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_lstm_with_chinese_model_name(self):
 								        """Run the pipeline built from ``self.lstm_model_id`` on ``self.sentence_zh``.

 								        The result is printed, not asserted.
 								        """
 								        ner_pipeline = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.lstm_model_id)
 								        prediction = ner_pipeline(input=self.sentence_zh)
 								        print(prediction)
 								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_lstm_with_chinese_model_name_batch_iter(self):
 								        """Feed a list of three inputs to a pipeline built with ``padding=False``.

 								        The inputs are the full ``self.sentence_zh`` and two slices of it; the
 								        pipeline (model ``self.lstm_model_id``) output is printed, not
 								        asserted.
 								        """
 								        ner_pipeline = pipeline(
 								            task=Tasks.named_entity_recognition,
 								            model=self.lstm_model_id,
 								            padding=False)
 								        sentences = [
 								            self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:]
 								        ]
 								        predictions = ner_pipeline(input=sentences)
 								        print(predictions)
 								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_lstm_with_chinese_model_name_batch(self):
 								        """Call the pipeline on three inputs with ``batch_size=2``.

 								        The inputs are the full ``self.sentence_zh`` and two slices of it; the
 								        pipeline (model ``self.lstm_model_id``) output is printed, not
 								        asserted.
 								        """
 								        ner_pipeline = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.lstm_model_id)
 								        sentences = [
 								            self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:]
 								        ]
 								        predictions = ner_pipeline(input=sentences, batch_size=2)
 								        print(predictions)
-												[to #42322933]add token-cls test cases and bug fix 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10585502

  
											
										
										
											2022-11-01 09:56:15 +08:00
+								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
 								    def test_run_english_with_model_name(self):
 								        pipeline_ins = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.english_model_id)
-												[to #42322933]token preprocess bug fix 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10608664



											
										
										
											2022-11-08 15:42:08 +08:00
+								        print(pipeline_ins(input=self.sentence_en))
-												[to #42322933]add token-cls test cases and bug fix 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10585502

  
											
										
										
											2022-11-01 09:56:15 +08:00
-												[to #42322933]  Refactor NLP and fix some user feedbacks 

1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoint saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes user easier to override
12. Add safe_get to Config class

----------------------------  Another refactor from version 36 -------------------------

13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
      TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
       NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513

    * add save_pretrained to preprocessor

* save preprocessor config in hook

* refactor label-id mapping fetching logic

* test ok on sentence-similarity

* run on finetuning

* fix bug

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/nlp/nlp_base.py

* add params to init

* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics

* Split trainer init impls to overridable methods

* remove some obsolete tokenizers

* unfinished

* support input params in pipeline

* fix bugs

* fix ut bug

* fix bug

* fix ut bug

* fix ut bug

* fix ut bug

* add base class for some preprocessors

* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config

* compatible with old code

* fix ut bug

* fix ut bugs

* fix bug

* add some comments

* fix ut bug

* add a requirement

* fix pre-commit

* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config

* fixbug

* Support function type in registry

* fix ut bug

* fix bug

* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/utils/hub.py

* remove obsolete file

* rename init args

* rename params

* fix merge bug

* add default preprocessor config for ner-model

* move a method to a util file

* remove unused config

* Fix a bug in pbar

* bestckptsaver:change default ckpt numbers to 1

* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name

* Fix bug

* fix bug

* fix bug

* unfinished refactoring

* unfinished

* uw

* uw

* uw

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

# Conflicts:
#	modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
#	modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
#	modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
#	modelscope/preprocessors/nlp/text_generation_preprocessor.py

* uw

* uw

* unify nlp task outputs

* uw

* uw

* uw

* uw

* change the order of text cls pipeline

* refactor t5

* refactor tg task preprocessor

* fix

* unfinished

* temp

* refactor code

* unfinished

* unfinished

* unfinished

* unfinished

* uw

* Merge branch 'feat/refactor_config' into feat/refactor_trainer

* smoke test pass

* ut testing

* pre-commit passed

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/nlp/bert/document_segmentation.py
#	modelscope/pipelines/nlp/__init__.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py

* merge master

* unfinished

* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config

* fix bug

* fix ut bug

* support ner batch inference

* fix ut bug

* fix bug

* support batch inference on three nlp tasks

* unfinished

* fix bug

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/models/base/base_model.py
#	modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
#	modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
#	modelscope/pipelines/nlp/dialog_modeling_pipeline.py
#	modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
#	modelscope/pipelines/nlp/document_segmentation_pipeline.py
#	modelscope/pipelines/nlp/faq_question_answering_pipeline.py
#	modelscope/pipelines/nlp/feature_extraction_pipeline.py
#	modelscope/pipelines/nlp/fill_mask_pipeline.py
#	modelscope/pipelines/nlp/information_extraction_pipeline.py
#	modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
#	modelscope/pipelines/nlp/sentence_embedding_pipeline.py
#	modelscope/pipelines/nlp/summarization_pipeline.py
#	modelscope/pipelines/nlp/table_question_answering_pipeline.py
#	modelscope/pipelines/nlp/text2text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_classification_pipeline.py
#	modelscope/pipelines/nlp/text_error_correction_pipeline.py
#	modelscope/pipelines/nlp/text_generation_pipeline.py
#	modelscope/pipelines/nlp/text_ranking_pipeline.py
#	modelscope/pipelines/nlp/token_classification_pipeline.py
#	modelscope/pipelines/nlp/word_segmentation_pipeline.py
#	modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
#	modelscope/trainers/nlp_trainer.py

* pre-commit passed

* fix bug

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/preprocessors/__init__.py

* fix bug

* fix bug

* fix bug

* fix bug

* fix bug

* fixbug

* pre-commit passed

* fix bug

* fixbug

* fix bug

* fix bug

* fix bug

* fix bug

* self review done

* fixbug

* fix bug

* fix bug

* fix bugs

* remove sub-token offset mapping

* fix name bug

* add some tests

* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs

* add old logic back

* tmp save

* add tokenize by words logic back

* move outputs file back

* revert veco token-classification back

* fix typo

* Fix description

* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config

* Merge branch 'master' into feat/refactor_config

# Conflicts:
#	modelscope/pipelines/builder.py
											
										
										
											2022-11-30 23:52:17 +08:00
+								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_english_with_model_name_batch(self):
 								        """Call the ``self.english_model_id`` pipeline on three inputs, ``batch_size=2``.

 								        The inputs are ``self.ecom``, ``self.sentence_zh`` and
 								        ``self.sentence``; the output is printed, not asserted.
 								        """
 								        ner_pipeline = pipeline(
 								            task=Tasks.named_entity_recognition, model=self.english_model_id)
 								        sentences = [self.ecom, self.sentence_zh, self.sentence]
 								        predictions = ner_pipeline(input=sentences, batch_size=2)
 								        print(predictions)
 								    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
 								    def test_run_english_with_model_name_batch_iter(self):
 								        """Feed a list of three inputs to a pipeline built with ``padding=False``.

 								        The pipeline uses ``self.english_model_id``; the inputs are
 								        ``self.ecom``, ``self.sentence_zh`` and ``self.sentence``. The output
 								        is printed, not asserted.
 								        """
 								        ner_pipeline = pipeline(
 								            task=Tasks.named_entity_recognition,
 								            model=self.english_model_id,
 								            padding=False)
 								        sentences = [self.ecom, self.sentence_zh, self.sentence]
 								        predictions = ner_pipeline(input=sentences)
 								        print(predictions)
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
+								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
 								    def test_run_with_default_model(self):
 								        """Build the pipeline from the task name alone and run it on ``self.sentence``.

 								        No ``model`` argument is passed; the result is printed, not asserted.
 								        """
 								        ner_pipeline = pipeline(task=Tasks.named_entity_recognition)
 								        prediction = ner_pipeline(input=self.sentence)
 								        print(prediction)
-												[to #42322933] add UT for NER&EL models 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10897188

   
											
										
										
											2022-12-04 15:53:32 +08:00
+								    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
 								    def test_run_with_all_modelcards(self):
 								        """Run one pipeline per entry of ``self.all_modelcards_info``.

 								        Each entry supplies a ``model_id`` and a ``language``; the language
 								        selects the input sentence from ``self.language_examples``. Every
 								        model runs inside its own ``subTest`` and its output is printed, not
 								        asserted.
 								        """
 								        for card in self.all_modelcards_info:
 								            # Both lookups stay outside the subTest, as in the original.
 								            model_id = card['model_id']
 								            language = card['language']
 								            sentence = self.language_examples[language]
 								            with self.subTest(model_id=model_id):
 								                ner_pipeline = pipeline(Tasks.named_entity_recognition,
 								                                        model_id)
 								                prediction = ner_pipeline(input=sentence)
 								                print(prediction)
-												[to #42322933] skip demo test by default

											
										
										
											2022-09-09 14:56:15 +08:00
+								    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
-												[to #44657982] add unittest for demo and demotest utils 

unittest for demo service
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10006180

											
										
										
											2022-09-08 14:08:51 +08:00
+								    def test_demo_compatibility(self):
 								        """Delegate to ``self.compatibility_check()``.

 								        NOTE(review): presumably provided by the ``DemoCompatibilityCheck``
 								        base class -- confirm.
 								        """
 								        self.compatibility_check()
-												[to #42322933] NLP/命名实体识别模型（NER）接入 
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9435758

* add model nncrf, which includes tcrf(transformer-crf) and lcrf(lstm-crf, will be implemented in 830 version).
* add NER metainfo, pipeline
											
										
										
											2022-07-21 16:26:31 +08:00
 								# Run the unittest runner when this file is executed as a script.
 								if __name__ == '__main__':
 								    unittest.main()