2022-10-26 14:52:22 +08:00
|
|
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
|
|
|
import unittest
|
|
|
|
|
|
|
|
|
|
from modelscope.hub.snapshot_download import snapshot_download
|
|
|
|
|
from modelscope.models import Model
|
|
|
|
|
from modelscope.models.nlp import (LSTMCRFForNamedEntityRecognition,
|
|
|
|
|
TransformerCRFForNamedEntityRecognition)
|
|
|
|
|
from modelscope.pipelines import pipeline
|
[to #42322933] Refactor NLP and fix some user feedbacks
1. Abstract keys of dicts needed by nlp metric classes into the init method
2. Add Preprocessor.save_pretrained to save preprocessor information
3. Abstract the config saving function, which can lead to normally saving in the direct call of from_pretrained, and the modification of cfg one by one when training.
4. Remove SbertTokenizer and VecoTokenizer, use transformers' tokenizers instead
5. Use model/preprocessor's from_pretrained in all nlp pipeline classes.
6. Add model_kwargs and preprocessor_kwargs in all nlp pipeline classes
7. Add base classes for fill-mask and text-classification preprocessor, as a demo for later changes
8. Fix user feedback: Re-train the model in continue training scenario
9. Fix user feedback: Too many checkpoints saved
10. Simplify the nlp-trainer
11. Fix user feedback: Split the default trainer's __init__ method, which makes user easier to override
12. Add safe_get to Config class
---------------------------- Another refactor from version 36 -------------------------
13. Name all nlp transformers' preprocessors from TaskNamePreprocessor to TaskNameTransformersPreprocessor, for example:
TextClassificationPreprocessor -> TextClassificationTransformersPreprocessor
14. Add a base class per task for all nlp tasks' preprocessors which has at least two sub-preprocessors
15. Add output classes of nlp models
16. Refactor the logic for token-classification
17. Fix bug: checkpoint_hook does not support pytorch_model.pt
18. Fix bug: Pipeline name does not match with task name, so inference will not succeed after training
NOTE: This is just a stop bleeding solution, the root cause is the uncertainty of the relationship between models and pipelines
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10723513
* add save_pretrained to preprocessor
* save preprocessor config in hook
* refactor label-id mapping fetching logic
* test ok on sentence-similarity
* run on finetuning
* fix bug
* pre-commit passed
* fix bug
* Merge branch 'master' into feat/refactor_config
# Conflicts:
# modelscope/preprocessors/nlp/nlp_base.py
* add params to init
* 1. support max ckpt num 2. support ignoring others but bin file in continue training 3. add arguments to some nlp metrics
* Split trainer init impls to overridable methods
* remove some obsolete tokenizers
* unfinished
* support input params in pipeline
* fix bugs
* fix ut bug
* fix bug
* fix ut bug
* fix ut bug
* fix ut bug
* add base class for some preprocessors
* Merge commit '379867739548f394d0fa349ba07afe04adf4c8b6' into feat/refactor_config
* compatible with old code
* fix ut bug
* fix ut bugs
* fix bug
* add some comments
* fix ut bug
* add a requirement
* fix pre-commit
* Merge commit '0451b3d3cb2bebfef92ec2c227b2a3dd8d01dc6a' into feat/refactor_config
* fixbug
* Support function type in registry
* fix ut bug
* fix bug
* Merge commit '5f719e542b963f0d35457e5359df879a5eb80b82' into feat/refactor_config
# Conflicts:
# modelscope/pipelines/nlp/multilingual_word_segmentation_pipeline.py
# modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
# modelscope/pipelines/nlp/word_segmentation_pipeline.py
# modelscope/utils/hub.py
* remove obsolete file
* rename init args
* rename params
* fix merge bug
* add default preprocessor config for ner-model
* move a method to a util file
* remove unused config
* Fix a bug in pbar
* bestckptsaver:change default ckpt numbers to 1
* 1. Add assert to max_epoch 2. split init_dist and get_device 3. change cmp func name
* Fix bug
* fix bug
* fix bug
* unfinished refactoring
* unfinished
* uw
* uw
* uw
* uw
* Merge branch 'feat/refactor_config' into feat/refactor_trainer
# Conflicts:
# modelscope/preprocessors/nlp/document_segmentation_preprocessor.py
# modelscope/preprocessors/nlp/faq_question_answering_preprocessor.py
# modelscope/preprocessors/nlp/relation_extraction_preprocessor.py
# modelscope/preprocessors/nlp/text_generation_preprocessor.py
* uw
* uw
* unify nlp task outputs
* uw
* uw
* uw
* uw
* change the order of text cls pipeline
* refactor t5
* refactor tg task preprocessor
* fix
* unfinished
* temp
* refactor code
* unfinished
* unfinished
* unfinished
* unfinished
* uw
* Merge branch 'feat/refactor_config' into feat/refactor_trainer
* smoke test pass
* ut testing
* pre-commit passed
* Merge branch 'master' into feat/refactor_config
# Conflicts:
# modelscope/models/nlp/bert/document_segmentation.py
# modelscope/pipelines/nlp/__init__.py
# modelscope/pipelines/nlp/document_segmentation_pipeline.py
* merge master
* unfinished
* Merge branch 'feat/fix_bug_pipeline_name' into feat/refactor_config
* fix bug
* fix ut bug
* support ner batch inference
* fix ut bug
* fix bug
* support batch inference on three nlp tasks
* unfinished
* fix bug
* fix bug
* Merge branch 'master' into feat/refactor_config
# Conflicts:
# modelscope/models/base/base_model.py
# modelscope/pipelines/nlp/conversational_text_to_sql_pipeline.py
# modelscope/pipelines/nlp/dialog_intent_prediction_pipeline.py
# modelscope/pipelines/nlp/dialog_modeling_pipeline.py
# modelscope/pipelines/nlp/dialog_state_tracking_pipeline.py
# modelscope/pipelines/nlp/document_segmentation_pipeline.py
# modelscope/pipelines/nlp/faq_question_answering_pipeline.py
# modelscope/pipelines/nlp/feature_extraction_pipeline.py
# modelscope/pipelines/nlp/fill_mask_pipeline.py
# modelscope/pipelines/nlp/information_extraction_pipeline.py
# modelscope/pipelines/nlp/named_entity_recognition_pipeline.py
# modelscope/pipelines/nlp/sentence_embedding_pipeline.py
# modelscope/pipelines/nlp/summarization_pipeline.py
# modelscope/pipelines/nlp/table_question_answering_pipeline.py
# modelscope/pipelines/nlp/text2text_generation_pipeline.py
# modelscope/pipelines/nlp/text_classification_pipeline.py
# modelscope/pipelines/nlp/text_error_correction_pipeline.py
# modelscope/pipelines/nlp/text_generation_pipeline.py
# modelscope/pipelines/nlp/text_ranking_pipeline.py
# modelscope/pipelines/nlp/token_classification_pipeline.py
# modelscope/pipelines/nlp/word_segmentation_pipeline.py
# modelscope/pipelines/nlp/zero_shot_classification_pipeline.py
# modelscope/trainers/nlp_trainer.py
* pre-commit passed
* fix bug
* Merge branch 'master' into feat/refactor_config
# Conflicts:
# modelscope/preprocessors/__init__.py
* fix bug
* fix bug
* fix bug
* fix bug
* fix bug
* fixbug
* pre-commit passed
* fix bug
* fixbug
* fix bug
* fix bug
* fix bug
* fix bug
* self review done
* fixbug
* fix bug
* fix bug
* fix bugs
* remove sub-token offset mapping
* fix name bug
* add some tests
* 1. support batch inference of text-generation,text2text-generation,token-classification,text-classification 2. add corresponding UTs
* add old logic back
* tmp save
* add tokenize by words logic back
* move outputs file back
* revert veco token-classification back
* fix typo
* Fix description
* Merge commit '4dd99b8f6e4e7aefe047c68a1bedd95d3ec596d6' into feat/refactor_config
* Merge branch 'master' into feat/refactor_config
# Conflicts:
# modelscope/pipelines/builder.py
2022-11-30 23:52:17 +08:00
|
|
|
from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
|
2022-10-26 14:52:22 +08:00
|
|
|
from modelscope.preprocessors import NERPreprocessorThai, NERPreprocessorViet
|
|
|
|
|
from modelscope.utils.constant import Tasks
|
|
|
|
|
from modelscope.utils.demo_utils import DemoCompatibilityCheck
|
|
|
|
|
from modelscope.utils.test_utils import test_level
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MultilingualNamedEntityRecognitionTest(unittest.TestCase,
                                             DemoCompatibilityCheck):
    """Smoke tests for the named-entity-recognition pipeline.

    Covers a Thai model, a Vietnamese model and a multilingual model. Each
    test builds a pipeline in a different way and prints the prediction; none
    of the tests asserts on the predicted entities.
    """

    # Hub model ids and sample inputs shared by the tests below.
    thai_tcrf_model_id = 'damo/nlp_xlmr_named-entity-recognition_thai-ecommerce-title'
    thai_sentence = 'เครื่องชั่งดิจิตอลแบบตั้งพื้น150kg.'

    viet_tcrf_model_id = 'damo/nlp_xlmr_named-entity-recognition_viet-ecommerce-title'
    viet_sentence = 'Nón vành dễ thương cho bé gái'

    multilingual_model_id = 'damo/nlp_raner_named-entity-recognition_multilingual-large-generic'
    ml_stc = 'সমস্ত বেতন নিলামের সাধারণ ব্যবহারিক উদাহরণ বিভিন্ন পেনি নিলাম / বিডিং ফি নিলাম ওয়েবসাইটে পাওয়া যাবে।'

    def setUp(self) -> None:
        # NOTE(review): task / model_id are not read by any test in this
        # class; presumably DemoCompatibilityCheck.compatibility_check uses
        # them -- confirm against that mixin.
        self.task = Tasks.named_entity_recognition
        self.model_id = 'damo/nlp_xlmr_named-entity-recognition_thai-ecommerce-title'

    # Thai: download the snapshot, build preprocessor and model by hand, then
    # run both a directly constructed pipeline and one from the factory.
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_tcrf_by_direct_model_download_thai(self):
        model_dir = snapshot_download(self.thai_tcrf_model_id)
        preprocessor = NERPreprocessorThai(model_dir)
        ner_model = TransformerCRFForNamedEntityRecognition(
            model_dir, tokenizer=preprocessor)

        direct_pipeline = NamedEntityRecognitionPipeline(
            ner_model, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)

        direct_result = direct_pipeline(input=self.thai_sentence)
        print(f'thai_sentence: {self.thai_sentence}\n'
              f'pipeline1:{direct_result}')
        print()
        factory_result = factory_pipeline(input=self.thai_sentence)
        print(f'pipeline2: {factory_result}')

    # Thai: load the model object from the hub and pair it with an explicitly
    # constructed preprocessor.
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_tcrf_with_model_from_modelhub_thai(self):
        ner_model = Model.from_pretrained(self.thai_tcrf_model_id)
        preprocessor = NERPreprocessorThai(ner_model.model_dir)
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)
        prediction = ner(input=self.thai_sentence)
        print(prediction)

    # Thai: let the pipeline factory resolve everything from the model id.
    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_tcrf_with_model_name_thai(self):
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.thai_tcrf_model_id)
        prediction = ner(input=self.thai_sentence)
        print(prediction)

    # Multilingual model, resolved from its model id only.
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_tcrf_with_model_name_multilingual(self):
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.multilingual_model_id)
        prediction = ner(input=self.ml_stc)
        print(prediction)

    # Vietnamese: download the snapshot, build preprocessor and model by hand,
    # then run both a directly constructed pipeline and one from the factory.
    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_tcrf_by_direct_model_download_viet(self):
        model_dir = snapshot_download(self.viet_tcrf_model_id)
        preprocessor = NERPreprocessorViet(model_dir)
        ner_model = TransformerCRFForNamedEntityRecognition(
            model_dir, tokenizer=preprocessor)

        direct_pipeline = NamedEntityRecognitionPipeline(
            ner_model, preprocessor=preprocessor)
        factory_pipeline = pipeline(
            Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)

        direct_result = direct_pipeline(input=self.viet_sentence)
        print(f'viet_sentence: {self.viet_sentence}\n'
              f'pipeline1:{direct_result}')
        print()
        factory_result = factory_pipeline(input=self.viet_sentence)
        print(f'pipeline2: {factory_result}')

    # Vietnamese: load the model object from the hub and pair it with an
    # explicitly constructed preprocessor.
    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_tcrf_with_model_from_modelhub_viet(self):
        ner_model = Model.from_pretrained(self.viet_tcrf_model_id)
        preprocessor = NERPreprocessorViet(ner_model.model_dir)
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=ner_model,
            preprocessor=preprocessor)
        prediction = ner(input=self.viet_sentence)
        print(prediction)

    # Vietnamese: let the pipeline factory resolve everything from the model id.
    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_tcrf_with_model_name_viet(self):
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.viet_tcrf_model_id)
        prediction = ner(input=self.viet_sentence)
        print(prediction)

    # Vietnamese: list input (full sentence plus two slices of it) with an
    # explicit batch_size passed at call time.
    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_tcrf_with_model_name_viet_batch(self):
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.viet_tcrf_model_id)
        sentences = [
            self.viet_sentence,
            self.viet_sentence[:10],
            self.viet_sentence[5:],
        ]
        predictions = ner(input=sentences, batch_size=2)
        print(predictions)

    # Vietnamese: same list input, but without batch_size and with
    # padding=False handed to the pipeline factory.
    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_tcrf_with_model_name_viet_batch_iter(self):
        ner = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.viet_tcrf_model_id,
            padding=False)
        sentences = [
            self.viet_sentence,
            self.viet_sentence[:10],
            self.viet_sentence[5:],
        ]
        predictions = ner(input=sentences)
        print(predictions)

    # Always skipped; remove the decorator to run the compatibility check.
    @unittest.skip('demo compatibility test is only enabled on a needed-basis')
    def test_demo_compatibility(self):
        self.compatibility_check()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|