Files
modelscope/tests/pipelines/test_named_entity_recognition.py
Shen Huang 5ef842e38a enable token_cls_pipeline to inference on longer inputs and return entity probabilities (#551)
* allow token classification pipelines to predict longer sentences

* bugfix

* skip adaseq pipeline ut when connection error occurs

* return entity probabilities
2023-09-19 17:34:41 +08:00

494 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models import Model
from modelscope.models.nlp import (LSTMForTokenClassificationWithCRF,
ModelForTokenClassificationWithCRF)
from modelscope.pipelines import pipeline
from modelscope.pipelines.nlp import NamedEntityRecognitionPipeline
from modelscope.preprocessors import \
TokenClassificationTransformersPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level
class NamedEntityRecognitionTest(unittest.TestCase):
    """Smoke tests for the named-entity-recognition pipeline.

    Every test builds a pipeline for ``Tasks.named_entity_recognition`` and
    prints what it returns.  No test asserts on the returned value, so a test
    fails only when building or calling the pipeline raises.  Each test is
    gated on ``test_level()`` through ``unittest.skipUnless``.
    """

    # One sample sentence per language key.  ``all_modelcards_info`` refers
    # to these keys and ``test_run_with_all_modelcards`` looks them up.
    language_examples = {
        'zh':
        '新华社北京二月十一日电(记者唐虹)',
        'en':
        'Italy recalled Marcello Cuttitta',
        'ru':
        'важным традиционным промыслом является производство пальмового масла .',
        'fr':
        'fer à souder électronique',
        'es':
        'el primer avistamiento por europeos de esta zona fue en 1606 , '
        'en la expedición española mandada por luis váez de torres .',
        'nl':
        'in het vorige seizoen promoveerden sc cambuur , dat kampioen werd en go ahead eagles via de play offs .',
        'tr':
        'köyün pırasa kavurması ve içi yağlama ve akıtma adındaki hamur işleri meşhurdur . ; çörek ekmeği ; '
        'diye adlandırdıkları mayasız ekmeği unutmamaklazım .',
        'ko':
        '국립진주박물관은 1984년 11월 2일 개관하였으며 한국 전통목조탑을 석조 건물로 형상화한 것으로 건축가 김수근 선생의 대표적 작품이다 .',
        'fa':
        'ﺞﻤﻋیﺕ ﺍیﻥ ﺎﺴﺗﺎﻧ ۳۰ ﻩﺯﺍﺭ ﻦﻓﺭ ﺎﺴﺗ ﻭ ﻢﻧﺎﺒﻋ ﻢﻬﻣی ﺍﺯ ﺲﻧگ ﺂﻬﻧ ﺩﺍﺭﺩ .',
        'de':
        'die szene beinhaltete lenny baker und christopher walken .',
        'hi':
        '१४९२ में एक चार्टर के आधार पर, उसके पिता ने उसे वाडोविस के उत्तराधिकारी के रूप में छोड़ दिया।',
        'bn':
        'যদিও গির্জার সবসময় রাজকীয় পিউ থাকত, তবে গির্জায় রাজকীয়ভাবে এটিই ছিল প্রথম দেখা।',
        'multi':
        '新华社北京二月十一日电(记者唐虹)',
    }

    # Model ids exercised by ``test_run_with_all_modelcards``.  Each entry
    # names the ``language_examples`` key of the sentence fed to that model.
    all_modelcards_info = [
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-news',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-social_media',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-generic',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-resume',
            'language': 'zh'
        },
        {
            'model_id': 'damo/nlp_lstm_named-entity-recognition_chinese-news',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_lstm_named-entity-recognition_chinese-social_media',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_lstm_named-entity-recognition_chinese-generic',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_lstm_named-entity-recognition_chinese-resume',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-book',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-finance',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-game',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-bank',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-literature',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-cmeee',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-news',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-social_media',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-literature',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-politics',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-music',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-science',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-ai',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-wiki',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-large-generic',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-generic',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_multilingual-large-generic',
            'language': 'multi'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_russian-large-generic',
            'language': 'ru'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_spanish-large-generic',
            'language': 'es'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_dutch-large-generic',
            'language': 'nl'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_turkish-large-generic',
            'language': 'tr'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_korean-large-generic',
            'language': 'ko'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_farsi-large-generic',
            'language': 'fa'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_german-large-generic',
            'language': 'de'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_hindi-large-generic',
            'language': 'hi'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_bangla-large-generic',
            'language': 'bn'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-ecom',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_chinese-base-ecom-50cls',
            'language': 'zh'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_english-large-ecom',
            'language': 'en'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_russian-large-ecom',
            'language': 'ru'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_french-large-ecom',
            'language': 'fr'
        },
        {
            'model_id':
            'damo/nlp_raner_named-entity-recognition_spanish-large-ecom',
            'language': 'es'
        },
        {
            'model_id':
            'damo/nlp_structbert_keyphrase-extraction_base-icassp2023-mug-track4-baseline',
            'language': 'zh'
        },
        {
            'model_id': 'damo/nlp_raner_chunking_english-large',
            'language': 'en'
        },
    ]

    def setUp(self) -> None:
        """Set the task, the model ids and the sample inputs used below."""
        self.task = Tasks.named_entity_recognition
        # Same id as ``tcrf_model_id``; no test in this class reads it.
        self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
        self.english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom'
        self.chinese_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-large-generic'
        self.tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
        self.lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
        self.addr_model_id = 'damo/nlp_structbert_address-parsing_chinese_base'
        self.lstm_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-generic'
        self.sentence = '这与温岭市新河镇的一个神秘的传说有关。[SEP]地名'
        self.sentence_en = 'pizza shovel'
        # Characters are separated by single spaces in this sample.
        self.sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。'
        self.addr = '浙江省杭州市余杭区文一西路969号亲橙里'
        self.addr1 = '浙江省西湖区灵隐隧道'
        self.addr2 = '内蒙古自治区巴彦淖尔市'
        self.ecom = '欧美单 秋季女装时尚百搭休闲修身 亚麻混纺短款 外套西装'

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_tcrf_by_direct_model_download(self):
        """Download ``tcrf_model_id`` and run it through two pipelines.

        The snapshot directory feeds both the preprocessor and
        ``ModelForTokenClassificationWithCRF``.  The pipeline is built once
        from the pipeline class and once through ``pipeline()``.
        """
        cache_path = snapshot_download(self.tcrf_model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
        model = ModelForTokenClassificationWithCRF.from_pretrained(cache_path)
        pipeline1 = NamedEntityRecognitionPipeline(
            model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.named_entity_recognition,
            model=model,
            preprocessor=tokenizer)
        print(f'sentence: {self.sentence}\n'
              f'pipeline1:{pipeline1(input=self.sentence)}')
        print()
        print(f'pipeline2: {pipeline2(input=self.sentence)}')

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lcrf_by_direct_model_download(self):
        """Download ``lcrf_model_id`` and run it through two pipelines.

        Same flow as the tcrf variant, but the model is loaded with
        ``LSTMForTokenClassificationWithCRF``.
        """
        cache_path = snapshot_download(self.lcrf_model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(cache_path)
        model = LSTMForTokenClassificationWithCRF.from_pretrained(cache_path)
        pipeline1 = NamedEntityRecognitionPipeline(
            model, preprocessor=tokenizer)
        pipeline2 = pipeline(
            Tasks.named_entity_recognition,
            model=model,
            preprocessor=tokenizer)
        print(f'sentence: {self.sentence}\n'
              f'pipeline1:{pipeline1(input=self.sentence)}')
        print()
        print(f'pipeline2: {pipeline2(input=self.sentence)}')

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_tcrf_with_model_from_modelhub(self):
        """Load ``tcrf_model_id`` via ``Model.from_pretrained`` and run it."""
        model = Model.from_pretrained(self.tcrf_model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(
            model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=model,
            preprocessor=tokenizer)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_addrst_with_model_from_modelhub(self):
        """Load the address model via ``Model.from_pretrained`` and run it."""
        # Use the shared id from setUp instead of repeating the literal.
        model = Model.from_pretrained(self.addr_model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(
            model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=model,
            preprocessor=tokenizer)
        print(pipeline_ins(input=self.addr))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_addrst_with_model_name(self):
        """Build the address pipeline from its model id; run one address."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.addr_model_id)
        print(pipeline_ins(input=self.addr))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_addrst_with_model_name_batch(self):
        """Run three addresses through the address pipeline, batch_size=2."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.addr_model_id)
        print(
            pipeline_ins(
                input=[self.addr, self.addr1, self.addr2], batch_size=2))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_addrst_with_model_name_batch_iter(self):
        """Run a list of addresses with ``padding=False``, no batch_size."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.addr_model_id,
            padding=False)
        print(pipeline_ins(input=[self.addr, self.addr1, self.addr2]))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lcrf_with_model_from_modelhub(self):
        """Load ``lcrf_model_id`` via ``Model.from_pretrained`` and run it."""
        model = Model.from_pretrained(self.lcrf_model_id)
        tokenizer = TokenClassificationTransformersPreprocessor(
            model.model_dir)
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=model,
            preprocessor=tokenizer)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
    def test_run_tcrf_with_model_name(self):
        """Build the pipeline from ``tcrf_model_id`` and run one sentence."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.tcrf_model_id)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_lcrf_with_model_name(self):
        """Build the pipeline from ``lcrf_model_id`` and run one sentence."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.lcrf_model_id)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lcrf_with_chinese_model_name(self):
        """Run ``sentence_zh`` through the ``chinese_model_id`` pipeline."""
        # NOTE(review): the name says "lcrf" but the model id used here is
        # ``chinese_model_id``, not ``lcrf_model_id`` -- confirm intent.
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.chinese_model_id)
        print(pipeline_ins(input=self.sentence_zh))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lcrf_with_chinese_model_name_batch_iter(self):
        """Run a list input on ``chinese_model_id`` with ``padding=False``.

        The list holds the full sample plus a prefix and a suffix slice of
        it, so the three inputs have different lengths.
        """
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.chinese_model_id,
            padding=False)
        print(
            pipeline_ins(input=[
                self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:]
            ]))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lcrf_with_chinese_model_name_batch(self):
        """Run three unequal inputs on ``chinese_model_id``, batch_size=2."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.chinese_model_id)
        print(
            pipeline_ins(
                input=[
                    self.sentence_zh, self.sentence_zh[:20],
                    self.sentence_zh[10:]
                ],
                batch_size=2))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstm_with_chinese_model_name(self):
        """Run ``sentence_zh`` through the ``lstm_model_id`` pipeline."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.lstm_model_id)
        print(pipeline_ins(input=self.sentence_zh))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstm_with_chinese_model_name_batch_iter(self):
        """Run a list input on ``lstm_model_id`` with ``padding=False``."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.lstm_model_id,
            padding=False)
        print(
            pipeline_ins(input=[
                self.sentence_zh, self.sentence_zh[:20], self.sentence_zh[10:]
            ]))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_lstm_with_chinese_model_name_batch(self):
        """Run three unequal inputs on ``lstm_model_id``, batch_size=2."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.lstm_model_id)
        print(
            pipeline_ins(
                input=[
                    self.sentence_zh, self.sentence_zh[:20],
                    self.sentence_zh[10:]
                ],
                batch_size=2))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_english_with_model_name(self):
        """Run ``sentence_en`` through the ``english_model_id`` pipeline."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.english_model_id)
        print(pipeline_ins(input=self.sentence_en))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_english_with_model_name_batch(self):
        """Run three inputs on ``english_model_id`` with batch_size=2."""
        # NOTE(review): all three inputs are the Chinese samples from setUp
        # although the model id is the English one -- confirm intent.
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.english_model_id)
        print(
            pipeline_ins(
                input=[self.ecom, self.sentence_zh, self.sentence],
                batch_size=2))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_english_with_model_name_batch_iter(self):
        """Run a list input on ``english_model_id`` with ``padding=False``."""
        # NOTE(review): same Chinese inputs as the batch variant -- confirm.
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition,
            model=self.english_model_id,
            padding=False)
        print(pipeline_ins(input=[self.ecom, self.sentence_zh, self.sentence]))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_with_default_model(self):
        """Build the pipeline from the task alone, with no model id."""
        pipeline_ins = pipeline(task=Tasks.named_entity_recognition)
        print(pipeline_ins(input=self.sentence))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_long_chinese_with_model_name(self):
        """Run one very long input with ``split_max_length=300``."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.chinese_model_id)
        # Appending '. ' 1000 times makes the input longer than 512.
        long_input = self.sentence + '. ' * 1000
        print(pipeline_ins(input=long_input, split_max_length=300))

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_run_long_chinese_with_model_name_batch(self):
        """Run two very long inputs, batch_size=2, split_max_length=300."""
        pipeline_ins = pipeline(
            task=Tasks.named_entity_recognition, model=self.chinese_model_id)
        # Each input is longer than 512, as in the single-input variant.
        long_inputs = [self.sentence + '. ' * 1000] * 2
        print(
            pipeline_ins(
                input=long_inputs, batch_size=2, split_max_length=300))

    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_run_with_all_modelcards(self):
        """Run every model in ``all_modelcards_info`` on its sample sentence.

        Each model runs inside ``subTest`` so one failing model is reported
        under its ``model_id`` without stopping the remaining ones.
        """
        for item in self.all_modelcards_info:
            model_id = item['model_id']
            sentence = self.language_examples[item['language']]
            with self.subTest(model_id=model_id):
                # Keyword arguments, matching every other call in the class.
                pipeline_ins = pipeline(
                    task=Tasks.named_entity_recognition, model=model_id)
                print(pipeline_ins(input=sentence))
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()