[to #42322933] add extractive-summarization and topic-segmentation

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10856839
shichen.fsc
2022-11-25 19:29:02 +08:00
parent 2b62084146
commit acb8d36699
12 changed files with 552 additions and 56 deletions

View File

@@ -82,6 +82,7 @@ class Models(object):
     gpt_neo = 'gpt-neo'
     plug = 'plug'
     bert_for_ds = 'bert-for-document-segmentation'
+    ponet_for_ds = 'ponet-for-document-segmentation'
     ponet = 'ponet'
     T5 = 'T5'
     mglm = 'mglm'

@@ -257,6 +258,7 @@ class Pipelines(object):
     text_ranking = 'text-ranking'
     relation_extraction = 'relation-extraction'
     document_segmentation = 'document-segmentation'
+    extractive_summarization = 'extractive-summarization'
     feature_extraction = 'feature-extraction'
     mglm_text_summarization = 'mglm-text-summarization'
     translation_en_to_de = 'translation_en_to_de'  # keep it underscore

View File

@@ -21,16 +21,18 @@ __all__ = ['BertForDocumentSegmentation']
     Tasks.document_segmentation, module_name=Models.bert_for_ds)
 class BertForDocumentSegmentation(Model):

-    def __init__(self, model_dir: str, *args, **kwargs):
-        super().__init__(model_dir, *args, **kwargs)
+    def __init__(self, model_dir: str, model_config: Dict[str, Any], *args,
+                 **kwargs):
+        super().__init__(model_dir, model_config, *args, **kwargs)
+        self.model_cfg = model_config

     def build_with_config(self, config):
         self.bert_model = BertForDocumentSegmentationBase.from_pretrained(
             self.model_dir, from_tf=False, config=config)
         return self.bert_model

-    def forward(self, input: Dict[str, Dict]) -> Dict[str, Any]:
-        pass
+    def forward(self) -> Dict[str, Any]:
+        return self.model_cfg


 class BertForDocumentSegmentationBase(BertPreTrainedModel):

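For context: forward() on this wrapper now returns the model's configuration dict instead of running inference; the pipelines below read it to pick a backbone config and then call build_with_config(). A minimal sketch of that consumption path (not part of the diff; the model id is the Bert one used in the tests at the bottom, and the flow mirrors DocumentSegmentationPipeline.__init__ later in this commit):

from transformers.models.bert.modeling_bert import BertConfig
from modelscope.models import Model

model = Model.from_pretrained('damo/nlp_bert_document-segmentation_chinese-base')
model_cfg = model.forward()  # now the configuration dict, e.g. {'type': 'bert', 'level': 'doc', ...}
if model_cfg['type'] == 'bert':
    config = BertConfig.from_pretrained(model.model_dir, num_labels=2)
backbone = model.build_with_config(config=config)  # BertForDocumentSegmentationBase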
View File

@@ -22,12 +22,14 @@ if TYPE_CHECKING:
     from .backbone import (PoNetModel, PoNetPreTrainedModel)
     from .tokenization import PoNetTokenizer
     from .fill_mask import PoNetForMaskedLM
+    from .document_segmentation import PoNetForDocumentSegmentation
 else:
     _import_structure = {
         'configuration': ['PoNetConfig'],
         'backbone': ['PoNetModel', 'PoNetPreTrainedModel'],
         'fill_mask': ['PoNetForMaskedLM'],
         'tokenization': ['PoNetTokenizer'],
+        'document_segmentation': ['PoNetForDocumentSegmentation']
     }

     import sys

View File

@@ -600,8 +600,7 @@ class PoNetPooler(nn.Module):

 class PoNetPreTrainedModel(TorchModel, PreTrainedModel):
     """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
+    A base class to handle weights initialization and a simple interface for loading pretrained models.
     """

     config_class = PoNetConfig

@@ -643,6 +642,34 @@ class PoNetPreTrainedModel(TorchModel, PreTrainedModel):
         return model


+class PoNetPreTrainedModelV2(PreTrainedModel):
+    """
+    A base class to handle weights initialization and a simple interface for loading pretrained models.
+    """
+
+    config_class = PoNetConfig
+    base_model_prefix = 'ponet'
+    _keys_to_ignore_on_load_missing = [r'position_ids']
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(
+                mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
 @MODELS.register_module(Tasks.backbone, module_name=Models.ponet)
 class PoNetModel(PoNetPreTrainedModel):
     """The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.

View File

@@ -0,0 +1,115 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from typing import Any, Dict

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import TokenClassifierOutput

from modelscope.metainfo import Models
from modelscope.models.base import Model
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks
from .backbone import PoNetModel, PoNetPreTrainedModelV2

__all__ = ['PoNetForDocumentSegmentation']


@MODELS.register_module(
    Tasks.document_segmentation, module_name=Models.ponet_for_ds)
@MODELS.register_module(
    Tasks.extractive_summarization, module_name=Models.ponet_for_ds)
class PoNetForDocumentSegmentation(Model):

    def __init__(self, model_dir: str, model_config: Dict[str, Any], *args,
                 **kwargs):
        super().__init__(model_dir, model_config, *args, **kwargs)
        self.model_cfg = model_config

    def build_with_config(self, config):
        self.ponet_model = PoNetForDocumentSegmentationBase.from_pretrained(
            self.model_dir, config=config)
        return self.ponet_model

    def forward(self) -> Dict[str, Any]:
        return self.model_cfg


class PoNetForDocumentSegmentationBase(PoNetPreTrainedModelV2):
    _keys_to_ignore_on_load_unexpected = [r'pooler']

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.ponet = PoNetModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        segment_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ponet(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            segment_ids=segment_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1),
                    torch.tensor(loss_fct.ignore_index).type_as(labels))
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(
                    logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

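For context: PoNetForDocumentSegmentationBase is a plain token classifier over the sentence-boundary marker tokens, and both new pipelines run it the same way. A condensed sketch of that inference step (it mirrors the pipeline code later in this diff; `head` and `predict_dataset` are assumed to come from build_with_config() and DocumentSegmentationPreprocessor respectively):

import numpy as np
import torch

with torch.no_grad():
    # predict_dataset: dict of equal-length lists (input_ids, attention_mask, segment_ids, ...)
    inputs = {key: torch.tensor(val) for key, val in predict_dataset.items()}
    logits = head(**inputs).logits  # shape (num_samples, seq_len, num_labels=2)
predictions = np.argmax(logits, axis=2)  # indices into label_list = ['B-EOP', 'O']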
View File

@@ -11,6 +11,7 @@ if TYPE_CHECKING:
     from .dialog_modeling_pipeline import DialogModelingPipeline
     from .dialog_state_tracking_pipeline import DialogStateTrackingPipeline
     from .document_segmentation_pipeline import DocumentSegmentationPipeline
+    from .extractive_summarization_pipeline import ExtractiveSummarizationPipeline
     from .fasttext_sequence_classification_pipeline import FasttextSequenceClassificationPipeline
     from .faq_question_answering_pipeline import FaqQuestionAnsweringPipeline
     from .feature_extraction_pipeline import FeatureExtractionPipeline

@@ -45,6 +46,8 @@ else:
         'domain_classification_pipeline':
         ['FasttextSequenceClassificationPipeline'],
         'document_segmentation_pipeline': ['DocumentSegmentationPipeline'],
+        'extractive_summarization_pipeline':
+        ['ExtractiveSummarizationPipeline'],
         'faq_question_answering_pipeline': ['FaqQuestionAnsweringPipeline'],
         'feature_extraction_pipeline': ['FeatureExtractionPipeline'],
         'fill_mask_pipeline': ['FillMaskPipeline'],

View File

@@ -10,6 +10,7 @@ from transformers.models.bert.modeling_bert import BertConfig

 from modelscope.metainfo import Pipelines
 from modelscope.models import Model
+from modelscope.models.nlp.ponet.configuration import PoNetConfig
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES

@@ -35,7 +36,12 @@ class DocumentSegmentationPipeline(Pipeline):
                                     Model) else Model.from_pretrained(model)
         self.model_dir = model.model_dir

-        config = BertConfig.from_pretrained(model.model_dir, num_labels=2)
+        self.model_cfg = model.forward()
+        if self.model_cfg['type'] == 'bert':
+            config = BertConfig.from_pretrained(model.model_dir, num_labels=2)
+        elif self.model_cfg['type'] == 'ponet':
+            config = PoNetConfig.from_pretrained(model.model_dir, num_labels=2)

         self.document_segmentation_model = model.build_with_config(
             config=config)

@@ -47,23 +53,33 @@ class DocumentSegmentationPipeline(Pipeline):
         self.preprocessor = preprocessor

-    def __call__(self, documents: Union[List[str], str]) -> Dict[str, Any]:
+    def __call__(
+            self, documents: Union[List[List[str]], List[str],
+                                   str]) -> Dict[str, Any]:
         output = self.predict(documents)
         output = self.postprocess(output)
         return output

-    def predict(self, documents: Union[List[str], str]) -> Dict[str, Any]:
+    def predict(
+            self, documents: Union[List[List[str]], List[str],
+                                   str]) -> Dict[str, Any]:
         pred_samples = self.cut_documents(documents)
+        if self.model_cfg['level'] == 'topic':
+            paragraphs = pred_samples.pop('paragraphs')
         predict_examples = Dataset.from_dict(pred_samples)

         # Predict Feature Creation
-        predict_dataset = self.preprocessor(predict_examples)
+        predict_dataset = self.preprocessor(predict_examples, self.model_cfg)
         num_examples = len(
             predict_examples[self.preprocessor.context_column_name])
         num_samples = len(
             predict_dataset[self.preprocessor.context_column_name])

-        predict_dataset.pop('segment_ids')
+        if self.model_cfg['type'] == 'bert':
+            predict_dataset.pop('segment_ids')
         labels = predict_dataset.pop('labels')
         sentences = predict_dataset.pop('sentences')
         example_ids = predict_dataset.pop(

@@ -82,6 +98,7 @@ class DocumentSegmentationPipeline(Pipeline):
             predictions), 'sample {} infer_sample {} prediction {}'.format(
                 num_samples, len(sentences), len(predictions))

         # Remove ignored index (special tokens)
         true_predictions = [
             [
                 self.preprocessor.label_list[p]

@@ -99,10 +116,19 @@ class DocumentSegmentationPipeline(Pipeline):
         # Save predictions
         out = []
         for i in range(num_examples):
-            out.append({'sentences': [], 'labels': [], 'predictions': []})
+            if self.model_cfg['level'] == 'topic':
+                out.append({
+                    'sentences': [],
+                    'labels': [],
+                    'predictions': [],
+                    'paragraphs': paragraphs[i]
+                })
+            else:
+                out.append({'sentences': [], 'labels': [], 'predictions': []})

         for prediction, sentence_list, label, example_id in zip(
                 true_predictions, sentences, true_labels, example_ids):
-            if len(label) < len(sentence_list):
-                label.append('B-EOP')
-                prediction.append('B-EOP')
+            if self.model_cfg['level'] == 'doc':
+                if len(label) < len(sentence_list):
+                    label.append('B-EOP')
+                    prediction.append('B-EOP')

@@ -110,10 +136,18 @@ class DocumentSegmentationPipeline(Pipeline):
                     len(sentence_list), len(prediction))
                 assert len(sentence_list) == len(label), '{} {}'.format(
                     len(sentence_list), len(label))
             out[example_id]['sentences'].extend(sentence_list)
             out[example_id]['labels'].extend(label)
             out[example_id]['predictions'].extend(prediction)

+        if self.model_cfg['level'] == 'topic':
+            for i in range(num_examples):
+                assert len(out[i]['predictions']) + 1 == len(
+                    out[i]['paragraphs'])
+                out[i]['predictions'].append('B-EOP')
+                out[i]['labels'].append('B-EOP')
+
         return out

     def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:

@@ -126,7 +160,28 @@ class DocumentSegmentationPipeline(Pipeline):
             Dict[str, str]: the prediction results
         """
         result = []
+        res_preds = []
         list_count = len(inputs)
-        for num in range(list_count):
-            res = []
-            for s, p in zip(inputs[num]['sentences'],
+        if self.model_cfg['level'] == 'topic':
+            for num in range(list_count):
+                res = []
+                pred = []
+                for s, p, l in zip(inputs[num]['paragraphs'],
+                                   inputs[num]['predictions'],
+                                   inputs[num]['labels']):
+                    s = s.strip()
+                    if p == 'B-EOP':
+                        s = ''.join([s, '\n\n\t'])
+                        pred.append(1)
+                    else:
+                        s = ''.join([s, '\n\t'])
+                        pred.append(0)
+                    res.append(s)
+                res_preds.append(pred)
+                document = ('\t' + ''.join(res).strip())
+                result.append(document)
+        else:
+            for num in range(list_count):
+                res = []
+                for s, p in zip(inputs[num]['sentences'],

@@ -144,14 +199,45 @@ class DocumentSegmentationPipeline(Pipeline):
         else:
             return {OutputKeys.TEXT: result}

-    def cut_documents(self, para: Union[List[str], str]):
+    def cut_documents(self, para: Union[List[List[str]], List[str], str]):
         document_list = para
-        if isinstance(para, str):
-            document_list = [para]
+        paragraphs = []
         sentences = []
         labels = []
         example_id = []
         id = 0
-        for document in document_list:
-            sentence = self.cut_sentence(document)
-            label = ['O'] * (len(sentence) - 1) + ['B-EOP']
+        if self.model_cfg['level'] == 'topic':
+            if isinstance(para, str):
+                document_list = [[para]]
+            elif isinstance(para[0], str):
+                document_list = [para]
+            for document in document_list:
+                sentence = []
+                label = []
+                for item in document:
+                    sentence_of_current_paragraph = self.cut_sentence(item)
+                    sentence.extend(sentence_of_current_paragraph)
+                    label.extend(['-100']
+                                 * (len(sentence_of_current_paragraph) - 1)
+                                 + ['B-EOP'])
+                paragraphs.append(document)
+                sentences.append(sentence)
+                labels.append(label)
+                example_id.append(id)
+                id += 1
+            return {
+                'example_id': example_id,
+                'sentences': sentences,
+                'paragraphs': paragraphs,
+                'labels': labels
+            }
+        else:
+            if isinstance(para, str):
+                document_list = [para]
+            for document in document_list:
+                sentence = self.cut_sentence(document)
+                label = ['O'] * (len(sentence) - 1) + ['B-EOP']

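For context, a minimal end-user sketch of the new topic-segmentation path (not part of the diff). It assumes the standard modelscope pipeline factory; the model id is the one exercised by test_run_with_topic_segmentation at the bottom of this commit:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Any multi-sentence Chinese document works here; the test below uses a much longer passage.
document = '近年来端到端语音识别成为主流。非自回归模型可以并行输出目标文字。我们在工业大数据上对比了两类模型。'

p = pipeline(
    task=Tasks.document_segmentation,
    model='damo/nlp_ponet_document-segmentation_topic-level_chinese-base')
result = p(documents=document)
print(result[OutputKeys.TEXT])  # the text re-flowed with blank lines at predicted topic boundaries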
View File

@@ -0,0 +1,181 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import re
from typing import Any, Dict, List, Union

import numpy as np
import torch
from datasets import Dataset
from transformers.models.bert.modeling_bert import BertConfig

from modelscope.metainfo import Pipelines
from modelscope.models import Model
from modelscope.models.nlp.ponet.configuration import PoNetConfig
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Pipeline, Tensor
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import DocumentSegmentationPreprocessor
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['ExtractiveSummarizationPipeline']


@PIPELINES.register_module(
    Tasks.extractive_summarization,
    module_name=Pipelines.extractive_summarization)
class ExtractiveSummarizationPipeline(Pipeline):

    def __init__(self,
                 model: Union[Model, str],
                 preprocessor: DocumentSegmentationPreprocessor = None,
                 **kwargs):
        model = model if isinstance(model,
                                    Model) else Model.from_pretrained(model)
        self.model_dir = model.model_dir
        self.model_cfg = model.forward()

        if self.model_cfg['type'] == 'bert':
            config = BertConfig.from_pretrained(model.model_dir, num_labels=2)
        elif self.model_cfg['type'] == 'ponet':
            config = PoNetConfig.from_pretrained(model.model_dir, num_labels=2)

        self.extractive_summarization_model = model.build_with_config(
            config=config)

        if preprocessor is None:
            preprocessor = DocumentSegmentationPreprocessor(
                self.model_dir, config)

        super().__init__(model=model, preprocessor=preprocessor, **kwargs)

        self.preprocessor = preprocessor

    def __call__(self, documents: Union[List[str], str]) -> Dict[str, Any]:
        output = self.predict(documents)
        output = self.postprocess(output)
        return output

    def predict(self, documents: Union[List[str], str]) -> Dict[str, Any]:
        pred_samples = self.cut_documents(documents)
        predict_examples = Dataset.from_dict(pred_samples)

        # Predict Feature Creation
        predict_dataset = self.preprocessor(predict_examples, self.model_cfg)
        num_examples = len(
            predict_examples[self.preprocessor.context_column_name])
        num_samples = len(
            predict_dataset[self.preprocessor.context_column_name])

        labels = predict_dataset.pop('labels')
        sentences = predict_dataset.pop('sentences')
        example_ids = predict_dataset.pop(
            self.preprocessor.example_id_column_name)

        with torch.no_grad():
            input = {
                key: torch.tensor(val)
                for key, val in predict_dataset.items()
            }
            logits = self.extractive_summarization_model.forward(
                **input).logits

        predictions = np.argmax(logits, axis=2)
        assert len(sentences) == len(
            predictions), 'sample {} infer_sample {} prediction {}'.format(
                num_samples, len(sentences), len(predictions))

        # Remove ignored index (special tokens)
        true_predictions = [
            [
                self.preprocessor.label_list[p]
                for (p, l) in zip(prediction, label) if l != -100  # noqa *
            ] for prediction, label in zip(predictions, labels)
        ]

        true_labels = [
            [
                self.preprocessor.label_list[l]
                for (p, l) in zip(prediction, label) if l != -100  # noqa *
            ] for prediction, label in zip(predictions, labels)
        ]

        # Save predictions
        out = []
        for i in range(num_examples):
            out.append({'sentences': [], 'labels': [], 'predictions': []})

        for prediction, sentence_list, label, example_id in zip(
                true_predictions, sentences, true_labels, example_ids):
            if len(label) < len(sentence_list):
                label.append('O')
                prediction.append('O')
            assert len(sentence_list) == len(prediction), '{} {}'.format(
                len(sentence_list), len(prediction))
            assert len(sentence_list) == len(label), '{} {}'.format(
                len(sentence_list), len(label))
            out[example_id]['sentences'].extend(sentence_list)
            out[example_id]['labels'].extend(label)
            out[example_id]['predictions'].extend(prediction)

        return out

    def postprocess(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """process the prediction results

        Args:
            inputs (Dict[str, Any]): _description_

        Returns:
            Dict[str, str]: the prediction results
        """
        result = []
        list_count = len(inputs)
        for num in range(list_count):
            res = []
            for s, p in zip(inputs[num]['sentences'],
                            inputs[num]['predictions']):
                s = s.strip()
                if p == 'B-EOP':
                    res.append(s)
            result.append('\n'.join(res))

        if list_count == 1:
            return {OutputKeys.TEXT: result[0]}
        else:
            return {OutputKeys.TEXT: result}

    def cut_documents(self, para: Union[List[str], str]):
        if isinstance(para, str):
            document_list = [para]
        else:
            document_list = para

        sentences = []
        labels = []
        example_id = []
        id = 0
        for document in document_list:
            sentence = self.cut_sentence(document)
            label = ['O'] * (len(sentence) - 1) + ['B-EOP']
            sentences.append(sentence)
            labels.append(label)
            example_id.append(id)
            id += 1

        return {
            'example_id': example_id,
            'sentences': sentences,
            'labels': labels
        }

    def cut_sentence(self, para):
        para = re.sub(r'([。!.!\?])([^”’])', r'\1\n\2', para)  # noqa *
        para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)  # noqa *
        para = re.sub(r'(\…{2})([^”’])', r'\1\n\2', para)  # noqa *
        para = re.sub(r'([。!?\?][”’])([^,。!?\?])', r'\1\n\2', para)  # noqa *
        para = para.rstrip()
        return [_ for _ in para.split('\n') if _]

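For context, a minimal usage sketch of the new pipeline (not part of the diff). The concrete extractive-summarization model id is not visible in this commit (its test sits in the suppressed diff at the end), so the id below is a placeholder:

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

summarizer = pipeline(
    task=Tasks.extractive_summarization,
    model='<ponet-extractive-summarization-model-id>')  # placeholder, not a real model id
result = summarizer(documents='第一句。第二句。第三句。')  # a str, or a list of str for batch input
print(result[OutputKeys.TEXT])  # the sentences predicted as 'B-EOP', joined by newlines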
View File

@@ -37,7 +37,7 @@ class DocumentSegmentationPreprocessor(NLPBasePreprocessor):
         self.max_seq_length = config.max_position_embeddings
         self.label_list = ['B-EOP', 'O']

-    def __call__(self, examples) -> Dict[str, Any]:
+    def __call__(self, examples, model_cfg=None) -> Dict[str, Any]:
         questions = examples[self.question_column_name]
         contexts = examples[self.context_column_name]
         example_ids = examples[self.example_id_column_name]

@@ -72,6 +72,8 @@ class DocumentSegmentationPreprocessor(NLPBasePreprocessor):
             example_token_labels = []
             segment_id = []
             cur_seg_id = 1
+            para_segment_id = []
+            cut_para_seg_id = 1
             for token_index in range(len(example_input_ids)):
                 if example_input_ids[token_index] in self.target_specical_ids:
                     example_token_labels.append(example_labels[cur_seg_id - 1])

@@ -81,6 +83,16 @@ class DocumentSegmentationPreprocessor(NLPBasePreprocessor):
                     example_token_labels.append(-100)
                     segment_id.append(cur_seg_id)

+                if example_token_labels[token_index] != -100:
+                    para_segment_id.append(cut_para_seg_id)
+                    cut_para_seg_id += 1
+                else:
+                    para_segment_id.append(cut_para_seg_id)
+
-            segment_ids.append(segment_id)
+            if model_cfg is not None and model_cfg[
+                    'type'] == 'ponet' and model_cfg['level'] == 'topic':
+                segment_ids.append(para_segment_id)
+            else:
+                segment_ids.append(segment_id)
             token_seq_labels.append(example_token_labels)

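For context: with a topic-level PoNet model the preprocessor now hands the backbone paragraph-level segment ids (para_segment_id) instead of sentence-level ones (segment_id). A small self-contained trace of the two sequences, under the assumption that in topic mode only paragraph-final sentence markers carry a real (non -100) token label:

# S = sentence-boundary special token, w = ordinary wordpiece token.
tokens = ['S', 'w', 'w', 'S', 'w', 'S', 'w']
token_labels = [-100, -100, -100, 0, -100, 0, -100]  # a label only on paragraph-final markers

segment_id, cur_seg_id = [], 1            # existing behaviour: one id per sentence
para_segment_id, cut_para_seg_id = [], 1  # new behaviour: one id per paragraph
for tok, lab in zip(tokens, token_labels):
    segment_id.append(cur_seg_id)
    if tok == 'S':
        cur_seg_id += 1
    para_segment_id.append(cut_para_seg_id)
    if lab != -100:
        cut_para_seg_id += 1

print(segment_id)       # [1, 2, 2, 2, 3, 3, 4] -> increments at every sentence marker
print(para_segment_id)  # [1, 1, 1, 1, 2, 2, 3] -> increments only at paragraph-final markers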
View File

@@ -127,6 +127,7 @@ class NLPTasks(object):
     faq_question_answering = 'faq-question-answering'
     information_extraction = 'information-extraction'
     document_segmentation = 'document-segmentation'
+    extractive_summarization = 'extractive-summarization'
     feature_extraction = 'feature-extraction'

View File

@@ -17,10 +17,11 @@ class DocumentSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):

     def setUp(self) -> None:
         self.task = Tasks.document_segmentation
+        self.model_id = 'damo/nlp_bert_document-segmentation_chinese-base'

-    model_id = 'damo/nlp_bert_document-segmentation_chinese-base'
-    eng_model_id = 'damo/nlp_bert_document-segmentation_english-base'
+    bert_ds_model_id = 'damo/nlp_bert_document-segmentation_chinese-base'
+    bert_ds_eng_model_id = 'damo/nlp_bert_document-segmentation_english-base'
+    ponet_ts_model_id = 'damo/nlp_ponet_document-segmentation_topic-level_chinese-base'
     sentences = '近年来随着端到端语音识别的流行基于Transformer结构的语音识别系统逐渐成为了主流。然而由于Transformer是一种自回归模型需要逐个生成目标文字计算复杂度随着目标文字数量线性增加限制了其在工业生产中的应用。针对Transoformer模型自回归生成文字的低计算效率缺陷学术界提出了非自回归模型来并行的输出目标文字。根据生成目标文字时迭代轮数非自回归模型分为多轮迭代式与单轮迭代非自回归模型。其中实用的是基于单轮迭代的非自回归模型。对于单轮非自回归模型现有工作往往聚焦于如何更加准确的预测目标文字个数如CTC-enhanced采用CTC预测输出文字个数尽管如此考虑到现实应用中语速、口音、静音以及噪声等因素的影响如何准确的预测目标文字个数以及抽取目标文字对应的声学隐变量仍然是一个比较大的挑战另外一方面我们通过对比自回归模型与单轮非自回归模型在工业大数据上的错误类型如下图所示AR与vanilla NAR发现相比于自回归模型非自回归模型在预测目标文字个数方面差距较小但是替换错误显著的增加我们认为这是由于单轮非自回归模型中条件独立假设导致的语义信息丢失。于此同时目前非自回归模型主要停留在学术验证阶段还没有工业大数据上的相关实验与结论。'  # noqa *
     sentences_1 = '移动端语音唤醒模型检测关键词为“小云小云”。模型主体为4层FSMN结构使用CTC训练准则参数量750K适用于移动端设备运行。模型输入为Fbank特征输出为基于char建模的中文全集token预测测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式basetrain过程使用大量内部移动端数据在此基础上使用1万条设备端录制安静场景“小云小云”数据进行微调得到最终面向业务的模型。后续用户可在basetrain模型基础上使用其他关键词数据进行微调得到新的语音唤醒模型但暂时未开放模型finetune功能。'  # noqa *
     eng_sentences = 'The Saint Alexander Nevsky Church was established in 1936 by Archbishop Vitaly (Maximenko) () on a tract of land donated by Yulia Martinovna Plavskaya.The initial chapel, dedicated to the memory of the great prince St. Alexander Nevsky (12201263), was blessed in May, 1936.The church building was subsequently expanded three times.In 1987, ground was cleared for the construction of the new church and on September 12, 1989, on the Feast Day of St. Alexander Nevsky, the cornerstone was laid and the relics of St. Herman of Alaska placed in the foundation.The imposing edifice, completed in 1997, is the work of Nikolaus Karsanov, architect and Protopresbyter Valery Lukianov, engineer.Funds were raised through donations.The Great blessing of the cathedral took place on October 18, 1997 with seven bishops, headed by Metropolitan Vitaly Ustinov, and 36 priests and deacons officiating, some 800 faithful attended the festivity.The old church was rededicated to Our Lady of Tikhvin.Metropolitan Hilarion (Kapral) announced, that cathedral will officially become the episcopal See of the Ruling Bishop of the Eastern American Diocese and the administrative center of the Diocese on September 12, 2014.At present the parish serves the spiritual needs of 300 members.The parochial school instructs over 90 boys and girls in religion, Russian language and history.The school meets every Saturday.The choir is directed by Andrew Burbelo.The sisterhood attends to the needs of the church and a church council acts in the administration of the community.The cathedral is decorated by frescoes in the Byzantine style.The iconography project was fulfilled by Father Andrew Erastov and his students from 1995 until 2001.'  # noqa *

@@ -31,23 +32,32 @@ class DocumentSegmentationTest(unittest.TestCase, DemoCompatibilityCheck):
         return result

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_document(self):
-        logger.info('Run document segmentation with one document ...')
+    def test_run_with_document_segmentation(self):
+        logger.info('Run document segmentation (Bert) with one document ...')
         result = self.run_pipeline(
-            model_id=self.model_id, documents=self.sentences)
+            model_id=self.bert_ds_model_id, documents=self.sentences)
         print(result[OutputKeys.TEXT])

         result = self.run_pipeline(
-            model_id=self.eng_model_id, documents=self.eng_sentences)
+            model_id=self.bert_ds_eng_model_id, documents=self.eng_sentences)
         print(result[OutputKeys.TEXT])

     @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
-    def test_run_with_documents(self):
-        logger.info('Run document segmentation with many documents ...')
+    def test_run_with_topic_segmentation(self):
+        logger.info('Run topic segmentation (PoNet) with one document ...')
         result = self.run_pipeline(
-            model_id=self.model_id,
+            model_id=self.ponet_ts_model_id, documents=self.sentences)
+        # print("return:")
+        print(result[OutputKeys.TEXT])
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_run_with_documents_segmentation(self):
+        logger.info('Run document segmentation (Bert) with many documents ...')
+        result = self.run_pipeline(
+            model_id=self.bert_ds_model_id,
             documents=[self.sentences, self.sentences_1])
         documents_list = result[OutputKeys.TEXT]

File diff suppressed because one or more lines are too long