Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-25 12:39:25 +01:00)
Merge branch 'master' into dev_face2d_fixbug
@@ -128,7 +128,7 @@ class TorchModelExporter(Exporter):
args_list = list(args)
else:
args_list = [args]
if isinstance(args_list[-1], dict):
if isinstance(args_list[-1], Mapping):
args_dict = args_list[-1]
args_list = args_list[:-1]
n_nonkeyword = len(args_list)
@@ -284,9 +284,8 @@ class TorchModelExporter(Exporter):
'Model property dummy_inputs must be set.')
dummy_inputs = collate_fn(dummy_inputs, device)
if isinstance(dummy_inputs, Mapping):
dummy_inputs = self._decide_input_format(model, dummy_inputs)
dummy_inputs_filter = []
for _input in dummy_inputs:
for _input in self._decide_input_format(model, dummy_inputs):
if _input is not None:
dummy_inputs_filter.append(_input)
else:
@@ -23,7 +23,8 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
API_RESPONSE_FIELD_MESSAGE,
API_RESPONSE_FIELD_USERNAME,
DEFAULT_CREDENTIALS_PATH,
MODELSCOPE_ENVIRONMENT, ONE_YEAR_SECONDS,
MODELSCOPE_ENVIRONMENT,
MODELSCOPE_USERNAME, ONE_YEAR_SECONDS,
Licenses, ModelVisibility)
from modelscope.hub.errors import (InvalidParameter, NotExistError,
NotLoginException, NoValidRevisionError,
@@ -38,8 +39,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DEFAULT_MODEL_REVISION,
DEFAULT_REPOSITORY_REVISION,
MASTER_MODEL_BRANCH, DatasetFormations,
DatasetMetaFormats, DownloadMode,
ModelFile)
DatasetMetaFormats, DownloadChannel,
DownloadMode, ModelFile)
from modelscope.utils.logger import get_logger
from .utils.utils import (get_endpoint, get_release_datetime,
model_id_to_group_owner_name)
@@ -645,6 +646,25 @@ class HubApi:
def check_local_cookies(self, use_cookies) -> CookieJar:
return self._check_cookie(use_cookies=use_cookies)

def dataset_download_uv(self, dataset_name: str, namespace: str):
if not dataset_name or not namespace:
raise ValueError('dataset_name or namespace cannot be empty!')

# get channel and user_name
channel = DownloadChannel.LOCAL.value
user_name = ''
if MODELSCOPE_ENVIRONMENT in os.environ:
channel = os.environ[MODELSCOPE_ENVIRONMENT]
if MODELSCOPE_USERNAME in os.environ:
user_name = os.environ[MODELSCOPE_USERNAME]

url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/uv/{channel}?user={user_name}'
cookies = ModelScopeConfig.get_cookies()
r = requests.post(url, cookies=cookies, headers=self.headers)
resp = r.json()
raise_on_error(resp)
return resp['Message']


class ModelScopeConfig:
path_credential = expanduser(DEFAULT_CREDENTIALS_PATH)
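
Aside: the hunk above adds a download UV/PV reporting call to HubApi. A minimal sketch of calling the new method directly; the dataset name and namespace below are placeholders, not taken from this commit, and in practice MsDataset.load invokes it automatically (see the MsDataset hunk further down):

# Hedged sketch: 'some_dataset' / 'some_namespace' are hypothetical values.
from modelscope.hub.api import HubApi

api = HubApi()
# Reports one download "unique visit" for the dataset. The channel falls back to
# DownloadChannel.LOCAL and the user name to '' unless MODELSCOPE_ENVIRONMENT /
# MODELSCOPE_USERNAME are set in the environment.
message = api.dataset_download_uv(
    dataset_name='some_dataset', namespace='some_namespace')
print(message)
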
@@ -760,14 +780,18 @@ class ModelScopeConfig:
env = 'custom'
if MODELSCOPE_ENVIRONMENT in os.environ:
env = os.environ[MODELSCOPE_ENVIRONMENT]
user_name = 'unknown'
if MODELSCOPE_USERNAME in os.environ:
user_name = os.environ[MODELSCOPE_USERNAME]

ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s' % (
ua = 'modelscope/%s; python/%s; session_id/%s; platform/%s; processor/%s; env/%s; user/%s' % (
__version__,
platform.python_version(),
ModelScopeConfig.get_user_session_id(),
platform.platform(),
platform.processor(),
env,
user_name,
)
if isinstance(user_agent, dict):
ua = '; '.join(f'{k}/{v}' for k, v in user_agent.items())
@@ -18,6 +18,7 @@ API_RESPONSE_FIELD_EMAIL = 'Email'
API_RESPONSE_FIELD_MESSAGE = 'Message'
MODELSCOPE_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT'
MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG'
MODELSCOPE_USERNAME = 'MODELSCOPE_USERNAME'
ONE_YEAR_SECONDS = 24 * 365 * 60 * 60
@@ -349,11 +349,13 @@ class CLIP(nn.Module):
text_num_hidden_layers: int,
text_type_vocab_size: int,
tokenizer: FullTokenizer,
# vision_head_width, added this param for ViT-H
vision_head_width: int = 64,
):
super().__init__()

if isinstance(vision_layers, (tuple, list)):
vision_heads = vision_width * 32 // 64
vision_heads = vision_width * 32 // vision_head_width
self.visual = ModifiedResNet(
layers=vision_layers,
output_dim=embed_dim,
@@ -361,7 +363,7 @@ class CLIP(nn.Module):
input_resolution=image_resolution,
width=vision_width)
else:
vision_heads = vision_width // 64
vision_heads = vision_width // vision_head_width
self.visual = VisualTransformer(
input_resolution=image_resolution,
patch_size=vision_patch_size,
modelscope/models/science/unifold/modules/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
# The Uni-fold implementation is also open-sourced by the authors under Apache-2.0 license,
# and is publicly available at https://github.com/dptech-corp/Uni-Fold.
"""Unifold Modules."""
@@ -274,6 +274,8 @@ class MsDataset:
try:
api.on_dataset_download(
dataset_name=download_dataset, namespace=namespace)
api.dataset_download_uv(
dataset_name=download_dataset, namespace=namespace)
except Exception as e:
logger.error(e)
@@ -491,17 +491,8 @@ TASK_OUTPUTS = {
# word segmentation result for single sample
# {
# "output": "今天 天气 不错 , 适合 出去 游玩"
# "labels": [
# {'word': '今天', 'label': 'PROPN'},
# {'word': '天气', 'label': 'PROPN'},
# {'word': '不错', 'label': 'VERB'},
# {'word': ',', 'label': 'NUM'},
# {'word': '适合', 'label': 'NOUN'},
# {'word': '出去', 'label': 'PART'},
# {'word': '游玩', 'label': 'ADV'},
# ]
# }
Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS],
Tasks.word_segmentation: [OutputKeys.OUTPUT],

# TODO @wenmeng.zwm support list of result check
# named entity recognition result for single sample
@@ -93,9 +93,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/cv_resnet50_live-category'),
Tasks.video_category: (Pipelines.video_category,
'damo/cv_resnet50_video-category'),
Tasks.multi_modal_embedding:
(Pipelines.multi_modal_embedding,
'damo/multi-modal_clip-vit-large-patch14_zh'),
Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding,
'damo/multi-modal_clip-vit-base-patch16_zh'),
Tasks.generative_multi_modal_embedding:
(Pipelines.generative_multi_modal_embedding,
'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
@@ -109,13 +109,13 @@ class TokenClassificationPipeline(Pipeline):
chunk['span'] = text[chunk['start']:chunk['end']]
chunks.append(chunk)

# for cws output
# for cws outputs
if len(chunks) > 0 and chunks[0]['type'] == 'cws':
spans = [
chunk['span'] for chunk in chunks if chunk['span'].strip()
]
seg_result = ' '.join(spans)
outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []}
outputs = {OutputKeys.OUTPUT: seg_result}

# for ner outputs
else:
@@ -115,15 +115,15 @@ class WordSegmentationPipeline(Pipeline):
chunk['span'] = text[chunk['start']:chunk['end']]
chunks.append(chunk)

# for cws output
# for cws outputs
if len(chunks) > 0 and chunks[0]['type'] == 'cws':
spans = [
chunk['span'] for chunk in chunks if chunk['span'].strip()
]
seg_result = ' '.join(spans)
outputs = {OutputKeys.OUTPUT: seg_result, OutputKeys.LABELS: []}
outputs = {OutputKeys.OUTPUT: seg_result}

# for ner outpus
# for ner output
else:
outputs = {OutputKeys.OUTPUT: chunks}
return outputs
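
Aside: the two pipeline hunks above drop the now-empty labels field from the CWS result. A hedged sketch of the resulting call, using the default word-segmentation model resolved by the pipeline and the sample sentence from the TASK_OUTPUTS comment; the exact output string depends on the model:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

p = pipeline(task=Tasks.word_segmentation)
print(p('今天天气不错，适合出去游玩'))
# Roughly: {'output': '今天 天气 不错 , 适合 出去 游玩'}
# (the OutputKeys.LABELS entry is no longer returned for CWS after this commit)
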
@@ -96,7 +96,6 @@ class OfaPreprocessor(Preprocessor):
data = input
else:
data = self._build_dict(input)
data = self._ofa_input_compatibility_conversion(data)
sample = self.preprocess(data)
str_data = dict()
for k, v in data.items():
@@ -34,6 +34,7 @@ class NLPBasePreprocessor(Preprocessor, ABC):
label=None,
label2id=None,
mode=ModeKeys.INFERENCE,
use_fast=None,
**kwargs):
"""The NLP preprocessor base class.
@@ -45,14 +46,18 @@ class NLPBasePreprocessor(Preprocessor, ABC):
label2id: An optional label2id mapping, the class will try to call utils.parse_label_mapping
if this mapping is not supplied.
mode: Run this preprocessor in either 'train'/'eval'/'inference' mode
use_fast: use the fast version of tokenizer

"""
self.model_dir = model_dir
self.first_sequence = first_sequence
self.second_sequence = second_sequence
self.label = label

self.use_fast = kwargs.pop('use_fast', None)
if self.use_fast is None and os.path.isfile(
self.use_fast = use_fast
if self.use_fast is None and model_dir is None:
self.use_fast = False
elif self.use_fast is None and os.path.isfile(
os.path.join(model_dir, 'tokenizer_config.json')):
with open(os.path.join(model_dir, 'tokenizer_config.json'),
'r') as f:
@@ -61,8 +66,8 @@ class NLPBasePreprocessor(Preprocessor, ABC):
self.use_fast = False if self.use_fast is None else self.use_fast

self.label2id = label2id
if self.label2id is None:
self.label2id = parse_label_mapping(self.model_dir)
if self.label2id is None and model_dir is not None:
self.label2id = parse_label_mapping(model_dir)
super().__init__(mode, **kwargs)

@property
@@ -106,6 +111,7 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor):
label: str = 'label',
label2id: dict = None,
mode: str = ModeKeys.INFERENCE,
use_fast: bool = None,
**kwargs):
"""The NLP tokenizer preprocessor base class.
@@ -122,11 +128,12 @@ class NLPTokenizerPreprocessorBase(NLPBasePreprocessor):
- config.json label2id/id2label
- label_mapping.json
mode: Run this preprocessor in either 'train'/'eval'/'inference' mode, the behavior may be different.
use_fast: use the fast version of tokenizer
kwargs: These kwargs will be directly fed into the tokenizer.
"""

super().__init__(model_dir, first_sequence, second_sequence, label,
label2id, mode)
label2id, mode, use_fast, **kwargs)
self.model_dir = model_dir
self.tokenize_kwargs = kwargs
self.tokenizer = self.build_tokenizer(model_dir)
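
Aside: the preprocessor hunks above thread a new use_fast flag through the NLP preprocessor base classes. A minimal sketch of passing it via Preprocessor.from_pretrained, mirroring the trainer hunk further down; the model directory is a placeholder:

from modelscope.preprocessors import Preprocessor

# 'path/to/model_dir' is hypothetical. Passing use_fast=True explicitly requests
# the fast tokenizer; when left as None, the class falls back to the
# tokenizer_config.json lookup (or False without a model_dir) per the logic above.
preprocessor = Preprocessor.from_pretrained('path/to/model_dir', use_fast=True)
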
@@ -2,6 +2,7 @@

from typing import Any, Dict, Tuple, Union

import numpy as np
import torch

from modelscope.metainfo import Preprocessors
@@ -20,9 +21,7 @@ class WordSegmentationBlankSetToLabelPreprocessor(NLPBasePreprocessor):
"""

def __init__(self, **kwargs):
super().__init__(**kwargs)
self.first_sequence: str = kwargs.pop('first_sequence',
'first_sequence')
self.first_sequence: str = kwargs.pop('first_sequence', 'tokens')
self.label = kwargs.pop('label', OutputKeys.LABELS)

def __call__(self, data: str) -> Union[Dict[str, Any], Tuple]:
@@ -80,10 +79,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
'is_split_into_words', False)
if 'label2id' in kwargs:
kwargs.pop('label2id')
self.tokenize_kwargs = kwargs

@type_assert(object, str)
def __call__(self, data: str) -> Dict[str, Any]:
@type_assert(object, (str, dict))
def __call__(self, data: Union[dict, str]) -> Dict[str, Any]:
"""process the raw input data

Args:
@@ -99,18 +97,24 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
text = None
labels_list = None
if isinstance(data, str):
# for inference inputs without label
text = data
self.tokenize_kwargs['add_special_tokens'] = False
elif isinstance(data, dict):
# for finetune inputs with label
text = data.get(self.first_sequence)
labels_list = data.get(self.label)
if isinstance(text, list):
self.tokenize_kwargs['is_split_into_words'] = True

input_ids = []
label_mask = []
offset_mapping = []
if self.is_split_into_words:
for offset, token in enumerate(list(data)):
subtoken_ids = self.tokenizer.encode(
token, add_special_tokens=False)
token_type_ids = []
if self.is_split_into_words and self._mode == ModeKeys.INFERENCE:
for offset, token in enumerate(list(text)):
subtoken_ids = self.tokenizer.encode(token,
**self.tokenize_kwargs)
if len(subtoken_ids) == 0:
subtoken_ids = [self.tokenizer.unk_token_id]
input_ids.extend(subtoken_ids)
@@ -119,10 +123,9 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
else:
if self.tokenizer.is_fast:
encodings = self.tokenizer(
text,
add_special_tokens=False,
return_offsets_mapping=True,
**self.tokenize_kwargs)
text, return_offsets_mapping=True, **self.tokenize_kwargs)
attention_mask = encodings['attention_mask']
token_type_ids = encodings['token_type_ids']
input_ids = encodings['input_ids']
word_ids = encodings.word_ids()
for i in range(len(word_ids)):
@@ -137,75 +140,85 @@ class TokenClassificationPreprocessor(NLPTokenizerPreprocessorBase):
label_mask.append(1)
offset_mapping.append(encodings['offset_mapping'][i])
else:
encodings = self.tokenizer(
text, add_special_tokens=False, **self.tokenize_kwargs)
encodings = self.tokenizer(text, **self.tokenize_kwargs)
input_ids = encodings['input_ids']
label_mask, offset_mapping = self.get_label_mask_and_offset_mapping(
text)

if len(input_ids) >= self.sequence_length - 2:
input_ids = input_ids[:self.sequence_length - 2]
label_mask = label_mask[:self.sequence_length - 2]
input_ids = [self.tokenizer.cls_token_id
] + input_ids + [self.tokenizer.sep_token_id]
label_mask = [0] + label_mask + [0]
attention_mask = [1] * len(input_ids)
offset_mapping = offset_mapping[:sum(label_mask)]

if not self.is_transformer_based_model:
input_ids = input_ids[1:-1]
attention_mask = attention_mask[1:-1]
label_mask = label_mask[1:-1]

if self._mode == ModeKeys.INFERENCE:
if len(input_ids) >= self.sequence_length - 2:
input_ids = input_ids[:self.sequence_length - 2]
label_mask = label_mask[:self.sequence_length - 2]
input_ids = [self.tokenizer.cls_token_id
] + input_ids + [self.tokenizer.sep_token_id]
label_mask = [0] + label_mask + [0]
attention_mask = [1] * len(input_ids)
offset_mapping = offset_mapping[:sum(label_mask)]

if not self.is_transformer_based_model:
input_ids = input_ids[1:-1]
attention_mask = attention_mask[1:-1]
label_mask = label_mask[1:-1]

input_ids = torch.tensor(input_ids).unsqueeze(0)
attention_mask = torch.tensor(attention_mask).unsqueeze(0)
label_mask = torch.tensor(
label_mask, dtype=torch.bool).unsqueeze(0)

# the token classification
output = {
'text': text,
'input_ids': input_ids,
'attention_mask': attention_mask,
'label_mask': label_mask,
'offset_mapping': offset_mapping
}
# the token classification
output = {
'text': text,
'input_ids': input_ids,
'attention_mask': attention_mask,
'label_mask': label_mask,
'offset_mapping': offset_mapping
}
else:
output = {
'input_ids': input_ids,
'token_type_ids': token_type_ids,
'attention_mask': attention_mask,
'label_mask': label_mask,
}

# align the labels with tokenized text
if labels_list is not None:
assert self.label2id is not None
# Map that sends B-Xxx label to its I-Xxx counterpart
b_to_i_label = []
label_enumerate_values = [
k for k, v in sorted(
self.label2id.items(), key=lambda item: item[1])
]
for idx, label in enumerate(label_enumerate_values):
if label.startswith('B-') and label.replace(
'B-', 'I-') in label_enumerate_values:
b_to_i_label.append(
label_enumerate_values.index(
label.replace('B-', 'I-')))
else:
b_to_i_label.append(idx)

label_row = [self.label2id[lb] for lb in labels_list]
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
if word_idx is None:
label_ids.append(-100)
elif word_idx != previous_word_idx:
label_ids.append(label_row[word_idx])
else:
if self.label_all_tokens:
label_ids.append(b_to_i_label[label_row[word_idx]])
# align the labels with tokenized text
if labels_list is not None:
assert self.label2id is not None
# Map that sends B-Xxx label to its I-Xxx counterpart
b_to_i_label = []
label_enumerate_values = [
k for k, v in sorted(
self.label2id.items(), key=lambda item: item[1])
]
for idx, label in enumerate(label_enumerate_values):
if label.startswith('B-') and label.replace(
'B-', 'I-') in label_enumerate_values:
b_to_i_label.append(
label_enumerate_values.index(
label.replace('B-', 'I-')))
else:
b_to_i_label.append(idx)

label_row = [self.label2id[lb] for lb in labels_list]
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
if word_idx is None:
label_ids.append(-100)
previous_word_idx = word_idx
labels = label_ids
output['labels'] = labels
elif word_idx != previous_word_idx:
label_ids.append(label_row[word_idx])
else:
if self.label_all_tokens:
label_ids.append(b_to_i_label[label_row[word_idx]])
else:
label_ids.append(-100)
previous_word_idx = word_idx
labels = label_ids
output['labels'] = labels
output = {
k: np.array(v) if isinstance(v, list) else v
for k, v in output.items()
}
return output

def get_tokenizer_class(self):
@@ -2,12 +2,12 @@
from typing import Any, Dict

import torch
from PIL import Image
import unicodedata2
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from torchvision.transforms import functional as F
from zhconv import convert

from modelscope.preprocessors.image import load_image
from modelscope.utils.constant import ModeKeys
from .base import OfaBasePreprocessor
@@ -98,8 +98,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):

def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]:
sample = self._build_infer_sample(data)
target = data[self.column_map['text']]
target = target.translate(self.transtab).strip()
target = sample['label']
target_token_list = target.strip().split()
target = ' '.join(target_token_list[:self.max_tgt_length])
sample['target'] = self.tokenize_text(target, add_bos=False)
@@ -119,5 +118,7 @@ class OfaOcrRecognitionPreprocessor(OfaBasePreprocessor):
'patch_mask': torch.tensor([True])
}
if 'text' in self.column_map and self.column_map['text'] in data:
sample['label'] = data[self.column_map['text']]
target = data[self.column_map['text']]
target = unicodedata2.normalize('NFKC', convert(target, 'zh-hans'))
sample['label'] = target
return sample
@@ -18,7 +18,7 @@ class TextGenerationTrainer(NlpEpochBasedTrainer):
return tokenizer.decode(tokens.tolist(), skip_special_tokens=True)

def evaluation_step(self, data):
model = self.model
model = self.model.module if self._dist else self.model
model.eval()

with torch.no_grad():
@@ -586,14 +586,16 @@ class NlpEpochBasedTrainer(EpochBasedTrainer):
preprocessor_mode=ModeKeys.TRAIN,
**model_args,
**self.train_keys,
mode=ModeKeys.TRAIN)
mode=ModeKeys.TRAIN,
use_fast=True)
eval_preprocessor = Preprocessor.from_pretrained(
self.model_dir,
cfg_dict=self.cfg,
preprocessor_mode=ModeKeys.EVAL,
**model_args,
**self.eval_keys,
mode=ModeKeys.EVAL)
mode=ModeKeys.EVAL,
use_fast=True)
return train_preprocessor, eval_preprocessor
@@ -876,7 +876,7 @@ class EpochBasedTrainer(BaseTrainer):
Subclass and override to inject custom behavior.

"""
model = self.model
model = self.model.module if self._dist else self.model
model.eval()

if is_parallel(model):
@@ -238,6 +238,14 @@ class DownloadMode(enum.Enum):
FORCE_REDOWNLOAD = 'force_redownload'


class DownloadChannel(enum.Enum):
""" Channels of datasets downloading for uv/pv counting.
"""
LOCAL = 'local'
DSW = 'dsw'
EAIS = 'eais'


class UploadMode(enum.Enum):
""" How to upload object to remote.
"""
@@ -1,6 +1,7 @@
addict
attrs
datasets
# version beyond 2.5.2 introduces compatbility issue and is being resolved
datasets<=2.5.2
easydict
einops
filelock>=3.3.0
@@ -11,3 +11,5 @@ timm
tokenizers
torchvision
transformers>=4.12.0
unicodedata2
zhconv
@@ -1,4 +1,6 @@
biopython
iopath
ipdb
lmdb
ml_collections
scipy
@@ -8,7 +8,8 @@ import zipfile
from modelscope.msdatasets import MsDataset
from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects
from modelscope.utils import logger as logging
from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile
from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode,
ModelFile)
from modelscope.utils.test_utils import test_level

logger = logging.get_logger(__name__)
@@ -104,7 +105,10 @@ class DatasetUploadTest(unittest.TestCase):

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
def test_ds_download_dir(self):
test_ds = MsDataset.load(self.dataset_name, self.namespace)
test_ds = MsDataset.load(
self.dataset_name,
namespace=self.namespace,
download_mode=DownloadMode.FORCE_REDOWNLOAD)
assert test_ds.config_kwargs['split_config'].values()

@unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
@@ -21,9 +21,10 @@ class TestModelOutput(unittest.TestCase):
self.assertEqual(outputs['logits'], torch.Tensor([1]))
self.assertEqual(outputs[0], torch.Tensor([1]))
self.assertEqual(outputs.logits, torch.Tensor([1]))
outputs.loss = torch.Tensor([2])
logits, loss = outputs
self.assertEqual(logits, torch.Tensor([1]))
self.assertTrue(loss is None)
self.assertTrue(loss is not None)


if __name__ == '__main__':
@@ -19,9 +19,11 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
self.task = Tasks.named_entity_recognition
self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'

english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom'
tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news'
lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news'
sentence = '这与温岭市新河镇的一个神秘的传说有关。'
sentence_en = 'pizza shovel'

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_tcrf_by_direct_model_download(self):
@@ -89,6 +91,12 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck):
task=Tasks.named_entity_recognition, model=self.lcrf_model_id)
print(pipeline_ins(input=self.sentence))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_english_with_model_name(self):
pipeline_ins = pipeline(
task=Tasks.named_entity_recognition, model=self.english_model_id)
print(pipeline_ins(input='pizza shovel'))

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_run_with_default_model(self):
pipeline_ins = pipeline(task=Tasks.named_entity_recognition)
@@ -19,7 +19,7 @@ class UnifoldProteinStructureTest(unittest.TestCase, DemoCompatibilityCheck):
self.protein_multimer = 'GAMGLPEEPSSPQESTLKALSLYEAHLSSYIMYLQTFLVKTKQKVNNKNYPEFTLFDTSKLKKDQTLKSIKT' + \
'NIAALKNHIDKIKPIAMQIYKKYSKNIP'

@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_run_by_direct_model_download(self):
model_dir = snapshot_download(self.model_id)
mono_pipeline_ins = pipeline(task=self.task, model=model_dir)
@@ -87,7 +87,7 @@ class TestFinetuneTokenClassification(unittest.TestCase):
cfg['dataset'] = {
'train': {
'labels': label_enumerate_values,
'first_sequence': 'first_sequence',
'first_sequence': 'tokens',
'label': 'labels',
}
}
@@ -85,7 +85,7 @@ class TestOfaTrainer(unittest.TestCase):
'ocr_fudanvi_zh',
subset_name='scene',
namespace='modelscope',
split='train[:200]',
split='train[800:900]',
download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS),
eval_dataset=MsDataset.load(
'ocr_fudanvi_zh',