# Copyright (c) Alibaba, Inc. and its affiliates.

import numpy as np
from PIL import Image

from modelscope.utils.constant import Tasks


class InputKeys(object):
    IMAGE = 'image'
    TEXT = 'text'
    VIDEO = 'video'


class InputType(object):
    IMAGE = 'image'
    TEXT = 'text'
    AUDIO = 'audio'
    VIDEO = 'video'
    BOX = 'box'
    DICT = 'dict'
    LIST = 'list'
    NUMBER = 'number'


INPUT_TYPE = {
    InputType.IMAGE: (str, np.ndarray, Image.Image),
    InputType.TEXT: str,
    InputType.AUDIO: (str, bytes, np.ndarray),
    # 'cv2.VideoCapture' is a class-name placeholder, matched by name in
    # check_input_type to avoid importing opencv here.
    InputType.VIDEO: (str, np.ndarray, 'cv2.VideoCapture'),
    InputType.BOX: (list, np.ndarray),
    InputType.DICT: (dict, type(None)),
    InputType.LIST: (list, type(None)),
    InputType.NUMBER: int,
}
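
# For instance, an image input may arrive as a local path / URL string, a
# decoded numpy array, or a PIL image; all three satisfy
# INPUT_TYPE[InputType.IMAGE].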

INPUT_TYPE_SCHEMA = {
    InputType.IMAGE: {
        'type': 'string',
        'description': 'Base64 encoded image file or url string.'
    },  # support url or base64 encoded file.
    InputType.AUDIO: {
        'type': 'string',
        'description': 'Base64 encoded audio file or url string.'
    },  # support url or base64 encoded file.
    InputType.VIDEO: {
        'type': 'string',
        'description': 'Base64 encoded video file or url string.'
    },  # support url or base64 encoded file.
    InputType.TEXT: {
        'type': 'string',
        'description': 'The input text.'
    },
    InputType.BOX: {
        'type': 'array',
        'description': 'Box coordinates; each value should be an integer.',
        'items': {
            'type': 'number'
        }
    },
    InputType.DICT: {  # unknown properties.
        'type': 'object',
    },
    InputType.LIST: {
        'type': 'array'
    },  # unknown item type.
    InputType.NUMBER: {
        'type': 'integer'
    },
}
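
# Illustrative only (an assumption about how these fragments compose, not
# logic defined in this module): a task whose input spec is
# {'image': InputType.IMAGE, 'text': InputType.TEXT} could be rendered as
# the JSON Schema
#
#   {
#       'type': 'object',
#       'properties': {
#           'image': INPUT_TYPE_SCHEMA[InputType.IMAGE],
#           'text': INPUT_TYPE_SCHEMA[InputType.TEXT],
#       }
#   }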


def check_input_type(input_type, input):
    """Assert that `input` is an instance of the type(s) registered for
    `input_type` in INPUT_TYPE; raises AssertionError on mismatch."""
    expected_type = INPUT_TYPE[input_type]
    if input_type == InputType.VIDEO:
        # Special type checking using the class name, to avoid introducing an
        # opencv dependency into the fundamental framework.
        assert type(input).__name__ == 'VideoCapture' or isinstance(input, expected_type), \
            f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}'
    else:
        assert isinstance(input, expected_type), \
            f'invalid input type for {input_type}, expected {expected_type} but got {type(input)}\n {input}'
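
# Usage sketch (illustrative only):
#
#   check_input_type(InputType.TEXT, 'hello')             # ok: str expected
#   check_input_type(InputType.IMAGE, np.zeros((4, 4)))   # ok: ndarray accepted
#   check_input_type(InputType.NUMBER, 'not a number')    # raises AssertionError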


TASK_INPUTS = {
    Tasks.image_text_retrieval: {
        InputKeys.IMAGE: InputType.IMAGE,
        InputKeys.TEXT: InputType.TEXT
    },
    Tasks.general_recognition: {
        InputKeys.IMAGE: InputType.IMAGE,
        InputKeys.TEXT: InputType.TEXT
    },
    Tasks.indoor_layout_estimation: InputType.IMAGE,
    Tasks.image_demoireing: InputType.IMAGE,
    Tasks.panorama_depth_estimation: InputType.IMAGE,
    Tasks.video_depth_estimation: InputType.VIDEO,
    Tasks.animal_recognition: InputType.IMAGE,
    Tasks.motion_generation: InputType.TEXT,
    Tasks.video_panoptic_segmentation: InputType.VIDEO,
    Tasks.task_template: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },

    # If a task input is a single var, the value is an InputType.
    # If a task input is a tuple, the value is a tuple of InputType.
    # If a task input is a dict, the value is a dict of InputType, where each
    # key equals the one needed in the pipeline input dict.
    # If a task input is a list, the value is a set of input formats, in which
    # each element corresponds to one input format as described above and
    # must include a dict format.
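    # For example (illustrative): a single-var task accepts pipeline('text');
    # a tuple task accepts pipeline(('premise', 'hypothesis')); a dict task
    # accepts pipeline({'image': img, 'text': 'query'}); a list-valued entry
    # enumerates the alternative formats a task accepts.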

    # ============ vision tasks ===================
    Tasks.ocr_detection: InputType.IMAGE,
    Tasks.ocr_recognition: InputType.IMAGE,
    Tasks.face_2d_keypoints: InputType.IMAGE,
    Tasks.face_liveness: InputType.IMAGE,
    Tasks.face_quality_assessment: InputType.IMAGE,
    Tasks.card_detection: InputType.IMAGE,
    Tasks.license_plate_detection: InputType.IMAGE,
    Tasks.card_detection_correction: InputType.IMAGE,
    Tasks.lineless_table_recognition: InputType.IMAGE,
    Tasks.table_recognition: InputType.IMAGE,
    Tasks.face_detection: InputType.IMAGE,
    Tasks.facial_expression_recognition: InputType.IMAGE,
    Tasks.face_attribute_recognition: InputType.IMAGE,
    Tasks.face_recognition: InputType.IMAGE,
    Tasks.face_reconstruction: InputType.IMAGE,
    Tasks.head_reconstruction: InputType.IMAGE,
    Tasks.text_to_head: InputType.TEXT,
    Tasks.human_detection: InputType.IMAGE,
    Tasks.face_image_generation: InputType.NUMBER,
    Tasks.image_classification: InputType.IMAGE,
    Tasks.image_quality_assessment_mos: InputType.IMAGE,
    Tasks.image_quality_assessment_degradation: InputType.IMAGE,
    Tasks.image_object_detection: InputType.IMAGE,
    Tasks.domain_specific_object_detection: InputType.IMAGE,
    Tasks.human_wholebody_keypoint: InputType.IMAGE,
    Tasks.image_segmentation: InputType.IMAGE,
    Tasks.portrait_matting: InputType.IMAGE,
    Tasks.universal_matting: InputType.IMAGE,
    Tasks.product_segmentation: InputType.IMAGE,
    Tasks.semantic_segmentation: InputType.IMAGE,
    Tasks.face_human_hand_detection: InputType.IMAGE,
    Tasks.hand_static: InputType.IMAGE,
    Tasks.image_fewshot_detection: InputType.IMAGE,
    Tasks.open_vocabulary_detection: {
        'img': InputType.IMAGE,
        'category_names': InputType.TEXT
    },
    Tasks.image_driving_perception: InputType.IMAGE,
    Tasks.vision_efficient_tuning: InputType.IMAGE,

    # image editing tasks: the result is a single image
    Tasks.skin_retouching: InputType.IMAGE,
    Tasks.image_super_resolution: InputType.IMAGE,
    Tasks.image_colorization: InputType.IMAGE,
    Tasks.image_color_enhancement: InputType.IMAGE,
    Tasks.image_denoising: InputType.IMAGE,
    Tasks.image_body_reshaping: InputType.IMAGE,
    Tasks.image_portrait_enhancement: InputType.IMAGE,
    Tasks.crowd_counting: InputType.IMAGE,
    Tasks.image_super_resolution_pasd: {
        'image': InputType.IMAGE,
        'prompt': InputType.TEXT,
    },
    Tasks.image_inpainting: {
        'img': InputType.IMAGE,
        'mask': InputType.IMAGE,
    },
    Tasks.image_paintbyexample: {
        'img': InputType.IMAGE,
        'mask': InputType.IMAGE,
        'reference': InputType.IMAGE,
    },
    Tasks.image_skychange: {
        'sky_image': InputType.IMAGE,
        'scene_image': InputType.IMAGE,
    },
    Tasks.controllable_image_generation: {
        'image': InputType.IMAGE,
        'prompt': InputType.TEXT,
    },
    Tasks.image_face_fusion: {
        'template': InputType.IMAGE,
        'user': InputType.IMAGE,
    },
    Tasks.image_deblurring: InputType.IMAGE,
    Tasks.video_colorization: InputType.VIDEO,

    # image generation tasks: the result is a single image
    Tasks.image_to_image_generation: InputType.IMAGE,
    Tasks.image_to_image_translation: InputType.IMAGE,
    Tasks.image_style_transfer: {
        'content': InputType.IMAGE,
        'style': InputType.IMAGE,
    },
    Tasks.image_portrait_stylization: InputType.IMAGE,
    Tasks.live_category: InputType.VIDEO,
    Tasks.action_recognition: InputType.VIDEO,
    Tasks.body_2d_keypoints: InputType.IMAGE,
    Tasks.body_3d_keypoints: InputType.VIDEO,
    Tasks.hand_2d_keypoints: InputType.IMAGE,
    Tasks.pedestrian_attribute_recognition: InputType.IMAGE,
    Tasks.video_single_object_tracking: (InputType.VIDEO, InputType.BOX),
    Tasks.video_multi_object_tracking: InputType.VIDEO,
    Tasks.video_category: InputType.VIDEO,
    Tasks.product_retrieval_embedding: InputType.IMAGE,
    Tasks.video_embedding: InputType.VIDEO,
    Tasks.virtual_try_on: [
        (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE),
        {
            'masked_model': InputType.IMAGE,
            'pose': InputType.IMAGE,
            'cloth': InputType.IMAGE,
        }
    ],
    Tasks.text_driven_segmentation: {
        InputKeys.IMAGE: InputType.IMAGE,
        InputKeys.TEXT: InputType.TEXT
    },
    Tasks.shop_segmentation: InputType.IMAGE,
    Tasks.movie_scene_segmentation: InputType.VIDEO,
    Tasks.bad_image_detecting: InputType.IMAGE,
    Tasks.image_try_on: {
        # NOTE: the same key is repeated; duplicate dict keys collapse, so at
        # runtime this is equivalent to a single image entry.
        InputKeys.IMAGE: InputType.IMAGE,
        InputKeys.IMAGE: InputType.IMAGE,
        InputKeys.IMAGE: InputType.IMAGE
    },
    Tasks.human_image_generation: {
        InputKeys.IMAGE: InputType.IMAGE,
        'target_pose_path': InputType.TEXT
    },

    # ============ nlp tasks ===================
    Tasks.chat: [
        InputType.TEXT,
        {
            'text': InputType.TEXT,
            'history': InputType.LIST,
        }
    ],
    Tasks.text_classification: [
        InputType.TEXT,
        (InputType.TEXT, InputType.TEXT),
        {
            'text': InputType.TEXT,
            'text2': InputType.TEXT
        },
    ],
    Tasks.sentence_similarity: [
        (InputType.TEXT, InputType.TEXT),
        {
            'source_text': InputType.TEXT,
            'target_text': InputType.TEXT,
        },
    ],
    Tasks.nli: (InputType.TEXT, InputType.TEXT),
    Tasks.sentiment_classification: InputType.TEXT,
    Tasks.zero_shot_classification: InputType.TEXT,
    Tasks.relation_extraction: InputType.TEXT,
    Tasks.translation: InputType.TEXT,
    Tasks.text_summarization: [InputType.TEXT, {
        'text': InputType.TEXT,
    }],
    Tasks.competency_aware_translation: InputType.TEXT,
    Tasks.word_segmentation: [InputType.TEXT, {
        'text': InputType.TEXT,
    }],
    Tasks.part_of_speech: InputType.TEXT,
    Tasks.named_entity_recognition: InputType.TEXT,
    Tasks.text_error_correction: InputType.TEXT,
    Tasks.sentence_embedding: {
        'source_sentence': InputType.LIST,
        'sentences_to_compare': InputType.LIST,
    },
    Tasks.text_ranking: [
        (InputType.TEXT, InputType.TEXT),
        {
            'source_sentence': InputType.LIST,
            'sentences_to_compare': InputType.LIST
        }
    ],
    Tasks.text_generation: InputType.TEXT,
    Tasks.fid_dialogue: {
        'history': InputType.TEXT,
        'knowledge': InputType.TEXT,
        'bot_profile': InputType.TEXT,
        'user_profile': InputType.TEXT,
    },
    Tasks.fill_mask: InputType.TEXT,
    Tasks.table_question_answering: {
        'question': InputType.TEXT,
        'history_sql': InputType.DICT,
    },
    Tasks.faq_question_answering: {
        'query_set': InputType.LIST,
        'support_set': InputType.LIST,
    },
    Tasks.translation_evaluation: {
        'hyp': InputType.LIST,
        'src': InputType.LIST,
        'ref': InputType.LIST,
    },
    Tasks.sudoku: InputType.TEXT,
    Tasks.text2sql: {
        'text': InputType.TEXT,
        'database': InputType.TEXT
    },
    Tasks.document_grounded_dialog_generate: {
        'query': InputType.LIST,
        'context': InputType.LIST,
        'label': InputType.LIST,
    },
    Tasks.document_grounded_dialog_rerank: {
        'dataset': InputType.LIST
    },
    Tasks.document_grounded_dialog_retrieval: {
        'query': InputType.LIST,
        'positive': InputType.LIST,
        'negative': InputType.LIST
    },
    Tasks.machine_reading_comprehension: InputType.TEXT,

    # ============ audio tasks ===================
    # input can be audio, or audio and text.
    Tasks.auto_speech_recognition: [InputType.AUDIO, {
        'wav': InputType.AUDIO,
        'text': InputType.TEXT
    }],
    Tasks.speech_signal_process: InputType.AUDIO,
    Tasks.acoustic_echo_cancellation: {
        'nearend_mic': InputType.AUDIO,
        'farend_speech': InputType.AUDIO
    },
    Tasks.speech_separation: InputType.AUDIO,
    Tasks.acoustic_noise_suppression: InputType.AUDIO,
    Tasks.text_to_speech: InputType.TEXT,
    Tasks.keyword_spotting: InputType.AUDIO,
    Tasks.speaker_diarization_dialogue_detection: InputType.TEXT,
    Tasks.language_score_prediction: InputType.TEXT,
    Tasks.punctuation: InputType.TEXT,
    Tasks.speech_language_recognition: InputType.AUDIO,
    Tasks.speaker_diarization_semantic_speaker_turn_detection: InputType.TEXT,
    Tasks.inverse_text_processing: InputType.TEXT,
    Tasks.speaker_verification: [InputType.AUDIO, InputType.AUDIO],

    # ============ multi-modal tasks ===================
    Tasks.image_captioning: [InputType.IMAGE, {
        'image': InputType.IMAGE,
    }],
    Tasks.video_captioning: [InputType.VIDEO, {
        'video': InputType.VIDEO,
    }],
    Tasks.multimodal_dialogue: {
        'messages': InputType.LIST,
    },
    Tasks.visual_grounding: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.text_to_image_synthesis: {
        'text': InputType.TEXT,
    },
    Tasks.multi_modal_embedding: {
        'img': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.generative_multi_modal_embedding: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.multi_modal_similarity: {
        'img': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.text_video_retrieval: {
        'video': InputType.VIDEO,
        'text': InputType.TEXT
    },
    Tasks.visual_question_answering: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT
    },
    Tasks.video_question_answering: {
        'video': InputType.VIDEO,
        'text': InputType.TEXT
    },
    Tasks.visual_entailment: {
        'image': InputType.IMAGE,
        'text': InputType.TEXT,
        'text2': InputType.TEXT,
    },
    Tasks.action_detection: InputType.VIDEO,
    Tasks.human_reconstruction: InputType.IMAGE,
    Tasks.image_reid_person: InputType.IMAGE,
    Tasks.video_inpainting: {
        'video_input_path': InputType.TEXT,
        'video_output_path': InputType.TEXT,
        'mask_path': InputType.TEXT,
    },
    Tasks.text_to_video_synthesis: {
        'text': InputType.TEXT
    },
    Tasks.video_summarization: InputType.TEXT,
    Tasks.text_to_360panorama_image: {
        'prompt': InputType.TEXT,
    },
    Tasks.image_editing: {
        'img': InputType.IMAGE,
        'prompts': InputType.LIST
    }
}
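

# Minimal validation sketch (illustrative; the real dispatch logic lives
# elsewhere in the framework): look up a task's expected input format in
# TASK_INPUTS and type-check a dict-style input with check_input_type.
if __name__ == '__main__':
    spec = TASK_INPUTS[Tasks.visual_question_answering]
    sample = {'image': 'http://example.com/demo.jpg', 'text': 'What is shown?'}
    for key, input_type in spec.items():
        check_input_type(input_type, sample[key])
    print('sample input matches the visual_question_answering spec')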