Files
modelscope/modelscope/outputs.py

666 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.utils.constant import Tasks
class OutputKeys:
    """Names of the fields that can appear in a result dictionary.

    Every attribute is a plain string constant. ``TASK_OUTPUTS`` refers to
    these attributes to list which fields belong to each task's result, and
    the example results sketched there use the string values as dictionary
    keys -- so changing a value here changes the documented result schema.
    """

    LOSS = 'loss'
    LOGITS = 'logits'
    SCORES = 'scores'
    SCORE = 'score'
    LABEL = 'label'
    LABELS = 'labels'
    INPUT_IDS = 'input_ids'
    LABEL_POS = 'label_pos'
    POSES = 'poses'
    CAPTION = 'caption'
    BOXES = 'boxes'
    KEYPOINTS = 'keypoints'
    MASKS = 'masks'
    TEXT = 'text'
    POLYGONS = 'polygons'
    OUTPUT = 'output'
    OUTPUT_IMG = 'output_img'
    OUTPUT_PCM = 'output_pcm'
    IMG_EMBEDDING = 'img_embedding'
    SPO_LIST = 'spo_list'
    TEXT_EMBEDDING = 'text_embedding'
    TRANSLATION = 'translation'
    RESPONSE = 'response'
    PREDICTION = 'prediction'
    PREDICTIONS = 'predictions'
    PROBABILITIES = 'probabilities'
    DIALOG_STATES = 'dialog_states'
    VIDEO_EMBEDDING = 'video_embedding'
    UUID = 'uuid'
    WORD = 'word'
    KWS_LIST = 'kws_list'
    HISTORY = 'history'
    TIMESTAMPS = 'timestamps'
    SPLIT_VIDEO_NUM = 'split_video_num'
    SPLIT_META_LIST = 'split_meta_list'
# Maps each task identifier to the list of result keys documented for that
# task. The comment above each entry sketches an example result for a single
# sample, using the string values of the listed ``OutputKeys`` as keys.
#
# Every task must appear exactly once: a key repeated inside a dict literal
# is silently overwritten by the later entry, which hides conflicting edits.
TASK_OUTPUTS = {

    # ============ vision tasks ===================

    # ocr detection result for single sample
    # {
    #   "polygons": np.array with shape [num_text, 8], each polygon is
    #       [x1, y1, x2, y2, x3, y3, x4, y4]
    # }
    Tasks.ocr_detection: [OutputKeys.POLYGONS],

    # ocr recognition result for single sample
    # {
    #   "text": "电子元器件提供BOM配单"
    # }
    Tasks.ocr_recognition: [OutputKeys.TEXT],

    # face 2d keypoint result for single sample
    # {
    #   "keypoints": [
    #       [x1, y1]*106
    #   ],
    #   "poses": [pitch, roll, yaw]
    # }
    Tasks.face_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.POSES],

    # face detection result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.05, 0.05],
    #   "boxes": [
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #   ],
    #   "keypoints": [
    #       [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
    #       [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
    #       [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
    #       [x1, y1, x2, y2, x3, y3, x4, y4, x5, y5],
    #   ],
    # }
    Tasks.face_detection:
    [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS],

    # facial expression recognition result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02],
    #   "labels": ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
    # }
    Tasks.facial_expression_recognition:
    [OutputKeys.SCORES, OutputKeys.LABELS],

    # face recognition result for single sample
    # {
    #   "img_embedding": np.array with shape [1, D],
    # }
    Tasks.face_recognition: [OutputKeys.IMG_EMBEDDING],

    # human detection result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.05, 0.05],
    #   "labels": ["person", "person", "person", "person"],
    #   "boxes": [
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #   ],
    # }
    Tasks.human_detection:
    [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],

    # face generation result for single sample
    # {
    #   "output_img": np.array with shape(h, w, 3)
    # }
    Tasks.face_image_generation: [OutputKeys.OUTPUT_IMG],

    # image classification result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.05, 0.05],
    #   "labels": ["dog", "horse", "cow", "cat"],
    # }
    Tasks.image_classification: [OutputKeys.SCORES, OutputKeys.LABELS],

    # object detection result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.05, 0.05],
    #   "labels": ["dog", "horse", "cow", "cat"],
    #   "boxes": [
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #   ],
    # }
    Tasks.image_object_detection:
    [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES],

    # instance segmentation result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.05, 0.05],
    #   "labels": ["dog", "horse", "cow", "cat"],
    #   "masks": [
    #       np.array # 2D array containing only 0, 1
    #   ]
    # }
    Tasks.image_segmentation:
    [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.MASKS],

    # semantic segmentation result for single sample
    # {
    #   "masks": [np.array # 2D array with shape [height, width]]
    # }
    Tasks.semantic_segmentation: [OutputKeys.MASKS],

    # image matting result for single sample
    # {
    #   "output_img": np.array with shape(h, w, 4)
    #                 for matting or (h, w, 3) for general purpose
    #                 , shape(h, w) for crowd counting
    # }
    Tasks.portrait_matting: [OutputKeys.OUTPUT_IMG],

    # image editing task result for a single image
    # {"output_img": np.array with shape (h, w, 3)}
    Tasks.skin_retouching: [OutputKeys.OUTPUT_IMG],
    Tasks.image_super_resolution: [OutputKeys.OUTPUT_IMG],
    Tasks.image_colorization: [OutputKeys.OUTPUT_IMG],
    Tasks.image_color_enhancement: [OutputKeys.OUTPUT_IMG],
    Tasks.image_denoising: [OutputKeys.OUTPUT_IMG],
    Tasks.image_portrait_enhancement: [OutputKeys.OUTPUT_IMG],
    # crowd counting additionally lists "scores" next to "output_img"
    Tasks.crowd_counting: [OutputKeys.SCORES, OutputKeys.OUTPUT_IMG],

    # image generation task result for a single image
    # {"output_img": np.array with shape (h, w, 3)}
    Tasks.image_to_image_generation: [OutputKeys.OUTPUT_IMG],
    Tasks.image_to_image_translation: [OutputKeys.OUTPUT_IMG],
    Tasks.image_style_transfer: [OutputKeys.OUTPUT_IMG],
    Tasks.image_portrait_stylization: [OutputKeys.OUTPUT_IMG],

    # live category recognition result for single video
    # {
    #   "scores": [0.885272, 0.014790631, 0.014558001],
    #   "labels": ['女装/女士精品>>棉衣/棉服', '女装/女士精品>>牛仔裤', '女装/女士精品>>裤子>>休闲裤'],
    # }
    Tasks.live_category: [OutputKeys.SCORES, OutputKeys.LABELS],

    # action recognition result for single video
    # {
    #   "labels": "abseiling"
    # }
    Tasks.action_recognition: [OutputKeys.LABELS],

    # human body keypoints detection result for single sample
    # {
    #   "poses": [
    #       [[x, y]*15],
    #       [[x, y]*15],
    #       [[x, y]*15]
    #   ],
    #   "scores": [
    #       [[score]*15],
    #       [[score]*15],
    #       [[score]*15]
    #   ],
    #   "boxes": [
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #   ]
    # }
    Tasks.body_2d_keypoints:
    [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES],

    # 3D human body keypoints detection result for single sample
    # {
    #   "poses": [
    #       [[x, y, z]*17],
    #       [[x, y, z]*17],
    #       [[x, y, z]*17]
    #   ]
    # }
    Tasks.body_3d_keypoints: [OutputKeys.POSES],

    # 2D hand keypoints result for single sample
    # {
    #   "keypoints": [
    #       [[x, y, score] * 21],
    #       [[x, y, score] * 21],
    #       [[x, y, score] * 21],
    #   ],
    #   "boxes": [
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #   ]
    # }
    Tasks.hand_2d_keypoints: [OutputKeys.KEYPOINTS, OutputKeys.BOXES],

    # video single object tracking result for single video
    # {
    #   "boxes": [
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #   ],
    #   "timestamps": ["hh:mm:ss", "hh:mm:ss", "hh:mm:ss"]
    # }
    Tasks.video_single_object_tracking:
    [OutputKeys.BOXES, OutputKeys.TIMESTAMPS],

    # video category recognition result for single video
    # {
    #   "scores": [0.7716429233551025],
    #   "labels": ['生活>>好物推荐']
    # }
    Tasks.video_category: [OutputKeys.SCORES, OutputKeys.LABELS],

    # image embedding result for a single image
    # {
    #   "img_embedding": np.array with shape [D]
    # }
    Tasks.product_retrieval_embedding: [OutputKeys.IMG_EMBEDDING],

    # video embedding result for single video
    # {
    #   "video_embedding": np.array with shape [D],
    # }
    Tasks.video_embedding: [OutputKeys.VIDEO_EMBEDDING],

    # virtual_try_on result for a single sample
    # {
    #   "output_img": np.ndarray with shape [height, width, 3]
    # }
    Tasks.virtual_try_on: [OutputKeys.OUTPUT_IMG],

    # text driven segmentation result for single sample
    # {
    #   "masks": [
    #       np.array # 2D array containing only 0, 255
    #   ]
    # }
    Tasks.text_driven_segmentation: [OutputKeys.MASKS],

    # shop segmentation result for single sample
    # {
    #   "masks": [
    #       np.array # 2D array containing only 0, 255
    #   ]
    # }
    Tasks.shop_segmentation: [OutputKeys.MASKS],

    # movie scene segmentation result for a single video
    # {
    #   "split_video_num": 3,
    #   "split_meta_list":
    #       [
    #           {
    #               "shot": [0, 1, 2],
    #               "frame": [start_frame, end_frame],
    #               "timestamp": [start_timestamp, end_timestamp] # ['00:00:01.133', '00:00:02.245']
    #           }
    #       ]
    # }
    Tasks.movie_scene_segmentation:
    [OutputKeys.SPLIT_VIDEO_NUM, OutputKeys.SPLIT_META_LIST],

    # ============ nlp tasks ===================

    # text classification result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.05, 0.05],
    #   "labels": ["happy", "sad", "calm", "angry"],
    # }
    Tasks.text_classification: [OutputKeys.SCORES, OutputKeys.LABELS],

    # sentence similarity result for single sample
    # {
    #   "scores": 0.9,
    #   "labels": "1",
    # }
    Tasks.sentence_similarity: [OutputKeys.SCORES, OutputKeys.LABELS],

    # nli result for single sample
    # {
    #   "labels": ["happy", "sad", "calm", "angry"],
    #   "scores": [0.9, 0.1, 0.05, 0.05]
    # }
    Tasks.nli: [OutputKeys.SCORES, OutputKeys.LABELS],

    # sentiment classification result for single sample
    # {
    #   'scores': [0.07183828949928284, 0.9281617403030396],
    #   'labels': ['1', '0']
    # }
    Tasks.sentiment_classification: [OutputKeys.SCORES, OutputKeys.LABELS],

    # zero-shot classification result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.05, 0.05],
    #   "labels": ["happy", "sad", "calm", "angry"],
    # }
    Tasks.zero_shot_classification: [OutputKeys.SCORES, OutputKeys.LABELS],

    # relation extraction result for a single sample
    # {
    #   "uuid": "人生信息-1",
    #   "text": "《父老乡亲》是由是由由中国人民解放军海政文工团创作的军旅歌曲,石顺义作词,王锡仁作曲,范琳琳演唱",
    #   "spo_list": [{"subject": "石顺义", "predicate": "国籍", "object": "中国"}]
    # }
    Tasks.relation_extraction: [OutputKeys.SPO_LIST],

    # translation result for a source sentence
    # {
    #   "translation": "北京是中国的首都"
    # }
    Tasks.translation: [OutputKeys.TRANSLATION],

    # word segmentation result for single sample
    # {
    #   "output": "今天 天气 不错 适合 出去 游玩",
    #   "labels": [
    #       {'word': '今天', 'label': 'PROPN'},
    #       {'word': '天气', 'label': 'PROPN'},
    #       {'word': '不错', 'label': 'VERB'},
    #       {'word': ',', 'label': 'NUM'},
    #       {'word': '适合', 'label': 'NOUN'},
    #       {'word': '出去', 'label': 'PART'},
    #       {'word': '游玩', 'label': 'ADV'},
    #   ]
    # }
    Tasks.word_segmentation: [OutputKeys.OUTPUT, OutputKeys.LABELS],
    Tasks.part_of_speech: [OutputKeys.OUTPUT, OutputKeys.LABELS],

    # TODO @wenmeng.zwm support list of result check
    # named entity recognition result for single sample
    # {
    #   "output": [
    #       {"type": "LOC", "start": 2, "end": 5, "span": "温岭市"},
    #       {"type": "LOC", "start": 5, "end": 8, "span": "新河镇"}
    #   ]
    # }
    Tasks.named_entity_recognition: [OutputKeys.OUTPUT],

    # text_error_correction result for a single sample
    # {
    #   "output": "我想吃苹果"
    # }
    Tasks.text_error_correction: [OutputKeys.OUTPUT],

    # sentence embedding result lists "text_embedding" and "scores"
    Tasks.sentence_embedding: [OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],

    # passage ranking result lists "scores"
    Tasks.passage_ranking: [OutputKeys.SCORES],

    # text generation result for single sample
    # {
    #   "text": "this is the text generated by a model."
    # }
    Tasks.text_generation: [OutputKeys.TEXT],

    # text2text generation result for single sample
    # {
    #   "text": "北京"
    # }
    Tasks.text2text_generation: [OutputKeys.TEXT],

    # fill mask result for single sample
    # {
    #   "text": "this is the text which masks filled by model."
    # }
    Tasks.fill_mask: [OutputKeys.TEXT],

    # feature extraction result for single sample
    # {
    #   "text_embedding": [[
    #       [1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04],
    #       [6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01],
    #       [2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05]
    #     ],
    #     [
    #       [2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05],
    #       [8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05],
    #       [3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05]
    #     ]
    #   ]
    # }
    Tasks.feature_extraction: [OutputKeys.TEXT_EMBEDDING],

    # (Deprecated) dialog intent prediction result for single sample
    # {'output': {'prediction': array([2.62349960e-03, 4.12110658e-03, 4.12748595e-05, 3.77560973e-05,
    #        1.08599677e-04, 1.72710388e-05, 2.95618793e-05, 1.93638436e-04,
    #        6.45841064e-05, 1.15997791e-04, 5.11605394e-05, 9.87020373e-01,
    #        2.66957268e-05, 4.72324500e-05, 9.74208378e-05, 4.18022355e-05,
    #        2.97343540e-05, 5.81317654e-05, 5.44203431e-05, 6.28319322e-05,
    #        7.34537680e-05, 6.61411541e-05, 3.62534920e-05, 8.58885178e-05,
    #        8.24327726e-05, 4.66077945e-05, 5.32869453e-05, 4.16190960e-05,
    #        5.97518992e-05, 3.92273068e-05, 3.44069012e-05, 9.92335918e-05,
    #        9.25978165e-05, 6.26462061e-05, 3.32317031e-05, 1.32061413e-03,
    #        2.01607945e-05, 3.36636294e-05, 3.99156743e-05, 5.84108493e-05,
    #        2.53432900e-05, 4.95731190e-04, 2.64443643e-05, 4.46992999e-05,
    #        2.42672231e-05, 4.75615161e-05, 2.66230145e-05, 4.00083954e-05,
    #        2.90536875e-04, 4.23891543e-05, 8.63691166e-05, 4.98188965e-05,
    #        3.47019341e-05, 4.52718523e-05, 4.20905781e-05, 5.50173208e-05,
    #        4.92360487e-05, 3.56021264e-05, 2.13957210e-05, 6.17428886e-05,
    #        1.43893281e-04, 7.32152112e-05, 2.91354867e-04, 2.46623786e-05,
    #        3.61441926e-05, 3.38475402e-05, 3.44323053e-05, 5.70138109e-05,
    #        4.31488479e-05, 4.94503947e-05, 4.30105974e-05, 1.00963116e-04,
    #        2.82062047e-05, 1.15582036e-04, 4.48261271e-05, 3.99339879e-05,
    #        7.27692823e-05], dtype=float32), 'label_pos': array([11]), 'label': 'lost_or_stolen_card'}}

    # (Deprecated) dialog modeling prediction result for single sample
    # {'output' : ['you', 'are', 'welcome', '.', 'have', 'a', 'great', 'day', '!']}

    # (Deprecated) dialog state tracking result for single sample
    # {
    #   "output": {
    #       "dialog_states": {
    #           "taxi-leaveAt": "none",
    #           "taxi-destination": "none",
    #           "taxi-departure": "none",
    #           "taxi-arriveBy": "none",
    #           "restaurant-book_people": "none",
    #           "restaurant-book_day": "none",
    #           "restaurant-book_time": "none",
    #           "restaurant-food": "none",
    #           "restaurant-pricerange": "none",
    #           "restaurant-name": "none",
    #           "restaurant-area": "none",
    #           "hotel-book_people": "none",
    #           "hotel-book_day": "none",
    #           "hotel-book_stay": "none",
    #           "hotel-name": "none",
    #           "hotel-area": "none",
    #           "hotel-parking": "none",
    #           "hotel-pricerange": "cheap",
    #           "hotel-stars": "none",
    #           "hotel-internet": "none",
    #           "hotel-type": "true",
    #           "attraction-type": "none",
    #           "attraction-name": "none",
    #           "attraction-area": "none",
    #           "train-book_people": "none",
    #           "train-leaveAt": "none",
    #           "train-destination": "none",
    #           "train-day": "none",
    #           "train-arriveBy": "none",
    #           "train-departure": "none"
    #       }
    #   }
    # }
    Tasks.task_oriented_conversation: [OutputKeys.OUTPUT],

    # conversational text-to-sql result for single sample
    # {
    #   "text": "SELECT shop.Name FROM shop."
    # }
    Tasks.conversational_text_to_sql: [OutputKeys.TEXT],

    # table-question-answering result for single sample
    # {
    #   "output": "SELECT shop.Name FROM shop.",
    #   "history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]}
    # }
    Tasks.table_question_answering: [OutputKeys.OUTPUT, OutputKeys.HISTORY],

    # ============ audio tasks ===================

    # asr result for single sample
    # { "text": "每一天都要快乐喔"}
    Tasks.auto_speech_recognition: [OutputKeys.TEXT],

    # audio processed for single file in PCM format
    # {
    #   "output_pcm": pcm encoded audio bytes
    # }
    Tasks.speech_signal_process: [OutputKeys.OUTPUT_PCM],
    Tasks.acoustic_echo_cancellation: [OutputKeys.OUTPUT_PCM],
    Tasks.acoustic_noise_suppression: [OutputKeys.OUTPUT_PCM],

    # text_to_speech result for a single sample
    # {
    #   "output_pcm": {"input_label" : np.ndarray with shape [D]}
    # }
    Tasks.text_to_speech: [OutputKeys.OUTPUT_PCM],

    # keyword spotting result for single sample
    # {
    #   "kws_list": [
    #       {
    #           'keyword': '',        # the keyword spotted
    #           'offset': 19.4,       # the keyword start time in second
    #           'length': 0.68,       # the keyword length in second
    #           'confidence': 0.85    # the possibility if it is the keyword
    #       },
    #       ...
    #   ]
    # }
    Tasks.keyword_spotting: [OutputKeys.KWS_LIST],

    # ============ multi-modal tasks ===================

    # image caption result for single sample
    # {
    #   "caption": "this is an image caption text."
    # }
    Tasks.image_captioning: [OutputKeys.CAPTION],

    # visual grounding result for single sample
    # {
    #   "boxes": [
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #       [x1, y1, x2, y2],
    #   ],
    #   "scores": [0.9, 0.1, 0.05, 0.05]
    # }
    Tasks.visual_grounding: [OutputKeys.BOXES, OutputKeys.SCORES],

    # text_to_image result for a single sample
    # {
    #   "output_img": np.ndarray with shape [height, width, 3]
    # }
    Tasks.text_to_image_synthesis: [OutputKeys.OUTPUT_IMG],

    # multi-modal embedding result for single sample
    # {
    #   "img_embedding": np.array with shape [1, D],
    #   "text_embedding": np.array with shape [1, D]
    # }
    Tasks.multi_modal_embedding:
    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING],

    # generative multi-modal embedding result for single sample
    # {
    #   "img_embedding": np.array with shape [1, D],
    #   "text_embedding": np.array with shape [1, D],
    #   "caption": "this is an image caption text."
    # }
    Tasks.generative_multi_modal_embedding:
    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.CAPTION],

    # multi-modal similarity result for single sample
    # {
    #   "img_embedding": np.array with shape [1, D],
    #   "text_embedding": np.array with shape [1, D],
    #   "scores": float
    # }
    Tasks.multi_modal_similarity:
    [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES],

    # VQA result for a sample
    # {"text": "this is a text answer. "}
    Tasks.visual_question_answering: [OutputKeys.TEXT],

    # visual entailment result for single sample
    # {
    #   "scores": [0.9, 0.1, 0.1],
    #   "labels": ["entailment", "contradiction", "neutral"]
    # }
    Tasks.visual_entailment: [OutputKeys.SCORES, OutputKeys.LABELS],

    # action detection result for single sample
    # {
    #   'labels': ['吸烟', '打电话', '吸烟'],
    #   'scores': [0.7527753114700317, 0.753358006477356, 0.6880350708961487],
    #   'boxes': [[547, 2, 1225, 719], [529, 8, 1255, 719], [584, 0, 1269, 719]],
    #   'timestamps': [1, 3, 5]
    # }
    Tasks.action_detection: [
        OutputKeys.TIMESTAMPS,
        OutputKeys.LABELS,
        OutputKeys.SCORES,
        OutputKeys.BOXES,
    ],

    # faq question answering result for single sample
    # {
    #   'output': [
    #       [{'label': '6527856', 'score': 0.9942756295204163}, {'label': '1000012000', 'score': 0.0379515215754509},
    #        {'label': '13421097', 'score': 2.2825044965202324e-08}],
    #       [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
    #        {'label': '13421097', 'score': 2.75914817393641e-06}],
    #       [{'label': '1000012000', 'score': 0.910681426525116}, {'label': '6527856', 'score': 0.0005046309670433402},
    #        {'label': '13421097', 'score': 2.75914817393641e-06}]]
    # }
    Tasks.faq_question_answering: [OutputKeys.OUTPUT],

    # image person reid result for single sample
    # {
    #   "img_embedding": np.array with shape [1, D],
    # }
    Tasks.image_reid_person: [OutputKeys.IMG_EMBEDDING],

    # video inpainting result for single sample
    # {
    #   'output': ['Done' / 'Decode_Error']
    # }
    Tasks.video_inpainting: [OutputKeys.OUTPUT],

    # hand static result for single sample
    # {
    #   'output': ['bixin']
    # }
    Tasks.hand_static: [OutputKeys.OUTPUT],

    # face human hand detection result for single sample
    # {
    #   'output': [
    #       [2, 75, 287, 240, 510, 0.8335018754005432],
    #       [1, 127, 83, 332, 366, 0.9175254702568054],
    #       [0, 0, 0, 367, 639, 0.9693422317504883]]
    # }
    Tasks.face_human_hand_detection: [OutputKeys.OUTPUT],
}