From ff69439c4f48bbd1ca3e3f81a3c921925f8e3ca5 Mon Sep 17 00:00:00 2001 From: "ryan.yy" Date: Mon, 10 Oct 2022 17:42:41 +0800 Subject: [PATCH 01/57] [to #42322933]add image_body_reshaping code Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10217723 * add image_body_reshaping code --- data/test/images/image_body_reshaping.jpg | 3 + modelscope/metainfo.py | 2 + .../cv/image_body_reshaping/__init__.py | 20 + .../image_body_reshaping.py | 128 +++++ .../models/cv/image_body_reshaping/model.py | 189 +++++++ .../cv/image_body_reshaping/person_info.py | 339 ++++++++++++ .../pose_estimator/__init__.py | 0 .../pose_estimator/body.py | 272 ++++++++++ .../pose_estimator/model.py | 141 +++++ .../pose_estimator/util.py | 33 ++ .../cv/image_body_reshaping/slim_utils.py | 507 ++++++++++++++++++ modelscope/outputs.py | 1 + modelscope/pipelines/builder.py | 2 + .../cv/image_body_reshaping_pipeline.py | 40 ++ modelscope/utils/constant.py | 2 +- requirements/cv.txt | 1 + tests/pipelines/test_image_body_reshaping.py | 58 ++ 17 files changed, 1737 insertions(+), 1 deletion(-) create mode 100644 data/test/images/image_body_reshaping.jpg create mode 100644 modelscope/models/cv/image_body_reshaping/__init__.py create mode 100644 modelscope/models/cv/image_body_reshaping/image_body_reshaping.py create mode 100644 modelscope/models/cv/image_body_reshaping/model.py create mode 100644 modelscope/models/cv/image_body_reshaping/person_info.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/body.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/model.py create mode 100644 modelscope/models/cv/image_body_reshaping/pose_estimator/util.py create mode 100644 modelscope/models/cv/image_body_reshaping/slim_utils.py create mode 100644 modelscope/pipelines/cv/image_body_reshaping_pipeline.py create mode 100644 tests/pipelines/test_image_body_reshaping.py diff --git a/data/test/images/image_body_reshaping.jpg b/data/test/images/image_body_reshaping.jpg new file mode 100644 index 00000000..d78acb8f --- /dev/null +++ b/data/test/images/image_body_reshaping.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d +size 1127557 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 28804ce6..1b8c4720 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -43,6 +43,7 @@ class Models(object): face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + image_body_reshaping = 'image-body-reshaping' # EasyCV models yolox = 'YOLOX' @@ -187,6 +188,7 @@ class Pipelines(object): face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + image_body_reshaping = 'flow-based-body-reshaping' # nlp tasks automatic_post_editing = 'automatic-post-editing' diff --git a/modelscope/models/cv/image_body_reshaping/__init__.py b/modelscope/models/cv/image_body_reshaping/__init__.py new file mode 100644 index 00000000..a04f110d --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .image_body_reshaping import ImageBodyReshaping + +else: + _import_structure = {'image_body_reshaping': ['ImageBodyReshaping']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py b/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py new file mode 100644 index 00000000..4aed8d98 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/image_body_reshaping.py @@ -0,0 +1,128 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .model import FlowGenerator +from .person_info import PersonInfo +from .pose_estimator.body import Body +from .slim_utils import image_warp_grid1, resize_on_long_side + +logger = get_logger() + +__all__ = ['ImageBodyReshaping'] + + +@MODELS.register_module( + Tasks.image_body_reshaping, module_name=Models.image_body_reshaping) +class ImageBodyReshaping(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the image body reshaping model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + + if torch.cuda.is_available(): + self.device = torch.device('cuda') + else: + self.device = torch.device('cpu') + + self.degree = 1.0 + self.reshape_model = FlowGenerator(n_channels=16).to(self.device) + model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoints = torch.load(model_path, map_location=torch.device('cpu')) + self.reshape_model.load_state_dict( + checkpoints['state_dict'], strict=True) + self.reshape_model.eval() + logger.info('load body reshaping model done') + + pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth') + self.pose_esti = Body(pose_model_ckpt, self.device) + logger.info('load pose model done') + + def pred_joints(self, img): + if img is None: + return None + small_src, resize_scale = resize_on_long_side(img, 300) + body_joints = self.pose_esti(small_src) + + if body_joints.shape[0] >= 1: + body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale + + return body_joints + + def pred_flow(self, img): + + body_joints = self.pred_joints(img) + small_size = 1200 + + if img.shape[0] > small_size or img.shape[1] > small_size: + _img, _scale = resize_on_long_side(img, small_size) + body_joints[:, :, :2] = body_joints[:, :, :2] * _scale + else: + _img = img + + # We only reshape one person + if body_joints.shape[0] < 1 or body_joints.shape[0] > 1: + return None + + person = PersonInfo(body_joints[0]) + + with torch.no_grad(): + person_pred = person.pred_flow(_img, self.reshape_model, + self.device) + + flow = np.dstack((person_pred['rDx'], person_pred['rDy'])) + + scale = img.shape[0] * 1.0 / flow.shape[0] + + flow = cv2.resize(flow, (img.shape[1], img.shape[0])) + flow *= scale + + return flow + + def warp(self, src_img, flow): + + X_flow = flow[..., 0] + Y_flow = flow[..., 1] + + X_flow = np.ascontiguousarray(X_flow) + Y_flow = np.ascontiguousarray(Y_flow) + + 
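        # image_warp_grid1 (the numba-jitted helper in slim_utils.py) performs a
        # per-pixel backward warp of src_img, sampling the source at
        # (x + dX, y + dY) with bilinear interpolation; transRatio=1.0 applies the
        # flow at full strength and the trailing zeros reserve no border margin.
        # The ascontiguousarray calls above hand C-contiguous copies of the flow
        # channels to the jitted kernel.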
pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0) + return pred + + def inference(self, img): + img = img.cpu().numpy() + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + flow = self.pred_flow(img) + + if flow is None: + return img + + assert flow.shape[:2] == img.shape[:2] + + mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8) + mag -= 3 + mag[mag <= 0] = 0 + + x, y = cv2.polarToCart(mag, ang, angleInDegrees=False) + flow = np.dstack((x, y)) + + flow *= self.degree + pred = self.warp(img, flow) + out_img = np.clip(pred, 0, 255) + logger.info('model inference done') + + return out_img.astype(np.uint8) diff --git a/modelscope/models/cv/image_body_reshaping/model.py b/modelscope/models/cv/image_body_reshaping/model.py new file mode 100644 index 00000000..174428a1 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/model.py @@ -0,0 +1,189 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ConvLayer(nn.Module): + + def __init__(self, in_ch, out_ch): + super(ConvLayer, self).__init__() + + self.conv = nn.Sequential( + nn.ReflectionPad2d(1), + nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0), + nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True)) + + def forward(self, x): + x = self.conv(x) + return x + + +class SASA(nn.Module): + + def __init__(self, in_dim): + super(SASA, self).__init__() + self.chanel_in = in_dim + + self.query_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) + self.key_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1) + self.value_conv = nn.Conv2d( + in_channels=in_dim, out_channels=in_dim, kernel_size=1) + self.mag_conv = nn.Conv2d( + in_channels=5, out_channels=in_dim // 32, kernel_size=1) + + self.gamma = nn.Parameter(torch.zeros(1)) + + self.softmax = nn.Softmax(dim=-1) # + self.sigmoid = nn.Sigmoid() + + def structure_encoder(self, paf_mag, target_height, target_width): + torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True) + torso_mask = torch.clamp(torso_mask, 0, 1) + + arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True) + arms_mask = torch.clamp(arms_mask, 0, 1) + + legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True) + legs_mask = torch.clamp(legs_mask, 0, 1) + + fg_mask = paf_mag[:, 12, :, :].unsqueeze(1) + bg_mask = 1 - fg_mask + Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask), + dim=1) + Y = F.interpolate(Y, size=(target_height, target_width), mode='area') + return Y + + def forward(self, X, PAF_mag): + """extract self-attention features. 
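        The attention is gated by body-structure connectivity: structure_encoder
        collapses the PAF magnitude channels into arms / torso / legs / foreground /
        background masks, a pairwise affinity between spatial positions is computed
        from those masks, and after centring and a sigmoid it is multiplied
        element-wise into the query-key attention map before the values are
        aggregated.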
+ Args: + X : input feature maps( B x C x H x W) + PAF_mag : ( B x C x H x W), 1 denotes connectivity, 0 denotes non-connectivity + + Returns: + out : self attention value + input feature + Y: B X N X N (N is Width*Height) + """ + + m_batchsize, C, height, width = X.size() + + Y = self.structure_encoder(PAF_mag, height, width) + + connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1, + width * height) + affinity = torch.bmm( + connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec) + affinity_centered = affinity - torch.mean(affinity) + affinity_sigmoid = self.sigmoid(affinity_centered) + + proj_query = self.query_conv(X).view(m_batchsize, -1, + width * height).permute(0, 2, 1) + proj_key = self.key_conv(X).view(m_batchsize, -1, width * height) + selfatten_map = torch.bmm(proj_query, proj_key) + selfatten_centered = selfatten_map - torch.mean( + selfatten_map) # centering + selfatten_sigmoid = self.sigmoid(selfatten_centered) + + SASA_map = selfatten_sigmoid * affinity_sigmoid + + proj_value = self.value_conv(X).view(m_batchsize, -1, width * height) + + out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1)) + out = out.view(m_batchsize, C, height, width) + + out = self.gamma * out + X + return out, Y + + +class FlowGenerator(nn.Module): + + def __init__(self, n_channels, deep_supervision=False): + super(FlowGenerator, self).__init__() + self.deep_supervision = deep_supervision + + self.Encoder = nn.Sequential( + ConvLayer(n_channels, 64), + ConvLayer(64, 64), + nn.MaxPool2d(2), + ConvLayer(64, 128), + ConvLayer(128, 128), + nn.MaxPool2d(2), + ConvLayer(128, 256), + ConvLayer(256, 256), + nn.MaxPool2d(2), + ConvLayer(256, 512), + ConvLayer(512, 512), + nn.MaxPool2d(2), + ConvLayer(512, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ConvLayer(1024, 1024), + ) + + self.SASA = SASA(in_dim=1024) + + self.Decoder = nn.Sequential( + ConvLayer(1024, 1024), + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + ConvLayer(1024, 512), + ConvLayer(512, 512), + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + ConvLayer(512, 256), + ConvLayer(256, 256), + ConvLayer(256, 128), + ConvLayer(128, 64), + ConvLayer(64, 32), + nn.Conv2d(32, 2, kernel_size=1, padding=0), + nn.Tanh(), + nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True), + ) + + dilation_ksize = 17 + self.dilation = torch.nn.MaxPool2d( + kernel_size=dilation_ksize, + stride=1, + padding=int((dilation_ksize - 1) / 2)) + + def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2): + n, c, h, w = x.size() + yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)]) + xv = xv.float() / (w - 1) * 2.0 - 1 + yv = yv.float() / (h - 1) * 2.0 - 1 + grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0) + grid = grid.to(flow.device) + grid_x = grid + 2 * flow * coff + warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode) + return warp_x + + def forward(self, img, skeleton_map, coef=0.2): + """extract self-attention features. 
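        The 3-channel image and the 13-channel skeleton map (12 limb channels plus
        one whole-body channel) are concatenated into the 16-channel encoder input.
        PAF magnitude maps are obtained from the skeleton map by max-pool dilation
        and feed the SASA block at the bottleneck; the decoder then regresses a
        2-channel flow field that is applied to the image with grid_sample in warp().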
+ Args: + img : input numpy image + skeleton_map : skeleton map of input image + coef: warp degree + + Returns: + warp_x : warped image + flow: predicted flow + """ + + img_concat = torch.cat((img, skeleton_map), dim=1) + X = self.Encoder(img_concat) + + _, _, height, width = X.size() + + # directly get PAF magnitude from skeleton maps via dilation + PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5) + + out, Y = self.SASA(X, PAF_mag) + flow = self.Decoder(out) + + flow = flow.permute(0, 2, 3, 1) # [n, 2, h, w] ==> [n, h, w, 2] + + warp_x = self.warp(img, flow, coff=coef) + warp_x = torch.clamp(warp_x, min=-1.0, max=1.0) + + return warp_x, flow diff --git a/modelscope/models/cv/image_body_reshaping/person_info.py b/modelscope/models/cv/image_body_reshaping/person_info.py new file mode 100644 index 00000000..509a2ce3 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/person_info.py @@ -0,0 +1,339 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy + +import cv2 +import numpy as np +import torch + +from .slim_utils import (enlarge_box_tblr, gen_skeleton_map, + get_map_fusion_map_cuda, get_mask_bbox, + resize_on_long_side) + + +class PersonInfo(object): + + def __init__(self, joints): + self.joints = joints + self.flow = None + self.pad_boder = False + self.height_expand = 0 + self.width_expand = 0 + self.coeff = 0.2 + self.network_input_W = 256 + self.network_input_H = 256 + self.divider = 20 + self.flow_scales = ['upper_2'] + + def update_attribute(self, pad_boder, height_expand, width_expand): + self.pad_boder = pad_boder + self.height_expand = height_expand + self.width_expand = width_expand + if pad_boder: + self.joints[:, 0] += width_expand + self.joints[:, 1] += height_expand + + def pred_flow(self, img, flow_net, device): + with torch.no_grad(): + if img is None: + print('image is none') + self.flow = None + + if len(img.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + if self.pad_boder: + height_expand = self.height_expand + width_expand = self.width_expand + pad_img = cv2.copyMakeBorder( + img, + height_expand, + height_expand, + width_expand, + width_expand, + cv2.BORDER_CONSTANT, + value=(127, 127, 127)) + + else: + height_expand = 0 + width_expand = 0 + pad_img = img.copy() + + canvas = np.zeros( + shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32) + + self.human_joint_box = self.__joint_to_body_box() + + self.human_box = enlarge_box_tblr( + self.human_joint_box, pad_img, ratio=0.25) + human_box_height = self.human_box[1] - self.human_box[0] + human_box_width = self.human_box[3] - self.human_box[2] + + self.leg_joint_box = self.__joint_to_leg_box() + self.leg_box = enlarge_box_tblr( + self.leg_joint_box, pad_img, ratio=0.25) + + self.arm_joint_box = self.__joint_to_arm_box() + self.arm_box = enlarge_box_tblr( + self.arm_joint_box, pad_img, ratio=0.1) + + x_flows = [] + y_flows = [] + multi_bbox = [] + + for scale in self.flow_scales: # better for metric + scale_value = float(scale.split('_')[-1]) + + arm_box = copy.deepcopy(self.arm_box) + + if arm_box[0] is None: + arm_box = self.human_box + + arm_box_height = arm_box[1] - arm_box[0] + arm_box_width = arm_box[3] - arm_box[2] + + roi_bbox = None + + if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1: + roi_bbox = self.human_box + else: + arm_box = enlarge_box_tblr( + arm_box, pad_img, ratio=scale_value) + if scale == 'upper_0.2': + arm_box[0] = min(arm_box[0], int(self.joints[0][1])) + if scale.startswith('upper'): + roi_bbox = [ + 
max(self.human_box[0], arm_box[0]), + min(self.human_box[1], arm_box[1]), + max(self.human_box[2], arm_box[2]), + min(self.human_box[3], arm_box[3]) + ] + if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ + 3] - roi_bbox[2] < 1: + continue + + elif scale.startswith('lower'): + roi_bbox = [ + max(self.human_box[0], self.leg_box[0]), + min(self.human_box[1], self.leg_box[1]), + max(self.human_box[2], self.leg_box[2]), + min(self.human_box[3], self.leg_box[3]) + ] + + if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[ + 3] - roi_bbox[2] < 1: + continue + + skel_map, roi_bbox = gen_skeleton_map( + self.joints, 'depth', input_roi_box=roi_bbox) + + if roi_bbox is None: + continue + + if skel_map.dtype != np.float32: + skel_map = skel_map.astype(np.float32) + + skel_map -= 1.0 # [0,2] ->[-1,1] + + multi_bbox.append(roi_bbox) + + roi_bbox_height = roi_bbox[1] - roi_bbox[0] + roi_bbox_width = roi_bbox[3] - roi_bbox[2] + + assert skel_map.shape[0] == roi_bbox_height + assert skel_map.shape[1] == roi_bbox_width + roi_height_pad = roi_bbox_height // self.divider + roi_width_pad = roi_bbox_width // self.divider + paded_roi_h = roi_bbox_height + 2 * roi_height_pad + paded_roi_w = roi_bbox_width + 2 * roi_width_pad + + roi_height_pad_joint = skel_map.shape[0] // self.divider + roi_width_pad_joint = skel_map.shape[1] // self.divider + skel_map = np.pad( + skel_map, + ((roi_height_pad_joint, roi_height_pad_joint), + (roi_width_pad_joint, roi_width_pad_joint), (0, 0)), + 'constant', + constant_values=-1) + + skel_map_resized = cv2.resize( + skel_map, (self.network_input_W, self.network_input_H)) + + skel_map_resized[skel_map_resized < 0] = -1.0 + skel_map_resized[skel_map_resized > -0.5] = 1.0 + skel_map_transformed = torch.from_numpy( + skel_map_resized.transpose((2, 0, 1))) + + roi_npy = pad_img[roi_bbox[0]:roi_bbox[1], + roi_bbox[2]:roi_bbox[3], :].copy() + if roi_npy.dtype != np.float32: + roi_npy = roi_npy.astype(np.float32) + + roi_npy = np.pad(roi_npy, + ((roi_height_pad, roi_height_pad), + (roi_width_pad, roi_width_pad), (0, 0)), + 'edge') + + roi_npy = roi_npy[:, :, ::-1] + + roi_npy = cv2.resize( + roi_npy, (self.network_input_W, self.network_input_H)) + + roi_npy *= 1.0 / 255 + roi_npy -= 0.5 + roi_npy *= 2 + + rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1))) + + rgb_tensor = rgb_tensor.unsqueeze(0).to(device) + skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device) + warped_img_val, flow_field_val = flow_net( + rgb_tensor, skel_map_tensor + ) # inference, connectivity_mask [1,12,16,16] + flow_field_val = flow_field_val.detach().squeeze().cpu().numpy( + ) + + flow_field_val = cv2.resize( + flow_field_val, (paded_roi_w, paded_roi_h), + interpolation=cv2.INTER_LINEAR) + flow_field_val[..., 0] = flow_field_val[ + ..., 0] * paded_roi_w * 0.5 * 2 * self.coeff + flow_field_val[..., 1] = flow_field_val[ + ..., 1] * paded_roi_h * 0.5 * 2 * self.coeff + + # remove pad areas + flow_field_val = flow_field_val[ + roi_height_pad:flow_field_val.shape[0] - roi_height_pad, + roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :] + + diffuse_width = max(roi_bbox_width // 3, 1) + diffuse_height = max(roi_bbox_height // 3, 1) + assert roi_bbox_width == flow_field_val.shape[1] + assert roi_bbox_height == flow_field_val.shape[0] + + origin_flow = np.zeros( + (pad_img.shape[0] + 2 * diffuse_height, + pad_img.shape[1] + 2 * diffuse_width, 2), + dtype=np.float32) + + flow_field_val = np.pad(flow_field_val, + ((diffuse_height, diffuse_height), + (diffuse_width, diffuse_width), + (0, 0)), 'linear_ramp') + + 
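                # Paste the locally predicted flow back into a full-image canvas.
                # The 'linear_ramp' padding above lets the flow fade to zero over
                # the diffuse margin so no seam appears at the ROI border; the
                # temporary margin is cropped off again just below.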
origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height, + roi_bbox[2]:roi_bbox[3] + + 2 * diffuse_width] = flow_field_val + + origin_flow = origin_flow[diffuse_height:-diffuse_height, + diffuse_width:-diffuse_width, :] + + x_flows.append(origin_flow[..., 0]) + y_flows.append(origin_flow[..., 1]) + + if len(x_flows) == 0: + return { + 'rDx': np.zeros(canvas.shape[:2], dtype=np.float32), + 'rDy': np.zeros(canvas.shape[:2], dtype=np.float32), + 'multi_bbox': multi_bbox, + 'x_fusion_map': + np.ones(canvas.shape[:2], dtype=np.float32), + 'y_fusion_map': + np.ones(canvas.shape[:2], dtype=np.float32) + } + else: + origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = self.blend_multiscale_flow( + x_flows, y_flows, device=device) + + return { + 'rDx': origin_rDx, + 'rDy': origin_rDy, + 'multi_bbox': multi_bbox, + 'x_fusion_map': x_fusion_map, + 'y_fusion_map': y_fusion_map + } + + @staticmethod + def blend_multiscale_flow(x_flows, y_flows, device=None): + scale_num = len(x_flows) + if scale_num == 1: + return x_flows[0], y_flows[0], np.ones_like( + x_flows[0]), np.ones_like(x_flows[0]) + + origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]), + dtype=np.float32) + origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]), + dtype=np.float32) + + x_fusion_map, x_acc_map = get_map_fusion_map_cuda( + x_flows, 1, device=device) + y_fusion_map, y_acc_map = get_map_fusion_map_cuda( + y_flows, 1, device=device) + + x_flow_map = 1.0 / x_fusion_map + y_flow_map = 1.0 / y_fusion_map + + all_acc_map = x_acc_map + y_acc_map + all_acc_map = all_acc_map.astype(np.uint8) + roi_box = get_mask_bbox(all_acc_map, threshold=1) + + if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[ + 3] - roi_box[2] <= 0: + roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]] + + roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] + roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] + + roi_width = roi_x_flow_map.shape[1] + roi_height = roi_x_flow_map.shape[0] + + roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320) + roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320) + + roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55)) + roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55)) + + roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height)) + roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height)) + + x_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] = roi_x_flow_map + y_flow_map[roi_box[0]:roi_box[1], + roi_box[2]:roi_box[3]] = roi_y_flow_map + + for i in range(scale_num): + origin_rDx += x_flows[i] + origin_rDy += y_flows[i] + + origin_rDx *= x_flow_map + origin_rDy *= y_flow_map + + return origin_rDx, origin_rDy, x_flow_map, y_flow_map + + def __joint_to_body_box(self): + joint_left = int(np.min(self.joints, axis=0)[0]) + joint_right = int(np.max(self.joints, axis=0)[0]) + joint_top = int(np.min(self.joints, axis=0)[1]) + joint_bottom = int(np.max(self.joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] + + def __joint_to_leg_box(self): + leg_joints = self.joints[8:, :] + if np.max(leg_joints, axis=0)[2] < 0.05: + return [0, 0, 0, 0] + joint_left = int(np.min(leg_joints, axis=0)[0]) + joint_right = int(np.max(leg_joints, axis=0)[0]) + joint_top = int(np.min(leg_joints, axis=0)[1]) + joint_bottom = int(np.max(leg_joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] + + def __joint_to_arm_box(self): + arm_joints = self.joints[2:8, :] + 
if np.max(arm_joints, axis=0)[2] < 0.05: + return [0, 0, 0, 0] + joint_left = int(np.min(arm_joints, axis=0)[0]) + joint_right = int(np.max(arm_joints, axis=0)[0]) + joint_top = int(np.min(arm_joints, axis=0)[1]) + joint_bottom = int(np.max(arm_joints, axis=0)[1]) + return [joint_top, joint_bottom, joint_left, joint_right] diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py new file mode 100644 index 00000000..45b02724 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/body.py @@ -0,0 +1,272 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. + +import math + +import cv2 +import numpy as np +import torch +from scipy.ndimage.filters import gaussian_filter + +from .model import BodyposeModel +from .util import pad_rightdown_corner, transfer + + +class Body(object): + + def __init__(self, model_path, device): + self.model = BodyposeModel().to(device) + model_dict = transfer(self.model, torch.load(model_path)) + self.model.load_state_dict(model_dict) + self.model.eval() + + def __call__(self, oriImg): + scale_search = [0.5] + boxsize = 368 + stride = 8 + padValue = 128 + thre1 = 0.1 + thre2 = 0.05 + bodyparts = 18 + multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search] + heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19)) + paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38)) + + for m in range(len(multiplier)): + scale = multiplier[m] + imageToTest = cv2.resize( + oriImg, (0, 0), + fx=scale, + fy=scale, + interpolation=cv2.INTER_CUBIC) + imageToTest_padded, pad = pad_rightdown_corner( + imageToTest, stride, padValue) + im = np.transpose( + np.float32(imageToTest_padded[:, :, :, np.newaxis]), + (3, 2, 0, 1)) / 256 - 0.5 + im = np.ascontiguousarray(im) + + data = torch.from_numpy(im).float() + if torch.cuda.is_available(): + data = data.cuda() + with torch.no_grad(): + Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data) + Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy() + Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy() + + # extract outputs, resize, and remove padding + heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), + (1, 2, 0)) # output 1 is heatmaps + heatmap = cv2.resize( + heatmap, (0, 0), + fx=stride, + fy=stride, + interpolation=cv2.INTER_CUBIC) + heatmap = heatmap[:imageToTest_padded.shape[0] + - pad[2], :imageToTest_padded.shape[1] + - pad[3], :] + heatmap = cv2.resize( + heatmap, (oriImg.shape[1], oriImg.shape[0]), + interpolation=cv2.INTER_CUBIC) + + paf = np.transpose(np.squeeze(Mconv7_stage6_L1), + (1, 2, 0)) # output 0 is PAFs + paf = cv2.resize( + paf, (0, 0), + fx=stride, + fy=stride, + interpolation=cv2.INTER_CUBIC) + paf = paf[:imageToTest_padded.shape[0] + - pad[2], :imageToTest_padded.shape[1] - pad[3], :] + paf = cv2.resize( + paf, (oriImg.shape[1], oriImg.shape[0]), + interpolation=cv2.INTER_CUBIC) + + heatmap_avg += heatmap_avg + heatmap / len(multiplier) + paf_avg += +paf / len(multiplier) + + all_peaks = [] + peak_counter = 0 + + for part in range(bodyparts): + map_ori = heatmap_avg[:, :, part] + one_heatmap = gaussian_filter(map_ori, sigma=3) + + map_left = np.zeros(one_heatmap.shape) + map_left[1:, :] = one_heatmap[:-1, :] + map_right = np.zeros(one_heatmap.shape) + 
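            # The four shifted copies (left / right / up / down) implement a simple
            # non-maximum suppression: a pixel of the blurred heatmap becomes a
            # keypoint candidate only if it is >= all four neighbours and above thre1.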
map_right[:-1, :] = one_heatmap[1:, :] + map_up = np.zeros(one_heatmap.shape) + map_up[:, 1:] = one_heatmap[:, :-1] + map_down = np.zeros(one_heatmap.shape) + map_down[:, :-1] = one_heatmap[:, 1:] + + peaks_binary = np.logical_and.reduce( + (one_heatmap >= map_left, one_heatmap >= map_right, + one_heatmap >= map_up, one_heatmap >= map_down, + one_heatmap > thre1)) + peaks = list( + zip(np.nonzero(peaks_binary)[1], + np.nonzero(peaks_binary)[0])) # note reverse + peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks] + peak_id = range(peak_counter, peak_counter + len(peaks)) + peaks_with_score_and_id = [ + peaks_with_score[i] + (peak_id[i], ) + for i in range(len(peak_id)) + ] + + all_peaks.append(peaks_with_score_and_id) + peak_counter += len(peaks) + + # find connection in the specified sequence, center 29 is in the position 15 + limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], + [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], + [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]] + # the middle joints heatmap correpondence + mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], + [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30], + [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], + [45, 46]] + + connection_all = [] + special_k = [] + mid_num = 10 + + for k in range(len(mapIdx)): + score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]] + candA = all_peaks[limbSeq[k][0] - 1] + candB = all_peaks[limbSeq[k][1] - 1] + nA = len(candA) + nB = len(candB) + if (nA != 0 and nB != 0): + connection_candidate = [] + for i in range(nA): + for j in range(nB): + vec = np.subtract(candB[j][:2], candA[i][:2]) + norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1]) + norm = max(0.001, norm) + vec = np.divide(vec, norm) + + startend = list( + zip( + np.linspace( + candA[i][0], candB[j][0], num=mid_num), + np.linspace( + candA[i][1], candB[j][1], num=mid_num))) + + vec_x = np.array([ + score_mid[int(round(startend[item][1])), + int(round(startend[item][0])), 0] + for item in range(len(startend)) + ]) + vec_y = np.array([ + score_mid[int(round(startend[item][1])), + int(round(startend[item][0])), 1] + for item in range(len(startend)) + ]) + + score_midpts = np.multiply( + vec_x, vec[0]) + np.multiply(vec_y, vec[1]) + temp1 = sum(score_midpts) / len(score_midpts) + temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0) + score_with_dist_prior = temp1 + temp2 + criterion1 = len(np.nonzero( + score_midpts > thre2)[0]) > 0.8 * len(score_midpts) + criterion2 = score_with_dist_prior > 0 + if criterion1 and criterion2: + connection_candidate.append([ + i, j, score_with_dist_prior, + score_with_dist_prior + candA[i][2] + + candB[j][2] + ]) + + connection_candidate = sorted( + connection_candidate, key=lambda x: x[2], reverse=True) + connection = np.zeros((0, 5)) + for c in range(len(connection_candidate)): + i, j, s = connection_candidate[c][0:3] + if (i not in connection[:, 3] + and j not in connection[:, 4]): + connection = np.vstack( + [connection, [candA[i][3], candB[j][3], s, i, j]]) + if (len(connection) >= min(nA, nB)): + break + + connection_all.append(connection) + else: + special_k.append(k) + connection_all.append([]) + + # last number in each row is the total parts number of that person + # the second last number in each row is the score of the overall configuration + subset = -1 * np.ones((0, 20)) + candidate = np.array( + [item for sublist in all_peaks for item in sublist]) + + for k in range(len(mapIdx)): + if k not in special_k: + 
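                # Assemble limbs into people: each row of `subset` is one person
                # (columns 0-17 index into `candidate`, column 18 holds the summed
                # score, column 19 the number of parts found). A connection that
                # shares a joint with an existing row extends that row, a connection
                # bridging two disjoint rows merges them, and sparse or low-score
                # rows are deleted afterwards.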
partAs = connection_all[k][:, 0] + partBs = connection_all[k][:, 1] + indexA, indexB = np.array(limbSeq[k]) - 1 + + for i in range(len(connection_all[k])): # = 1:size(temp,1) + found = 0 + subset_idx = [-1, -1] + for j in range(len(subset)): # 1:size(subset,1): + if subset[j][indexA] == partAs[i] or subset[j][ + indexB] == partBs[i]: + subset_idx[found] = j + found += 1 + + if found == 1: + j = subset_idx[0] + if subset[j][indexB] != partBs[i]: + subset[j][indexB] = partBs[i] + subset[j][-1] += 1 + subset[j][-2] += candidate[ + partBs[i].astype(int), + 2] + connection_all[k][i][2] + elif found == 2: # if found 2 and disjoint, merge them + j1, j2 = subset_idx + tmp1 = (subset[j1] >= 0).astype(int) + tmp2 = (subset[j2] >= 0).astype(int) + membership = (tmp1 + tmp2)[:-2] + if len(np.nonzero(membership == 2)[0]) == 0: # merge + subset[j1][:-2] += (subset[j2][:-2] + 1) + subset[j1][-2:] += subset[j2][-2:] + subset[j1][-2] += connection_all[k][i][2] + subset = np.delete(subset, j2, 0) + else: # as like found == 1 + subset[j1][indexB] = partBs[i] + subset[j1][-1] += 1 + subset[j1][-2] += candidate[ + partBs[i].astype(int), + 2] + connection_all[k][i][2] + + # if find no partA in the subset, create a new subset + elif not found and k < 17: + row = -1 * np.ones(20) + row[indexA] = partAs[i] + row[indexB] = partBs[i] + row[-1] = 2 + row[-2] = sum( + candidate[connection_all[k][i, :2].astype(int), + 2]) + connection_all[k][i][2] + subset = np.vstack([subset, row]) + # delete some rows of subset which has few parts occur + deleteIdx = [] + for i in range(len(subset)): + if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4: + deleteIdx.append(i) + subset = np.delete(subset, deleteIdx, axis=0) + + # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts + # candidate: x, y, score, id + count = subset.shape[0] + joints = np.zeros(shape=(count, bodyparts, 3)) + + for i in range(count): + for j in range(bodyparts): + joints[i, j, :3] = candidate[int(subset[i, j]), :3] + confidence = 1.0 if subset[i, j] >= 0 else 0.0 + joints[i, j, 2] *= confidence + return joints diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py new file mode 100644 index 00000000..12f6e84d --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/model.py @@ -0,0 +1,141 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. 
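For orientation, BodyposeModel below is the standard six-stage OpenPose body network: branch L1 regresses 38 part-affinity-field channels and branch L2 regresses 19 keypoint heatmaps, both at 1/8 of the input resolution. A quick shape check, assuming the module path added by this patch (an illustrative sketch, not part of the patch):

    import torch
    from modelscope.models.cv.image_body_reshaping.pose_estimator.model import BodyposeModel

    model = BodyposeModel().eval()
    with torch.no_grad():
        # random weights are fine for checking output shapes
        paf, heatmap = model(torch.zeros(1, 3, 368, 368))
    print(paf.shape)      # torch.Size([1, 38, 46, 46])  -> 19 limbs x (dx, dy)
    print(heatmap.shape)  # torch.Size([1, 19, 46, 46])  -> 18 keypoints + background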
+ +from collections import OrderedDict + +import torch +import torch.nn as nn + + +def make_layers(block, no_relu_layers): + layers = [] + for layer_name, v in block.items(): + if 'pool' in layer_name: + layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2]) + layers.append((layer_name, layer)) + else: + conv2d = nn.Conv2d( + in_channels=v[0], + out_channels=v[1], + kernel_size=v[2], + stride=v[3], + padding=v[4]) + layers.append((layer_name, conv2d)) + if layer_name not in no_relu_layers: + layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) + + return nn.Sequential(OrderedDict(layers)) + + +class BodyposeModel(nn.Module): + + def __init__(self): + super(BodyposeModel, self).__init__() + + # these layers have no relu layer + no_relu_layers = [ + 'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1', + 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2', + 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1', + 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1' + ] + blocks = {} + block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]), + ('conv1_2', [64, 64, 3, 1, 1]), + ('pool1_stage1', [2, 2, 0]), + ('conv2_1', [64, 128, 3, 1, 1]), + ('conv2_2', [128, 128, 3, 1, 1]), + ('pool2_stage1', [2, 2, 0]), + ('conv3_1', [128, 256, 3, 1, 1]), + ('conv3_2', [256, 256, 3, 1, 1]), + ('conv3_3', [256, 256, 3, 1, 1]), + ('conv3_4', [256, 256, 3, 1, 1]), + ('pool3_stage1', [2, 2, 0]), + ('conv4_1', [256, 512, 3, 1, 1]), + ('conv4_2', [512, 512, 3, 1, 1]), + ('conv4_3_CPM', [512, 256, 3, 1, 1]), + ('conv4_4_CPM', [256, 128, 3, 1, 1])]) + + # Stage 1 + block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])]) + + block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]), + ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]), + ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])]) + blocks['block1_1'] = block1_1 + blocks['block1_2'] = block1_2 + + self.model0 = make_layers(block0, no_relu_layers) + + # Stages 2 - 6 + for i in range(2, 7): + blocks['block%d_1' % i] = OrderedDict([ + ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]), + ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]), + ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0]) + ]) + + blocks['block%d_2' % i] = OrderedDict([ + ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]), + ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]), + ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]), + ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0]) + ]) + + for k in blocks.keys(): + blocks[k] = make_layers(blocks[k], no_relu_layers) + + self.model1_1 = blocks['block1_1'] + self.model2_1 = blocks['block2_1'] + self.model3_1 = blocks['block3_1'] + self.model4_1 = blocks['block4_1'] + self.model5_1 = blocks['block5_1'] + self.model6_1 = blocks['block6_1'] + + self.model1_2 = blocks['block1_2'] + self.model2_2 = blocks['block2_2'] + self.model3_2 = blocks['block3_2'] + self.model4_2 = blocks['block4_2'] + self.model5_2 = blocks['block5_2'] + self.model6_2 = 
blocks['block6_2'] + + def forward(self, x): + + out1 = self.model0(x) + + out1_1 = self.model1_1(out1) + out1_2 = self.model1_2(out1) + out2 = torch.cat([out1_1, out1_2, out1], 1) + + out2_1 = self.model2_1(out2) + out2_2 = self.model2_2(out2) + out3 = torch.cat([out2_1, out2_2, out1], 1) + + out3_1 = self.model3_1(out3) + out3_2 = self.model3_2(out3) + out4 = torch.cat([out3_1, out3_2, out1], 1) + + out4_1 = self.model4_1(out4) + out4_2 = self.model4_2(out4) + out5 = torch.cat([out4_1, out4_2, out1], 1) + + out5_1 = self.model5_1(out5) + out5_2 = self.model5_2(out5) + out6 = torch.cat([out5_1, out5_2, out1], 1) + + out6_1 = self.model6_1(out6) + out6_2 = self.model6_2(out6) + + return out6_1, out6_2 diff --git a/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py b/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py new file mode 100644 index 00000000..13a42074 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/pose_estimator/util.py @@ -0,0 +1,33 @@ +# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose. +import numpy as np + + +def pad_rightdown_corner(img, stride, padValue): + h = img.shape[0] + w = img.shape[1] + + pad = 4 * [None] + pad[0] = 0 # up + pad[1] = 0 # left + pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down + pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right + + img_padded = img + pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1)) + img_padded = np.concatenate((pad_up, img_padded), axis=0) + pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1)) + img_padded = np.concatenate((pad_left, img_padded), axis=1) + pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1)) + img_padded = np.concatenate((img_padded, pad_down), axis=0) + pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1)) + img_padded = np.concatenate((img_padded, pad_right), axis=1) + + return img_padded, pad + + +def transfer(model, model_weights): + transfered_model_weights = {} + for weights_name in model.state_dict().keys(): + transfered_model_weights[weights_name] = model_weights['.'.join( + weights_name.split('.')[1:])] + return transfered_model_weights diff --git a/modelscope/models/cv/image_body_reshaping/slim_utils.py b/modelscope/models/cv/image_body_reshaping/slim_utils.py new file mode 100644 index 00000000..23d5a741 --- /dev/null +++ b/modelscope/models/cv/image_body_reshaping/slim_utils.py @@ -0,0 +1,507 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
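Several call sites in this patch (pose estimation at 300 px, flow prediction at 1200 px, flow-map blending at 320 px) rely on the scale returned by resize_on_long_side below: it maps original-image coordinates into the resized image, so callers divide by it to get back. A small illustration under that assumption (sketch only, assuming the module path added by this patch):

    import numpy as np
    from modelscope.models.cv.image_body_reshaping.slim_utils import resize_on_long_side

    img = np.zeros((900, 1600, 3), dtype=np.uint8)   # H x W x 3
    small, scale = resize_on_long_side(img, 300)
    print(small.shape, scale)                        # (168, 300, 3) 0.1875
    # pred_joints() in image_body_reshaping.py divides the detected joints by this
    # scale to return them in original-image coordinates.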
+ +import math +import os +import random + +import cv2 +import numba +import numpy as np +import torch + + +def resize_on_long_side(img, long_side=800): + src_height = img.shape[0] + src_width = img.shape[1] + + if src_height > src_width: + scale = long_side * 1.0 / src_height + _img = cv2.resize( + img, (int(src_width * scale), long_side), + interpolation=cv2.INTER_LINEAR) + else: + scale = long_side * 1.0 / src_width + _img = cv2.resize( + img, (long_side, int(src_height * scale)), + interpolation=cv2.INTER_LINEAR) + + return _img, scale + + +def point_in_box(pt, box): + pt_x = pt[0] + pt_y = pt[1] + + if pt_x >= box[0] and pt_x <= box[0] + box[2] and pt_y >= box[ + 1] and pt_y <= box[1] + box[3]: + return True + else: + return False + + +def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True): + if roi_bbox is None or None in roi_bbox: + return [None, None, None, None] + + top = roi_bbox[0] + bottom = roi_bbox[1] + left = roi_bbox[2] + right = roi_bbox[3] + + roi_width = roi_bbox[3] - roi_bbox[2] + roi_height = roi_bbox[1] - roi_bbox[0] + right = left + roi_width + bottom = top + roi_height + + long_side = roi_width if roi_width > roi_height else roi_height + + if use_long_side: + new_left = left - int(long_side * ratio) + else: + new_left = left - int(roi_width * ratio) + new_left = 1 if new_left < 0 else new_left + + if use_long_side: + new_top = top - int(long_side * ratio) + else: + new_top = top - int(roi_height * ratio) + new_top = 1 if new_top < 0 else new_top + + if use_long_side: + new_right = right + int(long_side * ratio) + else: + new_right = right + int(roi_width * ratio) + new_right = mask.shape[1] - 2 if new_right > mask.shape[1] else new_right + + if use_long_side: + new_bottom = bottom + int(long_side * ratio) + else: + new_bottom = bottom + int(roi_height * ratio) + new_bottom = mask.shape[0] - 2 if new_bottom > mask.shape[0] else new_bottom + + bbox = [new_top, new_bottom, new_left, new_right] + return bbox + + +def gen_PAF(image, joints): + + assert joints.shape[0] == 18 + assert joints.shape[1] == 3 + + org_h = image.shape[0] + org_w = image.shape[1] + small_image, resize_scale = resize_on_long_side(image, 120) + + joints[:, :2] = joints[:, :2] * resize_scale + + joint_left = int(np.min(joints, axis=0)[0]) + joint_right = int(np.max(joints, axis=0)[0]) + joint_top = int(np.min(joints, axis=0)[1]) + joint_bottom = int(np.max(joints, axis=0)[1]) + + limb_width = min( + abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6 + + if limb_width % 2 == 0: + limb_width += 1 + kernel_size = limb_width + + part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12), + (12, 13), (8, 9), (9, 10)] + + map_list = [] + mask_list = [] + PAF_all = np.zeros( + shape=(small_image.shape[0], small_image.shape[1], 2), + dtype=np.float32) + for c, pair in enumerate(part_orders): + idx_a_name = pair[0] + idx_b_name = pair[1] + + jointa = joints[idx_a_name] + jointb = joints[idx_b_name] + + confidence_threshold = 0.05 + if jointa[2] > confidence_threshold and jointb[ + 2] > confidence_threshold: + canvas = np.zeros( + shape=(small_image.shape[0], small_image.shape[1]), + dtype=np.uint8) + + canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])), + (int(jointb[0]), int(jointb[1])), + (255, 255, 255), 5) + + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (kernel_size, kernel_size)) + + canvas = cv2.dilate(canvas, kernel, 1) + canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0) + canvas = canvas.astype(np.float32) / 255 + PAF = 
np.zeros( + shape=(small_image.shape[0], small_image.shape[1], 2), + dtype=np.float32) + PAF[..., 0] = jointb[0] - jointa[0] + PAF[..., 1] = jointb[1] - jointa[1] + mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1]) + PAF /= (np.dstack((mag, mag)) + 1e-5) + + single_PAF = PAF * np.dstack((canvas, canvas)) + map_list.append( + cv2.GaussianBlur(single_PAF, + (kernel_size * 3, kernel_size * 3), 0)) + + mask_list.append( + cv2.GaussianBlur(canvas.copy(), + (kernel_size * 3, kernel_size * 3), 0)) + PAF_all = PAF_all * (1.0 - np.dstack( + (canvas, canvas))) + single_PAF + + PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0) + PAF_all = cv2.resize( + PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR) + map_list.append(PAF_all) + return PAF_all, map_list, mask_list + + +def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None): + if type(joints) == list: + joints = np.array(joints) + assert stack_mode == 'column' or stack_mode == 'depth' + + part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), + (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] + + def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0): + jointa = joints[a] + jointb = joints[b] + + temp1 = int((jointa[0] - x_offset) * scale) + temp2 = int((jointa[1] - y_offset) * scale) + temp3 = int((jointb[0] - x_offset) * scale) + temp4 = int((jointb[1] - y_offset) * scale) + + cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width) + + roi_box = input_roi_box + + roi_box_width = roi_box[3] - roi_box[2] + roi_box_height = roi_box[1] - roi_box[0] + short_side_length = min(roi_box_width, roi_box_height) + line_width = short_side_length // 30 + + line_width = max(line_width, 2) + + map_cube = np.zeros( + shape=(roi_box_height, roi_box_width, len(part_orders) + 1), + dtype=np.float32) + + use_line_width = min(5, line_width) + fx = use_line_width * 1.0 / line_width # fx 最大值为1 + + if fx < 0.99: + map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx) + + for c, pair in enumerate(part_orders): + tmp = map_cube[..., c].copy() + link( + tmp, + pair[0], + pair[1], (2.0, 2.0, 2.0), + use_line_width, + scale=fx, + x_offset=roi_box[2], + y_offset=roi_box[0]) + map_cube[..., c] = tmp + + tmp = map_cube[..., -1].copy() + link( + tmp, + pair[0], + pair[1], (2.0, 2.0, 2.0), + use_line_width, + scale=fx, + x_offset=roi_box[2], + y_offset=roi_box[0]) + map_cube[..., -1] = tmp + + map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height)) + + if stack_mode == 'depth': + return map_cube, roi_box + elif stack_mode == 'column': + joint_maps = [] + for c in range(len(part_orders) + 1): + joint_maps.append(map_cube[..., c]) + joint_map = np.column_stack(joint_maps) + + return joint_map, roi_box + + +def plot_one_box(x, img, color=None, label=None, line_thickness=None): + tl = line_thickness or round( + 0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness + color = color or [random.randint(0, 255) for _ in range(3)] + c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3])) + cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA) + if label: + tf = max(tl - 1, 1) # font thickness + t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0] + c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3 + cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled + cv2.putText( + img, + label, (c1[0], c1[1] - 2), + 0, + tl / 3, [225, 255, 255], + thickness=tf, + lineType=cv2.LINE_AA) + + +def draw_line(im, points, color, stroke_size=2, closed=False): + 
points = points.astype(np.int32) + for i in range(len(points) - 1): + cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color, + stroke_size) + if closed: + cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size) + + +def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2): + left = bbox[0] + top = bbox[1] + + right = bbox[2] + bottom = bbox[3] + + roi_width = right - left + roi_height = bottom - top + + new_left = left - int(roi_width * enlarge_ratio) + new_left = 0 if new_left < 0 else new_left + + new_top = top - int(roi_height * enlarge_ratio) + new_top = 0 if new_top < 0 else new_top + + new_right = right + int(roi_width * enlarge_ratio) + new_right = img_width if new_right > img_width else new_right + + new_bottom = bottom + int(roi_height * enlarge_ratio) + new_bottom = img_height if new_bottom > img_height else new_bottom + + bbox = [new_left, new_top, new_right, new_bottom] + + bbox = [int(x) for x in bbox] + + return bbox + + +def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')): + map_list_cuda = [torch.from_numpy(x).to(device) for x in map_list] + map_concat = torch.stack(tuple(map_list_cuda), dim=-1) + + map_concat = torch.abs(map_concat) + + map_concat[map_concat < threshold] = 0 + map_concat[map_concat > 1e-5] = 1.0 + + sum_map = torch.sum(map_concat, dim=2) + a = torch.ones_like(sum_map) + acc_map = torch.where(sum_map > 0, a * 2.0, torch.zeros_like(sum_map)) + + fusion_map = torch.where(sum_map < 0.5, a * 1.5, sum_map) + + fusion_map = fusion_map.float() + acc_map = acc_map.float() + + fusion_map = fusion_map.cpu().numpy().astype(np.float32) + acc_map = acc_map.cpu().numpy().astype(np.float32) + + return fusion_map, acc_map + + +def gen_border_shade(height, width, height_band, width_band): + height_ratio = height_band * 1.0 / height + width_ratio = width_band * 1.0 / width + + _height_band = int(256 * height_ratio) + _width_band = int(256 * width_ratio) + + canvas = np.zeros((256, 256), dtype=np.float32) + + canvas[_height_band // 2:-_height_band // 2, + _width_band // 2:-_width_band // 2] = 1.0 + + canvas = cv2.blur(canvas, (_height_band, _width_band)) + + canvas = cv2.resize(canvas, (width, height)) + + return canvas + + +def get_mask_bbox(mask, threshold=127): + ret, mask = cv2.threshold(mask, threshold, 1, 0) + + if cv2.countNonZero(mask) == 0: + return [None, None, None, None] + + col_acc = np.sum(mask, 0) + row_acc = np.sum(mask, 1) + + col_acc = col_acc.tolist() + row_acc = row_acc.tolist() + + for x in range(len(col_acc)): + if col_acc[x] > 0: + left = x + break + + for x in range(1, len(col_acc)): + if col_acc[-x] > 0: + right = len(col_acc) - x + break + + for x in range(len(row_acc)): + if row_acc[x] > 0: + top = x + break + + for x in range(1, len(row_acc)): + if row_acc[-x] > 0: + bottom = len(row_acc[::-1]) - x + break + return [top, bottom, left, right] + + +def visualize_flow(flow): + h, w = flow.shape[:2] + hsv = np.zeros((h, w, 3), np.uint8) + mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1]) + + hsv[..., 0] = ang * 180 / np.pi / 2 + hsv[..., 1] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX) + hsv[..., 2] = 255 + bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) + bgr = bgr * 1.0 / 255 + return bgr.astype(np.float32) + + +def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1): + + part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3), + (3, 4), (11, 12), (12, 13), (8, 9), (9, 10)] + + abandon_idxs = [0, 1, 14, 15, 16, 17] + # draw joints + for i, joint in 
enumerate(joints): + if i in abandon_idxs: + continue + if joint[-1] > confidence_threshold: + + cv2.circle(image, (int(joint[0]), int(joint[1])), 1, color, 2) + if show_text: + cv2.putText(image, + str(i) + '[{:.2f}]'.format(joint[-1]), + (int(joint[0]), int(joint[1])), + cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) + # draw link + for pair in part_orders: + if joints[pair[0]][-1] > confidence_threshold and joints[ + pair[1]][-1] > confidence_threshold: + cv2.line(image, (int(joints[pair[0]][0]), int(joints[pair[0]][1])), + (int(joints[pair[1]][0]), int(joints[pair[1]][1])), color, + 2) + return image + + +def get_heatmap_cv(img, magn, max_flow_mag): + min_flow_mag = .5 + cv_magn = np.clip( + 255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7), + a_min=0, + a_max=255).astype(np.uint8) + if img.dtype != np.uint8: + img = (255 * img).astype(np.uint8) + + heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET) + heatmap_img = heatmap_img[..., ::-1] + + h, w = magn.shape + img_alpha = np.ones((h, w), dtype=np.double)[:, :, None] + heatmap_alpha = np.clip( + magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7 + heatmap_alpha[heatmap_alpha < .2]**.5 + pm_hm = heatmap_img * heatmap_alpha + pm_img = img * img_alpha + cv_out = pm_hm + pm_img * (1 - heatmap_alpha) + cv_out = np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8) + + return cv_out + + +def save_heatmap_cv(img, flow, supression=2): + + flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2) + flow_magn -= supression + flow_magn[flow_magn <= 0] = 0 + cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3) + return cv_out + + +@numba.jit(nopython=True, parallel=False) +def bilinear_interp(x, y, v11, v12, v21, v22): + temp1 = (v11 * (1 - y) + v12 * y) * (1 - x) + temp2 = (v21 * (1 - y) + v22 * y) * x + result = temp1 + temp2 + return result + + +@numba.jit(nopython=True, parallel=False) +def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand, + height_expand): + srcW = oriImg.shape[1] + srcH = oriImg.shape[0] + + newImg = oriImg.copy() + + for i in range(srcH): + for j in range(srcW): + _i = i + _j = j + + deltaX = rDx[_i, _j] + deltaY = rDy[_i, _j] + + nx = _j + deltaX * transRatio + ny = _i + deltaY * transRatio + + if nx >= srcW - width_expand - 1: + if nx > srcW - 1: + nx = srcW - 1 + + if ny >= srcH - height_expand - 1: + if ny > srcH - 1: + ny = srcH - 1 + + if nx < width_expand: + if nx < 0: + nx = 0 + + if ny < height_expand: + if ny < 0: + ny = 0 + + nxi = int(math.floor(nx)) + nyi = int(math.floor(ny)) + nxi1 = int(math.ceil(nx)) + nyi1 = int(math.ceil(ny)) + + for ll in range(3): + newImg[_i, _j, + ll] = bilinear_interp(ny - nyi, nx - nxi, + oriImg[nyi, nxi, + ll], oriImg[nyi, nxi1, ll], + oriImg[nyi1, nxi, + ll], oriImg[nyi1, nxi1, + ll]) + return newImg diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 717ff4dd..c16e256e 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -184,6 +184,7 @@ TASK_OUTPUTS = { Tasks.image_to_image_translation: [OutputKeys.OUTPUT_IMG], Tasks.image_style_transfer: [OutputKeys.OUTPUT_IMG], Tasks.image_portrait_stylization: [OutputKeys.OUTPUT_IMG], + Tasks.image_body_reshaping: [OutputKeys.OUTPUT_IMG], # live category recognition result for single video # { diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 7fa66b5f..c9a70d14 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -75,6 +75,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 
'damo/nlp_bart_text-error-correction_chinese'), Tasks.image_captioning: (Pipelines.image_captioning, 'damo/ofa_image-caption_coco_large_en'), + Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, + 'damo/cv_flow-based-body-reshaping_damo'), Tasks.image_portrait_stylization: (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), diff --git a/modelscope/pipelines/cv/image_body_reshaping_pipeline.py b/modelscope/pipelines/cv/image_body_reshaping_pipeline.py new file mode 100644 index 00000000..c3600eb5 --- /dev/null +++ b/modelscope/pipelines/cv/image_body_reshaping_pipeline.py @@ -0,0 +1,40 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_body_reshaping, module_name=Pipelines.image_body_reshaping) +class ImageBodyReshapingPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image body reshaping pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + logger.info('body reshaping model init done') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input) + result = {'img': img} + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + output = self.model.inference(input['img']) + result = {'outputs': output} + return result + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + output_img = inputs['outputs'] + return {OutputKeys.OUTPUT_IMG: output_img} diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 5bc27c03..2331dc85 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -60,7 +60,7 @@ class CVTasks(object): image_to_image_generation = 'image-to-image-generation' image_style_transfer = 'image-style-transfer' image_portrait_stylization = 'image-portrait-stylization' - + image_body_reshaping = 'image-body-reshaping' image_embedding = 'image-embedding' product_retrieval_embedding = 'product-retrieval-embedding' diff --git a/requirements/cv.txt b/requirements/cv.txt index 5a2d7763..f907256d 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -13,6 +13,7 @@ ml_collections mmcls>=0.21.0 mmdet>=2.25.0 networkx>=2.5 +numba onnxruntime>=1.10 pai-easycv>=0.6.3.6 pandas diff --git a/tests/pipelines/test_image_body_reshaping.py b/tests/pipelines/test_image_body_reshaping.py new file mode 100644 index 00000000..e1955e94 --- /dev/null +++ b/tests/pipelines/test_image_body_reshaping.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
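For reference, the minimal user-facing call path wired up by this patch (the task constant, default model id and output key all come from the diffs above; an illustrative sketch, not part of the test):

    import cv2
    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    body_reshaping = pipeline(Tasks.image_body_reshaping,
                              model='damo/cv_flow-based-body-reshaping_damo')
    result = body_reshaping('data/test/images/image_body_reshaping.jpg')
    cv2.imwrite('reshaped.png', result[OutputKeys.OUTPUT_IMG])

The unit test below exercises the same path at the configured test levels.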
+import os.path as osp +import unittest + +import cv2 + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageBodyReshapingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_body_reshaping + self.model_id = 'damo/cv_flow-based-body-reshaping_damo' + self.test_image = 'data/test/images/image_body_reshaping.jpg' + + def pipeline_inference(self, pipeline: Pipeline, input_location: str): + result = pipeline(input_location) + if result is not None: + cv2.imwrite('result_bodyreshaping.png', + result[OutputKeys.OUTPUT_IMG]) + print( + f'Output written to {osp.abspath("result_body_reshaping.png")}' + ) + else: + raise Exception('Testing failed: invalid output') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_by_direct_model_download(self): + model_dir = snapshot_download(self.model_id) + image_body_reshaping = pipeline( + Tasks.image_body_reshaping, model=model_dir) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + image_body_reshaping = pipeline( + Tasks.image_body_reshaping, model=self.model_id) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + image_body_reshaping = pipeline(Tasks.image_body_reshaping) + self.pipeline_inference(image_body_reshaping, self.test_image) + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From 12ee711f682b5d90d35eb6c7ec024ccf87ee619a Mon Sep 17 00:00:00 2001 From: "wenqi.oywq" Date: Tue, 11 Oct 2022 11:07:45 +0800 Subject: [PATCH 02/57] [to #42322933]add license header Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10353812 * add license header --- modelscope/models/cv/image_color_enhance/csrnet.py | 3 +++ .../models/cv/image_color_enhance/image_color_enhance.py | 1 + 2 files changed, 4 insertions(+) diff --git a/modelscope/models/cv/image_color_enhance/csrnet.py b/modelscope/models/cv/image_color_enhance/csrnet.py index 782cd528..502abf88 100644 --- a/modelscope/models/cv/image_color_enhance/csrnet.py +++ b/modelscope/models/cv/image_color_enhance/csrnet.py @@ -1,3 +1,6 @@ +# The implementation is adopted from Jingwen He, +# made publicly available at https://github.com/hejingwenhejingwen/CSRNet + import functools import math diff --git a/modelscope/models/cv/image_color_enhance/image_color_enhance.py b/modelscope/models/cv/image_color_enhance/image_color_enhance.py index 382cc152..0bd74197 100644 --- a/modelscope/models/cv/image_color_enhance/image_color_enhance.py +++ b/modelscope/models/cv/image_color_enhance/image_color_enhance.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
import os.path as osp from copy import deepcopy from typing import Dict, Union From 4bd72e528ad9938e131908c5a67920666fcdcae1 Mon Sep 17 00:00:00 2001 From: pangda Date: Tue, 11 Oct 2022 11:14:34 +0800 Subject: [PATCH 03/57] [to #42322933] support restore best checkpoint after training MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 支持训练完成后自动恢复best ckpt,方便在不同测试集上进行测试 2. build_optimizer/build_lr_scheduler改为成员函数,方便重载(如模型分层lr) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10348255 --- modelscope/trainers/hooks/checkpoint_hook.py | 7 +++++++ modelscope/trainers/trainer.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/modelscope/trainers/hooks/checkpoint_hook.py b/modelscope/trainers/hooks/checkpoint_hook.py index 220929b8..c9f51a88 100644 --- a/modelscope/trainers/hooks/checkpoint_hook.py +++ b/modelscope/trainers/hooks/checkpoint_hook.py @@ -216,6 +216,7 @@ class BestCkptSaverHook(CheckpointHook): by_epoch (bool): Save best checkpoints by epoch or by iteration. save_optimizer (bool): Whether to save optimizer state dict. Default: True. save_dir (str): Output directory to save best checkpoint. + restore_best (bool): Whether to restore the best checkpoint after training. """ PRIORITY = Priority.LOW @@ -228,6 +229,7 @@ class BestCkptSaverHook(CheckpointHook): save_optimizer=True, save_dir=None, save_file_name=None, + restore_best=False, interval=0): assert rule in ['max', 'min'], 'Only support "max" or "min" rule now.' super().__init__( @@ -241,6 +243,7 @@ class BestCkptSaverHook(CheckpointHook): self._best_metric = None self._best_ckpt_file = None self.save_file_name = save_file_name + self.restore_best = restore_best def _should_save(self, trainer): return self._is_best_metric(trainer.metric_values) @@ -305,3 +308,7 @@ class BestCkptSaverHook(CheckpointHook): self.logger.warn( 'The state_dict is not available, the best metric value will be affected.' 
) + + def after_run(self, trainer): + if self.restore_best: + self.load_checkpoint(self._best_ckpt_file, trainer) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index a01d9b59..4c21d63f 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -664,6 +664,12 @@ class EpochBasedTrainer(BaseTrainer): dataset = self.to_task_dataset(torch_dataset, mode) return dataset + def build_optimizer(self, cfg: ConfigDict, default_args: dict = None): + return build_optimizer(self.model, cfg=cfg, default_args=default_args) + + def build_lr_scheduler(self, cfg: ConfigDict, default_args: dict = None): + return build_lr_scheduler(cfg=cfg, default_args=default_args) + def create_optimizer_and_scheduler(self): """ Create optimizer and lr scheduler @@ -680,7 +686,7 @@ class EpochBasedTrainer(BaseTrainer): optim_options = {} if optimizer_cfg is not None: optim_options = optimizer_cfg.pop('options', {}) - optimizer = build_optimizer(self.model, cfg=optimizer_cfg) + optimizer = self.build_optimizer(cfg=optimizer_cfg) if lr_scheduler is None: lr_scheduler_cfg = self.cfg.train.get('lr_scheduler', None) @@ -691,7 +697,7 @@ class EpochBasedTrainer(BaseTrainer): if lr_scheduler_cfg is not None: assert optimizer is not None lr_options = lr_scheduler_cfg.pop('options', {}) - lr_scheduler = build_lr_scheduler( + lr_scheduler = self.build_lr_scheduler( cfg=lr_scheduler_cfg, default_args={'optimizer': optimizer}) self.optimizer = optimizer From 333c11c0a61c780a524d1b3b07793cff0d46a8da Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Tue, 11 Oct 2022 14:06:07 +0800 Subject: [PATCH 04/57] [to #42322933] fix: missing type bytes in InputType.AUDIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 已经和谦言讨论过,确认可添加 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10358110 * fix: missing type bytes in InputType.AUDIO --- modelscope/pipeline_inputs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index de9814a7..2b14c278 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -28,7 +28,7 @@ class InputType(object): INPUT_TYPE = { InputType.IMAGE: (str, np.ndarray, Image.Image), InputType.TEXT: str, - InputType.AUDIO: (str, np.ndarray), + InputType.AUDIO: (str, bytes, np.ndarray), InputType.VIDEO: (str, np.ndarray, cv2.VideoCapture), InputType.BOX: (list, np.ndarray), InputType.DICT: (dict, type(None)), From 09d2296f36f1a301dc5e144e00a692dfce2675ee Mon Sep 17 00:00:00 2001 From: "laiyin.lyc" Date: Tue, 11 Oct 2022 16:05:20 +0800 Subject: [PATCH 05/57] [to #44847108] add sparsity hook (pst algorithm) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10198228 * [to #44847108] add sparsity hook (pst algorithm) --- modelscope/metainfo.py | 3 + modelscope/trainers/hooks/__init__.py | 4 +- .../trainers/hooks/compression/__init__.py | 24 ++ .../hooks/compression/sparsity_hook.py | 131 +++++++++++ .../trainers/hooks/compression/utils.py | 208 ++++++++++++++++++ tests/trainers/hooks/compression/__init__.py | 0 .../hooks/compression/test_sparsity_hook.py | 113 ++++++++++ 7 files changed, 482 insertions(+), 1 deletion(-) create mode 100644 modelscope/trainers/hooks/compression/__init__.py create mode 100644 modelscope/trainers/hooks/compression/sparsity_hook.py create mode 100644 modelscope/trainers/hooks/compression/utils.py create mode 100644 tests/trainers/hooks/compression/__init__.py create mode 100644 
tests/trainers/hooks/compression/test_sparsity_hook.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 1b8c4720..77627abc 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -404,6 +404,9 @@ class Hooks(object): IterTimerHook = 'IterTimerHook' EvaluationHook = 'EvaluationHook' + # Compression + SparsityHook = 'SparsityHook' + class LR_Schedulers(object): """learning rate scheduler is defined here diff --git a/modelscope/trainers/hooks/__init__.py b/modelscope/trainers/hooks/__init__.py index f133041b..a2e0cf4b 100644 --- a/modelscope/trainers/hooks/__init__.py +++ b/modelscope/trainers/hooks/__init__.py @@ -6,10 +6,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .builder import HOOKS, build_hook from .checkpoint_hook import BestCkptSaverHook, CheckpointHook + from .compression import SparsityHook from .evaluation_hook import EvaluationHook from .hook import Hook from .iter_timer_hook import IterTimerHook - from .logger import TextLoggerHook, TensorboardHook + from .logger import TensorboardHook, TextLoggerHook from .lr_scheduler_hook import LrSchedulerHook from .optimizer import (ApexAMPOptimizerHook, NoneOptimizerHook, OptimizerHook, TorchAMPOptimizerHook) @@ -19,6 +20,7 @@ else: _import_structure = { 'builder': ['HOOKS', 'build_hook'], 'checkpoint_hook': ['BestCkptSaverHook', 'CheckpointHook'], + 'compression': ['SparsityHook'], 'evaluation_hook': ['EvaluationHook'], 'hook': ['Hook'], 'iter_timer_hook': ['IterTimerHook'], diff --git a/modelscope/trainers/hooks/compression/__init__.py b/modelscope/trainers/hooks/compression/__init__.py new file mode 100644 index 00000000..f755b2ca --- /dev/null +++ b/modelscope/trainers/hooks/compression/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .sparsity_hook import SparsityHook + from .utils import SparseLinear, convert_sparse_network + +else: + _import_structure = { + 'sparsity_hook': ['SparsityHook'], + 'utils': ['convert_sparse_network', 'SparseLinear'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/trainers/hooks/compression/sparsity_hook.py b/modelscope/trainers/hooks/compression/sparsity_hook.py new file mode 100644 index 00000000..993488d8 --- /dev/null +++ b/modelscope/trainers/hooks/compression/sparsity_hook.py @@ -0,0 +1,131 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
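+#
+# Configuration sketch (illustrative; the values mirror the unit test added
+# with this hook, and unlisted keys fall back to the defaults read in
+# __init__ below):
+#
+#     'train': {
+#         'hooks': [{
+#             'type': 'SparsityHook',
+#             'pruning_method': 'pst',
+#             'config': {
+#                 'weight_rank': 1,
+#                 'mask_rank': 1,
+#                 'final_sparsity': 0.9,
+#                 'frequency': 1,
+#             },
+#         }],
+#     }
+#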
+import os + +from modelscope import __version__ +from modelscope.metainfo import Hooks +from modelscope.trainers.hooks.builder import HOOKS +from modelscope.trainers.hooks.hook import Hook +from modelscope.trainers.hooks.priority import Priority +from modelscope.utils.checkpoint import save_checkpoint +from modelscope.utils.torch_utils import is_master + + +@HOOKS.register_module(module_name=Hooks.SparsityHook) +class SparsityHook(Hook): + + PRIORITY = Priority.HIGHEST + + def __init__(self, pruning_method, config={}, save_dir=None): + self.pruning_method = pruning_method + self.save_dir = save_dir + + self.compress_module = config.get('compress_module', []) + self.weight_rank = config.get('weight_rank', 8) + self.weight_beta = config.get('weight_beta', 1) + self.mask_rank = config.get('mask_rank', 8) + self.mask_alpha1 = config.get('mask_alpha1', 1) + self.mask_alpha2 = config.get('mask_alpha2', 1) + + self.step = 0 + self.total_step = 0 + self.frequency = config.get('frequency', 1) + self.initial_warmup = config.get('initial_warmup', 0.1) + self.final_warmup = config.get('final_warmup', 0.3) + self.initial_sparsity = config.get('initial_sparsity', 0.0) + self.final_sparsity = config.get('final_sparsity', 0.0) + + def before_run(self, trainer): + import torch + + from .utils import SparseLinear, convert_sparse_network + + if self.save_dir is None: + self.save_dir = trainer.work_dir + + if len(self.compress_module) == 0: + convert_sparse_network( + trainer.model, + pruning_method=self.pruning_method, + weight_rank=self.weight_rank, + weight_beta=self.weight_beta, + mask_rank=self.mask_rank, + mask_alpha1=self.mask_alpha1, + mask_alpha2=self.mask_alpha2, + logger=trainer.logger, + ) + else: + for cm in self.compress_module: + for name, module in trainer.model.named_modules(): + if name != cm: + continue + convert_sparse_network( + module, + pruning_method=self.pruning_method, + weight_rank=self.weight_rank, + weight_beta=self.weight_beta, + mask_rank=self.mask_rank, + mask_alpha1=self.mask_alpha1, + mask_alpha2=self.mask_alpha2, + logger=trainer.logger, + ) + + for i in range(len(trainer.optimizer.param_groups)): + new_train_params = [] + for param in trainer.optimizer.param_groups[i]['params']: + is_find = False + for name, module in trainer.model.named_modules(): + if isinstance(module, SparseLinear): + if torch.equal(param.half(), + module.weight.data.half()): + is_find = True + break + + if not is_find: + new_train_params.append(param) + + trainer.optimizer.param_groups[i]['params'] = new_train_params + + new_params = [] + for name, module in trainer.model.named_modules(): + if isinstance(module, SparseLinear): + new_params.extend( + [p for p in module.parameters() if p.requires_grad]) + + trainer.optimizer.add_param_group({'params': new_params}) + + self.total_step = trainer.iters_per_epoch * trainer._max_epochs + + def before_train_iter(self, trainer): + from .utils import schedule_sparsity_ratio, update_network_sparsity + + cur_sparsity = schedule_sparsity_ratio( + self.step, + self.total_step, + self.frequency, + self.initial_warmup, + self.final_warmup, + self.initial_sparsity, + self.final_sparsity, + ) + + update_network_sparsity(trainer.model, cur_sparsity) + + if is_master(): + trainer.logger.info( + f'Step[{self.step}/{self.total_step}] current sparsity ratio = {cur_sparsity}' + ) + + self.step += 1 + + def after_run(self, trainer): + from .utils import generate_sparse_model + + generate_sparse_model(trainer.model, logger=trainer.logger) + + self._save_checkpoint(trainer) + + 
def _save_checkpoint(self, trainer): + if is_master(): + trainer.logger.info('Saving checkpoint at final compress') + cur_save_name = os.path.join(self.save_dir, 'compress_model.pth') + save_checkpoint(trainer.model, cur_save_name, trainer.optimizer) diff --git a/modelscope/trainers/hooks/compression/utils.py b/modelscope/trainers/hooks/compression/utils.py new file mode 100644 index 00000000..59418201 --- /dev/null +++ b/modelscope/trainers/hooks/compression/utils.py @@ -0,0 +1,208 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import torch +import torch.nn as nn + +from modelscope.utils.torch_utils import is_master + + +class SparseBinarizer(torch.autograd.Function): + + @staticmethod + def forward(ctx, mask_scores, sparsity): + num_prune = int(mask_scores.numel() * sparsity) + prune_indices = torch.argsort(mask_scores.reshape(-1))[:num_prune] + mask = mask_scores.clone().fill_(1) + mask.reshape(-1)[prune_indices] = 0.0 + return mask + + @staticmethod + def backward(ctx, gradOutput): + return gradOutput, None + + +class SparseLinear(nn.Module): + """ + Fully Connected layer with on the fly adaptive mask. + """ + + def __init__( + self, + module, + pruning_method='pst', + weight_rank=8, + weight_beta=1.0, + mask_rank=8, + mask_alpha1=1.0, + mask_alpha2=1.0, + ): + super(SparseLinear, self).__init__() + self.module = module + out_features = self.module.weight.shape[0] + in_features = self.module.weight.shape[1] + + self.weight = self.module.weight + self.module.weight = None + self.module._parameters.pop('weight') + + self.pruning_method = pruning_method + + self.cur_sparsity = 0.0 + + if self.pruning_method == 'pst': + self.weight_rank = weight_rank + self.weight_beta = weight_beta + self.mask_rank = mask_rank + self.mask_alpha1 = mask_alpha1 + self.mask_alpha2 = mask_alpha2 + + # create trainable params + self.weight_U = nn.Parameter( + torch.randn(out_features, self.weight_rank).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.weight_V = nn.Parameter( + torch.zeros(self.weight_rank, in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + + self.mask_scores_A = nn.Parameter( + torch.randn(out_features, self.mask_rank).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_B = nn.Parameter( + torch.zeros(self.mask_rank, in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_R = nn.Parameter( + torch.zeros(out_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + self.mask_scores_C = nn.Parameter( + torch.zeros(in_features).to( + device=self.weight.device, dtype=self.weight.dtype)) + + self.weight.requires_grad = False + if self.module.bias is not None: + self.module.bias.requires_grad = False + + def forward(self, *inputs): + if self.pruning_method == 'pst': + weight = self.weight + self.weight_beta * self.weight_U @ self.weight_V + mask_scores = ( + weight.abs() + + self.mask_alpha1 * self.mask_scores_A @ self.mask_scores_B + + self.mask_alpha2 * (self.mask_scores_R.unsqueeze(1) + + self.mask_scores_C.unsqueeze(0))) + + mask = SparseBinarizer.apply(mask_scores, self.cur_sparsity) + masked_weight = mask * weight + + self.module.weight = masked_weight + return self.module(*inputs) + else: + return self.module(*inputs) + + def convert(self): + if self.pruning_method == 'pst': + weight = self.weight + self.weight_beta * self.weight_U @ self.weight_V + mask_scores = ( + weight.abs() + + self.mask_alpha1 * self.mask_scores_A @ self.mask_scores_B + + self.mask_alpha2 * 
(self.mask_scores_R.unsqueeze(1) + + self.mask_scores_C.unsqueeze(0))) + + mask = SparseBinarizer.apply(mask_scores, self.cur_sparsity) + + masked_weight = mask * weight + self.module.weight = nn.Parameter(masked_weight.data) + + +def _setattr(model, name, module): + name_list = name.split('.') + for name in name_list[:-1]: + model = getattr(model, name) + setattr(model, name_list[-1], module) + + +def convert_sparse_network( + model, + pruning_method, + weight_rank, + weight_beta, + mask_rank, + mask_alpha1, + mask_alpha2, + logger=None, +): + compress_module = [nn.Linear] + try: + from megatron import mpu + compress_module.extend( + [mpu.RowParallelLinear, mpu.ColumnParallelLinear]) + except ImportError: + pass + + for name, module in model.named_modules(): + if type(module) in compress_module: + new_module = SparseLinear( + module, + pruning_method, + weight_rank, + weight_beta, + mask_rank, + mask_alpha1, + mask_alpha2, + ) + + # replace original module by new sparse module + _setattr(model, name, new_module) + + if is_master(): + if logger: + logger.info(f'convert {name} to sparse module.') + else: + print(f'convert {name} to sparse module.') + + +def update_network_sparsity(model, sparsity): + for name, module in model.named_modules(): + if isinstance(module, SparseLinear): + module.cur_sparsity = sparsity + + +def schedule_sparsity_ratio( + step, + total_step, + frequency, + initial_warmup, + final_warmup, + initial_sparsity, + final_sparsity, +): + if step <= initial_warmup * total_step: + sparsity = initial_sparsity + elif step > (total_step - final_warmup * total_step): + sparsity = final_sparsity + else: + spars_warmup_steps = initial_warmup * total_step + spars_schedu_steps = (final_warmup + initial_warmup) * total_step + step = (step - spars_warmup_steps) // frequency * frequency + mul_coeff = 1 - step / (total_step - spars_schedu_steps) + sparsity = final_sparsity + (initial_sparsity - final_sparsity) * ( + mul_coeff**3) + return sparsity + + +def generate_sparse_model(model, logger=None): + # generate sparse weight for saving + for name, module in model.named_modules(): + if isinstance(module, SparseLinear): + module.convert() + + _setattr(model, name, module.module) + + if is_master(): + if logger: + logger.info(f'convert {name} weight to sparse weight, \ + sparsity ratio={torch.mean(1.0*(module.module.weight==0)).item()}.' + ) + else: + print(f'convert {name} weight to sparse, \ + sparsity ratio={torch.mean(1.0*(module.module.weight==0)).item()}.' + ) diff --git a/tests/trainers/hooks/compression/__init__.py b/tests/trainers/hooks/compression/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/trainers/hooks/compression/test_sparsity_hook.py b/tests/trainers/hooks/compression/test_sparsity_hook.py new file mode 100644 index 00000000..4af4dcdb --- /dev/null +++ b/tests/trainers/hooks/compression/test_sparsity_hook.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
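+#
+# Schedule sketch (illustrative): with the hook defaults (initial_warmup=0.1,
+# final_warmup=0.3, initial_sparsity=0.0) and the config used below
+# (final_sparsity=0.9, frequency=1), schedule_sparsity_ratio keeps the ratio
+# at 0.0 for roughly the first 10% of total steps, holds it at 0.9 over
+# roughly the last 30%, and ramps it cubically in between:
+#
+#     from modelscope.trainers.hooks.compression.utils import (
+#         schedule_sparsity_ratio)
+#
+#     for step in range(0, 25, 5):
+#         print(step, schedule_sparsity_ratio(step, 25, 1, 0.1, 0.3, 0.0, 0.9))
+#
+# The assertion at the end of the test checks that 90% of the linear layer's
+# weights are exactly zero once the hook has finished.
+#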
+import os +import shutil +import tempfile +import unittest + +import json +import numpy as np +import torch +from torch import nn +from torch.optim import SGD +from torch.optim.lr_scheduler import MultiStepLR + +from modelscope.metainfo import Trainers +from modelscope.models.base import Model +from modelscope.trainers import build_trainer +from modelscope.utils.constant import ModelFile, TrainerStages +from modelscope.utils.test_utils import create_dummy_test_dataset + +dummy_dataset = create_dummy_test_dataset( + np.random.random(size=(5, )), np.random.randint(0, 4, (1, )), 10) + + +class DummyModel(nn.Module, Model): + + def __init__(self): + super().__init__() + self.linear = nn.Linear(5, 10) + self.bn = nn.BatchNorm1d(10) + + def forward(self, feat, labels): + x = self.linear(feat) + + x = self.bn(x) + loss = torch.sum(x) + return dict(logits=x, loss=loss) + + +class SparsityHookTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir) + + def test_sparsity_hook(self): + json_cfg = { + 'task': 'image_classification', + 'train': { + 'work_dir': + self.tmp_dir, + 'dataloader': { + 'batch_size_per_gpu': 2, + 'workers_per_gpu': 1 + }, + 'hooks': [{ + 'type': 'SparsityHook', + 'pruning_method': 'pst', + 'config': { + 'weight_rank': 1, + 'mask_rank': 1, + 'final_sparsity': 0.9, + 'frequency': 1, + }, + }], + }, + } + + config_path = os.path.join(self.tmp_dir, ModelFile.CONFIGURATION) + with open(config_path, 'w') as f: + json.dump(json_cfg, f) + + model = DummyModel() + optimizer = SGD(model.parameters(), lr=0.01) + lr_scheduler = MultiStepLR(optimizer, milestones=[2, 4]) + trainer_name = Trainers.default + kwargs = dict( + cfg_file=config_path, + model=model, + train_dataset=dummy_dataset, + optimizers=(optimizer, lr_scheduler), + max_epochs=5, + device='cpu', + ) + + trainer = build_trainer(trainer_name, kwargs) + train_dataloader = trainer._build_dataloader_with_dataset( + trainer.train_dataset, **trainer.cfg.train.get('dataloader', {})) + trainer.register_optimizers_hook() + trainer.register_hook_from_cfg(trainer.cfg.train.hooks) + trainer.train_dataloader = train_dataloader + trainer.data_loader = train_dataloader + trainer.invoke_hook(TrainerStages.before_run) + for i in range(trainer._epoch, trainer._max_epochs): + trainer.invoke_hook(TrainerStages.before_train_epoch) + for _, data_batch in enumerate(train_dataloader): + trainer.invoke_hook(TrainerStages.before_train_iter) + trainer.train_step(trainer.model, data_batch) + trainer.invoke_hook(TrainerStages.after_train_iter) + trainer.invoke_hook(TrainerStages.after_train_epoch) + trainer.invoke_hook(TrainerStages.after_run) + + self.assertEqual( + torch.mean(1.0 * (trainer.model.linear.weight == 0)), 0.9) + + +if __name__ == '__main__': + unittest.main() From 67d6fa001da5cb58b81dcb68968355995f8e586f Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Tue, 11 Oct 2022 17:17:51 +0800 Subject: [PATCH 06/57] [to #42322933] unify keys forbody_3d_keypoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 统一关键点检测输出key的名字 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10359335 --- modelscope/outputs.py | 4 ++-- modelscope/pipelines/cv/body_3d_keypoints_pipeline.py | 4 ++-- tests/pipelines/test_body_3d_keypoints.py | 2 +- 3 files 
changed, 5 insertions(+), 5 deletions(-) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index c16e256e..331f4816 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -222,7 +222,7 @@ TASK_OUTPUTS = { # 3D human body keypoints detection result for single sample # { - # "poses": [ # 3d pose coordinate in camera coordinate + # "keypoints": [ # 3d pose coordinate in camera coordinate # [[x, y, z]*17], # joints of per image # [[x, y, z]*17], # ... @@ -236,7 +236,7 @@ TASK_OUTPUTS = { # and is only avaialbe when the "render" option is enabled. # } Tasks.body_3d_keypoints: - [OutputKeys.POSES, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], + [OutputKeys.KEYPOINTS, OutputKeys.TIMESTAMPS, OutputKeys.OUTPUT_VIDEO], # 2D hand keypoints result for single sample # { diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index b0faa1e0..c3f4e8c1 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -180,7 +180,7 @@ class Body3DKeypointsPipeline(Pipeline): return res def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: - res = {OutputKeys.POSES: [], OutputKeys.TIMESTAMPS: []} + res = {OutputKeys.KEYPOINTS: [], OutputKeys.TIMESTAMPS: []} if not input['success']: pass @@ -197,7 +197,7 @@ class Body3DKeypointsPipeline(Pipeline): self.render_prediction(pred_3d_pose, output_video_path) res[OutputKeys.OUTPUT_VIDEO] = output_video_path - res[OutputKeys.POSES] = pred_3d_pose + res[OutputKeys.KEYPOINTS] = pred_3d_pose res[OutputKeys.TIMESTAMPS] = self.timestamps return res diff --git a/tests/pipelines/test_body_3d_keypoints.py b/tests/pipelines/test_body_3d_keypoints.py index 6f27f12d..6e671d2e 100644 --- a/tests/pipelines/test_body_3d_keypoints.py +++ b/tests/pipelines/test_body_3d_keypoints.py @@ -21,7 +21,7 @@ class Body3DKeypointsTest(unittest.TestCase, DemoCompatibilityCheck): def pipeline_inference(self, pipeline: Pipeline, pipeline_input): output = pipeline(pipeline_input, output_video='./result.mp4') - poses = np.array(output[OutputKeys.POSES]) + poses = np.array(output[OutputKeys.KEYPOINTS]) print(f'result 3d points shape {poses.shape}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') From e240edea7ebbd7beb66246fb18b071a6ba0a65c0 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Tue, 11 Oct 2022 17:20:11 +0800 Subject: [PATCH 07/57] [to #42322933]t5 bug fixex --- modelscope/preprocessors/nlp/nlp_base.py | 4 +--- tests/pipelines/test_text2text_generation.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 6b559de9..a9be0cb0 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -417,14 +417,12 @@ class Text2TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): tokenizer=None, mode=ModeKeys.INFERENCE, **kwargs): - self.tokenizer = self.build_tokenizer( - model_dir) if tokenizer is None else tokenizer kwargs['truncation'] = kwargs.get('truncation', 'do_not_truncate') kwargs['padding'] = kwargs.get('padding', False) kwargs['return_token_type_ids'] = kwargs.get('return_token_type_ids', False) kwargs['max_length'] = kwargs.pop('sequence_length', 128) - super().__init__(model_dir, pair=False, mode=mode, **kwargs) + super().__init__(model_dir, mode=mode, **kwargs) def __call__(self, data: Union[Dict, str]) -> Dict[str, Any]: 
text_a, _, _ = self.parse_text_and_label(data) diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index a39562f5..2506547e 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -18,7 +18,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): self.model_id = 'damo/t5-cn-base-test' self.input = '中国的首都位于。' - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_T5(self): cache_path = snapshot_download(self.model_id) model = T5ForConditionalGeneration(cache_path) @@ -40,7 +40,7 @@ class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): preprocessor=preprocessor) print(pipeline_ins(self.input)) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_pipeline_with_model_id(self): pipeline_ins = pipeline( task=Tasks.text2text_generation, model=self.model_id) From 65be443e982755a96915b363af6b6ad2dfe5c827 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 11 Oct 2022 17:22:58 +0800 Subject: [PATCH 08/57] [to #41669377] Add more models to test in tts UT Link https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10360754#tab=detail Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10360754 --- .../audio/tts/models/datasets/__init__.py | 0 tests/pipelines/test_text_to_speech.py | 70 +++++++++++++++---- 2 files changed, 58 insertions(+), 12 deletions(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index f659e59b..0caf1c84 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -27,21 +27,67 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, def setUp(self) -> None: self.task = Tasks.text_to_speech - self.model_id = 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k' + zhcn_text = '今天北京天气怎么样' + en_text = 'How is the weather in Beijing?' 
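+        # The cases below pair each voice with its hub model id: per-voice
+        # models follow damo/speech_sambert-hifigan_tts_<voice>_<lang>_16k
+        # (e.g. damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k), while
+        # the language-level models drop the voice segment, e.g.
+        # damo/speech_sambert-hifigan_tts_zh-cn_16k.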
+ zhcn_voice = ['zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo'] + enus_voice = ['andy', 'annie'] + engb_voice = ['luca', 'luna'] + self.tts_test_cases = [] + for voice in zhcn_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'zh-cn') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': zhcn_text + }) + for voice in enus_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'en-us') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': en_text + }) + for voice in engb_voice: + model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, + 'en-gb') + self.tts_test_cases.append({ + 'voice': voice, + 'model_id': model_id, + 'text': en_text + }) + zhcn_model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k' + enus_model_id = 'damo/speech_sambert-hifigan_tts_en-us_16k' + engb_model_id = 'damo/speech_sambert-hifigan_tts_en-gb_16k' + self.tts_test_cases.append({ + 'voice': 'zhcn', + 'model_id': zhcn_model_id, + 'text': zhcn_text + }) + self.tts_test_cases.append({ + 'voice': 'enus', + 'model_id': enus_model_id, + 'text': en_text + }) + self.tts_test_cases.append({ + 'voice': 'engb', + 'model_id': engb_model_id, + 'text': en_text + }) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_pipeline(self): - text = '今天北京天气怎么样?' - voice = 'zhitian_emo' - - model = Model.from_pretrained( - model_name_or_path=self.model_id, revision='pytorch_am') - sambert_hifigan_tts = pipeline(task=self.task, model=model) - self.assertTrue(sambert_hifigan_tts is not None) - output = sambert_hifigan_tts(input=text, voice=voice) - self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) - pcm = output[OutputKeys.OUTPUT_PCM] - write('output.wav', 16000, pcm) + for case in self.tts_test_cases: + logger.info('test %s' % case['voice']) + model = Model.from_pretrained( + model_name_or_path=case['model_id'], revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) + self.assertTrue(sambert_hifigan_tts is not None) + output = sambert_hifigan_tts(input=case['text']) + self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) + pcm = output[OutputKeys.OUTPUT_PCM] + write('output_%s.wav' % case['voice'], 16000, pcm) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 4c993638046a051a624375241ae5271509ebb510 Mon Sep 17 00:00:00 2001 From: "james.wjg" Date: Tue, 11 Oct 2022 17:24:46 +0800 Subject: [PATCH 09/57] =?UTF-8?q?[to=20#42322933]video=20summarization=20?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20license=20&=20header;=20=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=20output=20for=20demo=20service?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit video summarization: 1. 添加 license & header; 2. 
修改 output for demo service Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10260946 --- .../metrics/video_summarization_metric.py | 3 ++ .../models/cv/video_summarization/__init__.py | 23 +++++++++- .../cv/video_summarization/base_model.py | 3 +- .../cv/video_summarization/kts/cpd_auto.py | 3 +- .../cv/video_summarization/kts/cpd_nonlin.py | 3 +- .../models/cv/video_summarization/pgl_sum.py | 3 +- .../cv/video_summarization/summarizer.py | 46 ++++++++++++++++++- .../video_summarization_dataset.py | 5 +- modelscope/outputs.py | 16 +++++++ .../cv/video_summarization_pipeline.py | 13 ++++-- tests/pipelines/test_video_summarization.py | 3 -- 11 files changed, 107 insertions(+), 14 deletions(-) diff --git a/modelscope/metrics/video_summarization_metric.py b/modelscope/metrics/video_summarization_metric.py index d1867600..40580382 100644 --- a/modelscope/metrics/video_summarization_metric.py +++ b/modelscope/metrics/video_summarization_metric.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + from typing import Dict import numpy as np diff --git a/modelscope/models/cv/video_summarization/__init__.py b/modelscope/models/cv/video_summarization/__init__.py index 064110f7..15ad61b4 100644 --- a/modelscope/models/cv/video_summarization/__init__.py +++ b/modelscope/models/cv/video_summarization/__init__.py @@ -1 +1,22 @@ -from .summarizer import PGLVideoSummarization +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .summarizer import (PGLVideoSummarization, summary_format) + +else: + _import_structure = { + 'summarizer': ['PGLVideoSummarization', 'summary_format'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/video_summarization/base_model.py b/modelscope/models/cv/video_summarization/base_model.py index 670da251..912ba68d 100644 --- a/modelscope/models/cv/video_summarization/base_model.py +++ b/modelscope/models/cv/video_summarization/base_model.py @@ -1,4 +1,5 @@ -# The implementation is based on pytorch-caffe-models, available at https://github.com/crowsonkb/pytorch-caffe-models. +# Part of the implementation is borrowed and modified from pytorch-caffe-models, +# publicly available at https://github.com/crowsonkb/pytorch-caffe-models import cv2 import numpy as np diff --git a/modelscope/models/cv/video_summarization/kts/cpd_auto.py b/modelscope/models/cv/video_summarization/kts/cpd_auto.py index a794ca26..58281df8 100644 --- a/modelscope/models/cv/video_summarization/kts/cpd_auto.py +++ b/modelscope/models/cv/video_summarization/kts/cpd_auto.py @@ -1,4 +1,5 @@ -# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS. +# Part of the implementation is borrowed and modified from KTS, +# publicly available at https://github.com/TatsuyaShirakawa/KTS import numpy as np diff --git a/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py index ef2eb6ef..55e279e9 100644 --- a/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py +++ b/modelscope/models/cv/video_summarization/kts/cpd_nonlin.py @@ -1,4 +1,5 @@ -# The implementation is based on KTS, available at https://github.com/TatsuyaShirakawa/KTS. 
+# Part of the implementation is borrowed and modified from KTS, +# publicly available at https://github.com/TatsuyaShirakawa/KTS import numpy as np diff --git a/modelscope/models/cv/video_summarization/pgl_sum.py b/modelscope/models/cv/video_summarization/pgl_sum.py index ab3010c9..2d27501d 100644 --- a/modelscope/models/cv/video_summarization/pgl_sum.py +++ b/modelscope/models/cv/video_summarization/pgl_sum.py @@ -1,4 +1,5 @@ -# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM. +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM import math diff --git a/modelscope/models/cv/video_summarization/summarizer.py b/modelscope/models/cv/video_summarization/summarizer.py index c95da025..75251989 100644 --- a/modelscope/models/cv/video_summarization/summarizer.py +++ b/modelscope/models/cv/video_summarization/summarizer.py @@ -1,4 +1,5 @@ -# The implementation is based on PGL-SUM, available at https://github.com/e-apostolidis/PGL-SUM. +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM import os.path as osp from copy import deepcopy @@ -23,7 +24,8 @@ logger = get_logger() def get_change_points(video_feat, n_frame): video_feat = np.array(video_feat, np.float32) K = np.dot(video_feat, video_feat.T) - change_points, _ = cpd_auto(K, ncp=120, vmax=2.2 / 4.0, lmin=1) + change_points, _ = cpd_auto( + K, ncp=min(K.shape[0] - 1, 120), vmax=2.2 / 4.0, lmin=1) change_points = change_points * 15 change_points = np.concatenate(([0], change_points, [n_frame - 1])) @@ -135,6 +137,46 @@ def generate_summary(all_shot_bound, all_scores, all_nframes, all_positions): return all_summaries +def transform_time(seconds): + m, s = divmod(seconds, 60) + h, m = divmod(m, 60) + time = '%02d:%02d:%06.3f' % (h, m, s) + return time + + +def summary_format(summary, fps): + frames_list = [] + start_frame = -1 + end_frame = -1 + is_summary_frame = False + for i, idx in enumerate(summary): + if idx: + if is_summary_frame is False: + start_frame = i + is_summary_frame = True + else: + if is_summary_frame: + end_frame = i - 1 + frames_list.append([start_frame, end_frame]) + is_summary_frame = False + + if is_summary_frame and summary[-1] == 1: + end_frame = len(frame_idxes) - 1 + frames_list.append([start_frame, end_frame]) + + output = [] + for seg in frames_list: + output.append({ + 'frame': + seg, + 'timestamps': [ + transform_time(seg[0] / float(fps)), + transform_time(seg[1] / float(fps)) + ] + }) + return output + + @MODELS.register_module( Tasks.video_summarization, module_name=Models.video_summarization) class PGLVideoSummarization(TorchModel): diff --git a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py index 89deb7ba..34eb0450 100644 --- a/modelscope/msdatasets/task_datasets/video_summarization_dataset.py +++ b/modelscope/msdatasets/task_datasets/video_summarization_dataset.py @@ -1,3 +1,6 @@ +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + import os import h5py @@ -15,7 +18,7 @@ class VideoSummarizationDataset(TorchTaskDataset): self.mode = mode self.data_filename = os.path.join(root_dir, opt.dataset_file) self.split_filename = os.path.join(root_dir, opt.split_file) - self.split_index = opt.split_index # it represents the current split 
(varies from 0 to 4) + self.split_index = opt.split_index hdf = h5py.File(self.data_filename, 'r') self.list_frame_features, self.list_gtscores = [], [] self.list_user_summary = [] diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 331f4816..07a14191 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -337,6 +337,22 @@ TASK_OUTPUTS = { OutputKeys.SCENE_META_LIST ], + # video summarization result for a single video + # { + # "output": + # [ + # { + # "frame": [start_frame, end_frame] + # "timestamps": [start_time, end_time] + # }, + # { + # "frame": [start_frame, end_frame] + # "timestamps": [start_time, end_time] + # } + # ] + # } + Tasks.video_summarization: [OutputKeys.OUTPUT], + # ============ nlp tasks =================== # text classification result for single sample diff --git a/modelscope/pipelines/cv/video_summarization_pipeline.py b/modelscope/pipelines/cv/video_summarization_pipeline.py index 25ea1e7c..e4fe206d 100644 --- a/modelscope/pipelines/cv/video_summarization_pipeline.py +++ b/modelscope/pipelines/cv/video_summarization_pipeline.py @@ -1,4 +1,6 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. +# Part of the implementation is borrowed and modified from PGL-SUM, +# publicly available at https://github.com/e-apostolidis/PGL-SUM + import os.path as osp from typing import Any, Dict @@ -8,7 +10,8 @@ import torch from tqdm import tqdm from modelscope.metainfo import Pipelines -from modelscope.models.cv.video_summarization import PGLVideoSummarization +from modelscope.models.cv.video_summarization import (PGLVideoSummarization, + summary_format) from modelscope.models.cv.video_summarization.base_model import bvlc_googlenet from modelscope.models.cv.video_summarization.summarizer import ( generate_summary, get_change_points) @@ -57,6 +60,8 @@ class VideoSummarizationPipeline(Pipeline): frames = [] picks = [] cap = cv2.VideoCapture(input) + self.fps = cap.get(cv2.CAP_PROP_FPS) + self.frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT) frame_idx = 0 while (cap.isOpened()): ret, frame = cap.read() @@ -89,7 +94,9 @@ class VideoSummarizationPipeline(Pipeline): summary = self.inference(frame_features, input['n_frame'], input['picks'], change_points) - return {OutputKeys.OUTPUT: summary} + output = summary_format(summary, self.fps) + + return {OutputKeys.OUTPUT: output} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/tests/pipelines/test_video_summarization.py b/tests/pipelines/test_video_summarization.py index 6dcc31e9..1f965c53 100644 --- a/tests/pipelines/test_video_summarization.py +++ b/tests/pipelines/test_video_summarization.py @@ -3,7 +3,6 @@ import unittest from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks -from modelscope.utils.cv.image_utils import show_video_summarization_result from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -22,8 +21,6 @@ class VideoSummarizationTest(unittest.TestCase, DemoCompatibilityCheck): result = summarization_pipeline(video_path) print(f'video summarization output: \n{result}.') - show_video_summarization_result(video_path, result, - './summarization_result.avi') @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_modelhub_default_model(self): From 02c913a0fee0bbb0a6ade9086c9c142f508ab3e0 Mon Sep 17 00:00:00 2001 From: "suluyan.sly" Date: Tue, 11 Oct 2022 17:26:43 +0800 Subject: [PATCH 10/57] [to #42322933] add plug doc string Link: 
https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10337105 --- .../models/nlp/plug/configuration_plug.py | 165 +++++++----- .../models/nlp/plug/distributed_plug.py | 44 +++- modelscope/models/nlp/plug/modeling_plug.py | 243 ++++++++---------- 3 files changed, 240 insertions(+), 212 deletions(-) diff --git a/modelscope/models/nlp/plug/configuration_plug.py b/modelscope/models/nlp/plug/configuration_plug.py index 64807392..c3a526a9 100644 --- a/modelscope/models/nlp/plug/configuration_plug.py +++ b/modelscope/models/nlp/plug/configuration_plug.py @@ -40,8 +40,6 @@ class PlugNLUConfig(PretrainedConfig): max_position_embeddings=2048, type_vocab_size=3, initializer_range=0.00707, - deep_init=False, - deepspeed=False, lr_decay_style='linear', weight_decay=1e-2, clip_grad=1.0, @@ -53,20 +51,7 @@ class PlugNLUConfig(PretrainedConfig): fp32_tokentypes=False, layernorm_epsilon=1e-5, dec_hidden_layers=6, - pruning_method=None, - pruning_mask_init='constant', - pruning_mask_scale=0.0, - pruning_initial_threshold=1.0, - pruning_final_threshold=0.01, - pruning_initial_warmup=1, - pruning_final_warmup=20, - pruning_module='decoder', - pruning_decay_step=50, - pruning_decay_type='exp', - ft_module=None, attn_separate=False, - LR_weight_rank=8, - LR_mask_rank=8, **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) @@ -82,8 +67,6 @@ class PlugNLUConfig(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.deep_init = deep_init - self.deepspeed = deepspeed self.lr_decay_style = lr_decay_style self.weight_decay = weight_decay self.clip_grad = clip_grad @@ -95,20 +78,7 @@ class PlugNLUConfig(PretrainedConfig): self.layernorm_epsilon = layernorm_epsilon self.fp32_tokentypes = fp32_tokentypes self.dec_hidden_layers = dec_hidden_layers - self.pruning_method = pruning_method - self.pruning_mask_init = pruning_mask_init - self.pruning_mask_scale = pruning_mask_scale - self.pruning_module = pruning_module - self.pruning_initial_threshold = pruning_initial_threshold - self.pruning_final_threshold = pruning_final_threshold - self.pruning_initial_warmup = pruning_initial_warmup - self.pruning_final_warmup = pruning_final_warmup - self.pruning_decay_step = pruning_decay_step - self.pruning_decay_type = pruning_decay_type - self.ft_module = ft_module self.attn_separate = attn_separate - self.LR_weight_rank = LR_weight_rank - self.LR_mask_rank = LR_mask_rank @classmethod def from_dict(cls, json_object): @@ -148,47 +118,115 @@ class PlugNLUConfig(PretrainedConfig): class PlugNLGConfig(PlugNLUConfig): + """ + This is the configuration class to store the configuration of a [`PlugModel`]. It is used to instantiate a + PLUG understanding model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the PLUG + [PLUG](https://modelscope.cn/models/damo/nlp_plug_text-generation_27B/summary) architecture. + + Configuration objects inherit from [`PlugNLUConfig`] and can be used to control the model outputs. Read the + documentation from [`PlugNLUConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 21504): + Padded vocabulary size of the PLUG model for vocab tensor parallel. Defines the number of different tokens + that can be represented by the `inputs_ids` passed when calling [`PlugModel`]. 
+ original_vocab_size (`int`, *optional*, defaults to 21128): + True vocabulary size of the PLUG model. Defines the number of different tokens that can be represented. + hidden_size (`int`, *optional*, defaults to 8192): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + dec_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 128): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 32768): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the Transformer Attention. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 3): + The vocabulary size of the `token_type_ids` passed when calling [`PlugModel`]. + initializer_range (`float`, *optional*, defaults to 0.00707): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + lr_decay_style (`str`, *optional*, defaults to 'linear'): + The decay style of learning rate during fine-tunining. If string, `"linear"`, `"cosine"`, `"exponential"`, + `"constant"`, `"None"` are supported. + weight_decay (`float`, *optional*, defaults to 1e-2): + Decoupled weight decay to apply. + clip_grad (`float`, *optional*, defaults to 1.0): + Maximum gradient norm for gradient clipping. + warmup (`float`, *optional*, defaults to 0.01): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + pre_ln (`boolean`, *optional*, defaults to `True`): + Whether or not to apply LayerNorm to the input instead of the output in the blocks. + fp16 (`boolean`, *optional*, defaults to `True`): + Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. + fp32_layernorm (`boolean`, *optional*, defaults to `True`): + Whether to use fp32 32-bit precision LayerNorm training while the argument `fp16` set to `True`. + fp32_embedding (`boolean`, *optional*, defaults to `False`): + Whether to use fp32 32-bit precision Embedding training while the argument `fp16` set to `True`. + fp32_tokentypes (`boolean`, *optional*, defaults to `False`): + Whether to use fp32 32-bit precision token types training while the argument `fp16` set to `True`. + layernorm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + attn_separate (`boolean`, *optional*, defaults to `False`): + Whether or not to separate query-key-value to query, key, value in the Attention. + + Example: + + ```python + >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. 
The example given + >>> # here only initializes a slice of the model on a single GPU. + >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model. + >>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel + + >>> # Initializing a Plug configuration + >>> configuration = PlugNLGConfig() + + >>> # Initializing a model from the configuration + >>> model = PlugModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = 'plugNLG' def __init__(self, vocab_size=21504, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, + original_vocab_size=21128, + hidden_size=8192, + num_hidden_layers=24, + dec_hidden_layers=6, + num_attention_heads=128, + intermediate_size=32768, hidden_act='gelu', hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, + max_position_embeddings=2048, + type_vocab_size=3, initializer_range=0.00707, - deep_init=False, - deepspeed=False, lr_decay_style='linear', weight_decay=1e-2, clip_grad=1.0, warmup=0.01, - pre_ln=False, - fp16=False, - fp32_layernorm=False, + pre_ln=True, + fp16=True, + fp32_layernorm=True, fp32_embedding=False, fp32_tokentypes=False, - layernorm_epsilon=1e-12, - dec_hidden_layers=6, - pruning_method=None, - pruning_mask_init='constant', - pruning_mask_scale=0.0, - pruning_initial_threshold=1.0, - pruning_final_threshold=0.01, - pruning_initial_warmup=1, - pruning_final_warmup=20, - pruning_module='decoder', - pruning_decay_step=50, - pruning_decay_type='exp', - ft_module=None, + layernorm_epsilon=1e-5, attn_separate=False, - LR_weight_rank=8, - LR_mask_rank=8, **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) @@ -203,8 +241,6 @@ class PlugNLGConfig(PlugNLUConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.deep_init = deep_init - self.deepspeed = deepspeed self.lr_decay_style = lr_decay_style self.weight_decay = weight_decay self.clip_grad = clip_grad @@ -216,17 +252,4 @@ class PlugNLGConfig(PlugNLUConfig): self.layernorm_epsilon = layernorm_epsilon self.fp32_tokentypes = fp32_tokentypes self.dec_hidden_layers = dec_hidden_layers - self.pruning_method = pruning_method - self.pruning_mask_init = pruning_mask_init - self.pruning_mask_scale = pruning_mask_scale - self.pruning_module = pruning_module - self.pruning_initial_threshold = pruning_initial_threshold - self.pruning_final_threshold = pruning_final_threshold - self.pruning_initial_warmup = pruning_initial_warmup - self.pruning_final_warmup = pruning_final_warmup - self.pruning_decay_step = pruning_decay_step - self.pruning_decay_type = pruning_decay_type - self.ft_module = ft_module self.attn_separate = attn_separate - self.LR_weight_rank = LR_weight_rank - self.LR_mask_rank = LR_mask_rank diff --git a/modelscope/models/nlp/plug/distributed_plug.py b/modelscope/models/nlp/plug/distributed_plug.py index 2992f595..06009ba1 100644 --- a/modelscope/models/nlp/plug/distributed_plug.py +++ b/modelscope/models/nlp/plug/distributed_plug.py @@ -20,6 +20,48 @@ logger = get_logger(__name__) class DistributedPlug(TorchModel): + """ + The wapper class of PLUG Model to initialize parallel environment, load model weights, generate sentences. + Parameters: + model_dir (`str`, *required*): + Path to model damo/nlp_plug_text-generation_27B. 
+ The model structure in model_dir should be like this: + model_dir + |_ config.json + |_ configuration.json + |_ ds_zero-offload_10B_config.json + |_ vocab.txt + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + rank (`int`, *required*): + Used to identify different GPUs in a tensor parallel environment. eg. The rank of GPU #0 is 0, and the + model file `mp_rank_00_model_states.pt` will be loaded on this GPU. + world_size (`int`, *required*, defaults to 8): + The parallel size in total. + model_parallel_size (`int`, *required*, defaults to 8): + The parallel size of model(tensor parallel). + master_ip (`str`, *required*): + The master IP, can usually be set to `"127.0.0.1"`, used as part of + [`~torch.distributed.init_process_group`] method parameter `init_method`. + `init_method` = `"tcp://{master_ip}:{master_port}"` + master_port (`str`, *required*): + The master port, can usually be set to `"29500"`, used as part of + [`~torch.distributed.init_process_group`] method parameter `init_method`. + `init_method` = `"tcp://{master_ip}:{master_port}"` + seed (`int`, *optional*, defaults to 42): + Random seed to control sampling. + """ def __init__(self, model_dir, rank, **kwargs): super().__init__(model_dir, **kwargs) @@ -29,7 +71,7 @@ class DistributedPlug(TorchModel): initialize_distributed(rank, mpu, kwargs['world_size'], kwargs['model_parallel_size'], kwargs['master_ip'], kwargs['master_port']) - seed = 0 if 'seed' not in kwargs else kwargs['seed'] + seed = 42 if 'seed' not in kwargs else kwargs['seed'] set_random_seed_mpu(seed) self.iteration = 0 self.dist_model = self.initialize_model(path_load_tag='model') diff --git a/modelscope/models/nlp/plug/modeling_plug.py b/modelscope/models/nlp/plug/modeling_plug.py index 9d2bb14f..df00006b 100644 --- a/modelscope/models/nlp/plug/modeling_plug.py +++ b/modelscope/models/nlp/plug/modeling_plug.py @@ -152,15 +152,7 @@ class BertSelfOutput(nn.Module): bias=True, input_is_parallel=True, stride=1, - init_method=init_method, - pruning_method=config.pruning_method if config.pruning_module in [ - 'all', 'encoder', 'encoder_self', 'encoder_selfvo', - 'encoder_selfo' - ] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + init_method=init_method) self.fp32_layernorm = config.fp32_layernorm if not config.pre_ln: self.LayerNorm = BertLayerNorm( @@ -173,12 +165,8 @@ class BertSelfOutput(nn.Module): self, hidden_states, input_tensor, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) ln_input = hidden_states + input_tensor if self.LayerNorm is not None: @@ -210,20 +198,13 @@ class BertAttention(nn.Module): output_parallel=True, init_method=normal_init_method( mean=0.0, std=config.initializer_range), - separate=config.attn_separate, - pruning_method=config.pruning_method, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - 
pruning_module=config.pruning_module, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + separate=config.attn_separate) self.output = BertSelfOutput(config) def forward( self, input_tensor, attention_mask, - pruning_threshold=None, ): if self.LayerNorm is not None: ln_input = input_tensor @@ -236,20 +217,16 @@ class BertAttention(nn.Module): self_output = self.self( ln_output, attention_mask, - pruning_threshold=pruning_threshold, ) else: self_output = self.self( input_tensor, attention_mask, - pruning_threshold=pruning_threshold, ) - output_pruning_threshold = pruning_threshold attention_output = self.output( self_output, input_tensor, - pruning_threshold=output_pruning_threshold, ) return attention_output @@ -265,25 +242,15 @@ class BertIntermediate(nn.Module): gather_output=False, stride=1, init_method=normal_init_method( - mean=0.0, std=config.initializer_range), - pruning_method=config.pruning_method if config.pruning_module - in ['all', 'encoder', 'encoder_ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + mean=0.0, std=config.initializer_range)) self.intermediate_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act def forward( self, hidden_states, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) return hidden_states @@ -306,13 +273,7 @@ class BertOutput(nn.Module): bias=True, input_is_parallel=True, stride=1, - init_method=init_method, - pruning_method=config.pruning_method if config.pruning_module - in ['all', 'encoder', 'encoder_ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank) + init_method=init_method) self.fp32_layernorm = config.fp32_layernorm if not config.pre_ln: self.LayerNorm = BertLayerNorm( @@ -325,12 +286,8 @@ class BertOutput(nn.Module): self, hidden_states, input_tensor, - pruning_threshold=None, ): - hidden_states = self.dense( - hidden_states, - pruning_threshold=pruning_threshold, - ) + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) ln_input = hidden_states + input_tensor if self.LayerNorm is not None: @@ -359,14 +316,8 @@ class BertLayer(nn.Module): else: self.LayerNorm = None - def forward( - self, - hidden_states, - attention_mask, - pruning_threshold=None, - ): - attention_output = self.attention( - hidden_states, attention_mask, pruning_threshold=pruning_threshold) + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) if self.LayerNorm is not None: ln_input = attention_output previous_type = attention_output.type() @@ -375,15 +326,10 @@ class BertLayer(nn.Module): ln_output = self.LayerNorm(ln_input) if self.fp32_layernorm: ln_output = ln_output.type(previous_type) - intermediate_output = self.intermediate( - ln_output, pruning_threshold=pruning_threshold) + intermediate_output = self.intermediate(ln_output) else: - intermediate_output = self.intermediate( - attention_output, pruning_threshold=pruning_threshold) - layer_output = self.output( - intermediate_output, - attention_output, - pruning_threshold=pruning_threshold) + intermediate_output = 
self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) return layer_output @@ -407,7 +353,6 @@ class BertEncoder(nn.Module): output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, - pruning_threshold=None, ): all_encoder_layers = [] @@ -417,8 +362,7 @@ class BertEncoder(nn.Module): layers = self.layer[start:end] x_ = inputs[0] for layer in layers: - x_ = layer( - x_, inputs[1], pruning_threshold=pruning_threshold) + x_ = layer(x_, inputs[1]) return x_ return custom_forward @@ -654,7 +598,6 @@ class BertModel(PreTrainedBertModel): output_all_encoded_layers=True, checkpoint_activations=False, detach_index=-1, - pruning_threshold=None, ): if attention_mask is None: attention_mask = torch.ones_like(input_ids) @@ -683,8 +626,7 @@ class BertModel(PreTrainedBertModel): extended_attention_mask, output_all_encoded_layers=output_all_encoded_layers, checkpoint_activations=checkpoint_activations, - detach_index=detach_index, - pruning_threshold=pruning_threshold) + detach_index=detach_index) sequence_output = encoded_layers[-1] for p in self.pooler.parameters(): if p is None: @@ -709,18 +651,6 @@ class DecodeLayer(nn.Module): std=config.initializer_range, num_layers=config.num_hidden_layers) - self_pruning_method = config.pruning_method - cross_pruning_method = config.pruning_method - ffn_pruning_method = config.pruning_method - - if config.ft_module is not None: - if 'decoder_self' in config.ft_module: - self_pruning_method = 'finetune' - if 'decoder_cross' in config.ft_module: - cross_pruning_method = 'finetune' - if 'decoder_ffn' in config.ft_module: - ffn_pruning_method = 'finetune' - self.attention = mpu.GPT2ParallelSelfAttention( hidden_size=config.hidden_size, num_attention_heads=config.num_attention_heads, @@ -728,13 +658,6 @@ class DecodeLayer(nn.Module): output_dropout_prob=config.hidden_dropout_prob, init_method=init_method, output_layer_init_method=output_layer_init_method, - pruning_method=self_pruning_method if config.pruning_module in [ - 'all', 'decoder', 'decoder_self', 'decoder_self+ffn' - ] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.cross_attention = mpu.PalmParallelCrossAttention( @@ -745,12 +668,6 @@ class DecodeLayer(nn.Module): init_method=init_method, attn_separate=False, output_layer_init_method=output_layer_init_method, - pruning_method=cross_pruning_method, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - pruning_module=config.pruning_module, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.input_layernorm = BertLayerNorm( @@ -765,12 +682,6 @@ class DecodeLayer(nn.Module): config.intermediate_size, gather_output=False, init_method=init_method, - pruning_method=ffn_pruning_method if config.pruning_module - in ['all', 'decoder', 'decoder_ffn', 'decoder_self+ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.intermediate_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act @@ -779,12 +690,6 @@ class DecodeLayer(nn.Module): config.hidden_size, input_is_parallel=True, init_method=output_layer_init_method, - pruning_method=ffn_pruning_method if config.pruning_module - in ['all', 'decoder', 
'decoder_ffn', 'decoder_self+ffn'] else None, - pruning_mask_init=config.pruning_mask_init, - pruning_mask_scale=config.pruning_mask_scale, - LR_weight_rank=config.LR_weight_rank, - LR_mask_rank=config.LR_mask_rank, ) self.dropout = torch.nn.Dropout(config.hidden_dropout_prob) @@ -804,8 +709,7 @@ class DecodeLayer(nn.Module): enc_hidden_states, enc_attn_mask, dec_attn_mask, - is_infer=False, - pruning_threshold=None): + is_infer=False): residual = hidden_states previous_type = hidden_states.type() hidden_states = self.input_layernorm( @@ -813,10 +717,7 @@ class DecodeLayer(nn.Module): if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) hidden_states = self.attention( - hidden_states, - dec_attn_mask, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + hidden_states, dec_attn_mask, is_infer=is_infer) hidden_states = residual + hidden_states @@ -825,23 +726,18 @@ class DecodeLayer(nn.Module): self.type_converter(hidden_states)) if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) - hidden_states = self.cross_attention( - hidden_states, - enc_hidden_states, - enc_attn_mask, - pruning_threshold=pruning_threshold) + hidden_states = self.cross_attention(hidden_states, enc_hidden_states, + enc_attn_mask) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.post_cross_attention_layernorm( self.type_converter(hidden_states)) if self.fp32_layernorm: hidden_states = hidden_states.type(previous_type) - hidden_states = self.intermediate( - hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.intermediate(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) - hidden_states = self.output( - hidden_states, pruning_threshold=pruning_threshold) + hidden_states = self.output(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = residual + hidden_states @@ -866,8 +762,7 @@ class BertDecoder(nn.Module): dec_attn_mask, checkpoint_activations=False, output_all_encoded_layers=False, - is_infer=False, - pruning_threshold=None): + is_infer=False): def custom(start, end): @@ -880,8 +775,7 @@ class BertDecoder(nn.Module): inputs[1], inputs[2], dec_attn_mask * 1, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) return x_ return custom_forward @@ -904,8 +798,7 @@ class BertDecoder(nn.Module): enc_hidden_states, enc_attn_mask, dec_attn_mask, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) previous_type = hidden_states.type() if self.fp32_layernorm: @@ -932,8 +825,7 @@ class DecodeModel(PreTrainedBertModel): enc_attn_mask=None, dec_attn_mask=None, checkpoint_activations=False, - is_infer=False, - pruning_threshold=None): + is_infer=False): extended_attention_mask = enc_attn_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to( dtype=next(self.decoder.parameters()).dtype) # fp16 compatibility @@ -946,8 +838,7 @@ class DecodeModel(PreTrainedBertModel): extended_attention_mask, dec_attn_mask, checkpoint_activations=False, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) return sequence_output[-1] @@ -972,16 +863,14 @@ class PalmForPreTraining(PreTrainedBertModel): checkpoint_activations=False, is_infer=False, sequence_output=None, - parallel_output=True, - pruning_threshold=None): + parallel_output=True): if sequence_output is None: sequence_output, pooled_output = self.bert( input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, - 
checkpoint_activations=checkpoint_activations, - pruning_threshold=pruning_threshold) + checkpoint_activations=checkpoint_activations) prediction_scores, seq_relationship_score = self.cls( sequence_output, pooled_output) else: @@ -998,8 +887,7 @@ class PalmForPreTraining(PreTrainedBertModel): attention_mask, decode_attention_mask, checkpoint_activations=checkpoint_activations, - is_infer=is_infer, - pruning_threshold=pruning_threshold) + is_infer=is_infer) transformer_output_parallel = mpu.copy_to_model_parallel_region( decode_output) @@ -1017,6 +905,29 @@ class PalmForPreTraining(PreTrainedBertModel): class PlugModel(torch.nn.Module): + """ + The bare Plug Model transformer outputting raw hidden-states without any specific head on top. + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`PlugNLGConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~DistributedPlug.initialize_model`] method to load the model weights. + Example: + + ```python + >>> # The PLUG model has 27B parameters and usually need to run on multiple GPUs. The example given + >>> # here only initializes a slice of the model on a single GPU. + >>> # Check out the [`~DistributedPipeline.__init__`] method to initialize entire PLUG model. + >>> from modelscope.models.nlp.plug import PlugNLGConfig, PlugModel + + >>> # Initializing a Plug configuration + >>> configuration = PlugNLGConfig() + + >>> # Initializing a model from the configuration + >>> model = PlugModel(configuration) + """ def __init__(self, config): super(PlugModel, self).__init__() @@ -1034,6 +945,58 @@ class PlugModel(torch.nn.Module): is_infer=False, sequence_output=None, parallel_output=True): + """ + Parameters: + input_tokens (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`): + `input_tokens_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary. + Indices can be obtained using transformers [`BertTokenizer`]. See + [`TextGenerationPreprocessor.__call__`] for details. + token_type_ids (`torch.LongTensor` of shape `(batch_size, input_tokens_length)`, *optional*, defaults to + None): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + target_tokens (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Target token ids(labels) for language modeling. Note that the labels **are shifted** inside the model, + i.e. 
you can set `target_tokens = input_tokens` Indices are selected in + `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only + computed for labels in `[0, ..., config.vocab_size]` + + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults to None): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.max_position_embeddings - 1]`. + + decode_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*, defaults + to None): + Mask to avoid performing attention on padding token indices of target tokens. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + checkpoint_activations (`boolean`, *optional*, defaults to `False`): + Whether gradient checkpointing is activated for this model or not. + is_infer (`boolean`, *optional*, defaults to `False`): + Whether or not to perform single inference. + sequence_output (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, + defaults to None): + Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the + model. A single forward() call can produce one single token. To generate the current token, the + sequence_output generated by the `forward()` of the previous token is required. + parallel_output (`boolean`, *optional*, defaults to `True`): + To parallel return output, or gather it before return. + + + """ return self.model( input_tokens, token_type_ids, From 69da8f91ac5ca420408100c4ec5abd0c5987e65a Mon Sep 17 00:00:00 2001 From: "ashui.cbh" Date: Tue, 11 Oct 2022 20:49:13 +0800 Subject: [PATCH 11/57] [to #42322933]suport image inpainting Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10111615 --- .../image_inpainting/image_inpainting.png | 3 + .../image_inpainting_mask.png | 3 + modelscope/metainfo.py | 5 + modelscope/metrics/__init__.py | 2 + modelscope/metrics/builder.py | 2 + modelscope/metrics/image_inpainting_metric.py | 210 +++++++ modelscope/models/cv/__init__.py | 17 +- .../models/cv/crowd_counting/cc_model.py | 2 + .../cv/crowd_counting/hrnet_aspp_relu.py | 14 +- .../models/cv/image_inpainting/__init__.py | 22 + modelscope/models/cv/image_inpainting/base.py | 75 +++ .../models/cv/image_inpainting/default.py | 210 +++++++ .../models/cv/image_inpainting/model.py | 36 ++ .../cv/image_inpainting/modules/__init__.py | 0 .../modules/ade20k/__init__.py | 2 + .../image_inpainting/modules/ade20k/base.py | 380 +++++++++++ .../image_inpainting/modules/ade20k/resnet.py | 183 ++++++ .../image_inpainting/modules/adversarial.py | 167 +++++ .../modules/feature_matching.py | 45 ++ .../models/cv/image_inpainting/modules/ffc.py | 588 ++++++++++++++++++ .../cv/image_inpainting/modules/inception.py | 324 ++++++++++ .../cv/image_inpainting/modules/perceptual.py | 47 ++ .../cv/image_inpainting/modules/pix2pixhd.py | 75 +++ .../models/cv/image_inpainting/refinement.py | 393 ++++++++++++ .../msdatasets/task_datasets/__init__.py | 2 + .../image_inpainting/__init__.py | 2 + .../task_datasets/image_inpainting/aug.py | 100 +++ .../image_inpainting_dataset.py | 337 ++++++++++ modelscope/outputs.py | 1 + modelscope/pipelines/builder.py | 2 + modelscope/pipelines/cv/__init__.py | 2 + .../pipelines/cv/image_inpainting_pipeline.py | 146 +++++ modelscope/trainers/__init__.py | 5 +- modelscope/trainers/cv/__init__.py | 4 +- 
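Relating back to the `PlugModel.forward` arguments documented above, the 0/1 conventions for `token_type_ids` (sentence A vs. B) and `attention_mask` (1 = attend, 0 = padding) can be illustrated with a toy helper; the ids and the helper name are made up for the example:

```python
# Toy illustration of the segment/mask conventions from the forward() docstring.
import torch

def build_plug_inputs(ids_a, ids_b, max_len):
    ids = ids_a + ids_b
    pad = max_len - len(ids)
    input_tokens = torch.tensor([ids + [0] * pad])
    token_type_ids = torch.tensor([[0] * len(ids_a) + [1] * len(ids_b) + [0] * pad])
    attention_mask = torch.tensor([[1] * len(ids) + [0] * pad])  # 1 = real token, 0 = padding
    return input_tokens, token_type_ids, attention_mask

inputs = build_plug_inputs([101, 2769, 102], [704, 1744, 102], max_len=8)
print([t.shape for t in inputs])  # three (1, 8) tensors
```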
.../trainers/cv/image_inpainting_trainer.py | 111 ++++ modelscope/utils/constant.py | 4 +- requirements/cv.txt | 2 + tests/pipelines/test_image_inpainting.py | 77 +++ tests/run_config.yaml | 1 + .../trainers/test_image_inpainting_trainer.py | 84 +++ 40 files changed, 3666 insertions(+), 19 deletions(-) create mode 100644 data/test/images/image_inpainting/image_inpainting.png create mode 100644 data/test/images/image_inpainting/image_inpainting_mask.png create mode 100644 modelscope/metrics/image_inpainting_metric.py create mode 100644 modelscope/models/cv/image_inpainting/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/base.py create mode 100644 modelscope/models/cv/image_inpainting/default.py create mode 100644 modelscope/models/cv/image_inpainting/model.py create mode 100644 modelscope/models/cv/image_inpainting/modules/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/base.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py create mode 100644 modelscope/models/cv/image_inpainting/modules/adversarial.py create mode 100644 modelscope/models/cv/image_inpainting/modules/feature_matching.py create mode 100644 modelscope/models/cv/image_inpainting/modules/ffc.py create mode 100644 modelscope/models/cv/image_inpainting/modules/inception.py create mode 100644 modelscope/models/cv/image_inpainting/modules/perceptual.py create mode 100644 modelscope/models/cv/image_inpainting/modules/pix2pixhd.py create mode 100644 modelscope/models/cv/image_inpainting/refinement.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/__init__.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/aug.py create mode 100644 modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py create mode 100644 modelscope/pipelines/cv/image_inpainting_pipeline.py create mode 100644 modelscope/trainers/cv/image_inpainting_trainer.py create mode 100644 tests/pipelines/test_image_inpainting.py create mode 100644 tests/trainers/test_image_inpainting_trainer.py diff --git a/data/test/images/image_inpainting/image_inpainting.png b/data/test/images/image_inpainting/image_inpainting.png new file mode 100644 index 00000000..e141012d --- /dev/null +++ b/data/test/images/image_inpainting/image_inpainting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5 +size 2747938 diff --git a/data/test/images/image_inpainting/image_inpainting_mask.png b/data/test/images/image_inpainting/image_inpainting_mask.png new file mode 100644 index 00000000..e30f67e7 --- /dev/null +++ b/data/test/images/image_inpainting/image_inpainting_mask.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15 +size 27616 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 77627abc..cae9d188 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -27,6 +27,7 @@ class Models(object): face_2d_keypoints = 'face-2d-keypoints' panoptic_segmentation = 'swinL-panoptic-segmentation' image_reid_person = 'passvitb' + image_inpainting = 'FFTInpainting' video_summarization = 'pgl-video-summarization' swinL_semantic_segmentation = 'swinL-semantic-segmentation' vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' @@ -179,6 +180,7 @@ class 
Pipelines(object): video_summarization = 'googlenet_pgl_video_summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_reid_person = 'passvitb-image-reid-person' + image_inpainting = 'fft-inpainting' text_driven_segmentation = 'text-driven-segmentation' movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' @@ -264,6 +266,7 @@ class Trainers(object): image_portrait_enhancement = 'image-portrait-enhancement' video_summarization = 'video-summarization' movie_scene_segmentation = 'movie-scene-segmentation' + image_inpainting = 'image-inpainting' # nlp trainers bert_sentiment_analysis = 'bert-sentiment-analysis' @@ -363,6 +366,8 @@ class Metrics(object): video_summarization_metric = 'video-summarization-metric' # metric for movie-scene-segmentation task movie_scene_segmentation_metric = 'movie-scene-segmentation-metric' + # metric for inpainting task + image_inpainting_metric = 'image-inpainting-metric' class Optimizers(object): diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index d3975a2c..e6a03a22 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: from .token_classification_metric import TokenClassificationMetric from .video_summarization_metric import VideoSummarizationMetric from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric + from .image_inpainting_metric import ImageInpaintingMetric else: _import_structure = { @@ -34,6 +35,7 @@ else: 'token_classification_metric': ['TokenClassificationMetric'], 'video_summarization_metric': ['VideoSummarizationMetric'], 'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'], + 'image_inpainting_metric': ['ImageInpaintingMetric'], } import sys diff --git a/modelscope/metrics/builder.py b/modelscope/metrics/builder.py index 9e875cc4..ee4d2840 100644 --- a/modelscope/metrics/builder.py +++ b/modelscope/metrics/builder.py @@ -18,6 +18,7 @@ class MetricKeys(object): SSIM = 'ssim' AVERAGE_LOSS = 'avg_loss' FScore = 'fscore' + FID = 'fid' BLEU_1 = 'bleu-1' BLEU_4 = 'bleu-4' ROUGE_1 = 'rouge-1' @@ -39,6 +40,7 @@ task_default_metrics = { Tasks.image_captioning: [Metrics.text_gen_metric], Tasks.visual_question_answering: [Metrics.text_gen_metric], Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric], + Tasks.image_inpainting: [Metrics.image_inpainting_metric], } diff --git a/modelscope/metrics/image_inpainting_metric.py b/modelscope/metrics/image_inpainting_metric.py new file mode 100644 index 00000000..954d4ca2 --- /dev/null +++ b/modelscope/metrics/image_inpainting_metric.py @@ -0,0 +1,210 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict + +import numpy as np +import torch +import torch.nn.functional as F +from scipy import linalg + +from modelscope.metainfo import Metrics +from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3 +from modelscope.utils.registry import default_group +from modelscope.utils.tensor_utils import (torch_nested_detach, + torch_nested_numpify) +from .base import Metric +from .builder import METRICS, MetricKeys + + +def fid_calculate_activation_statistics(act): + mu = np.mean(act, axis=0) + sigma = np.cov(act, rowvar=False) + return mu, sigma + + +def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6): + mu1, sigma1 = 
fid_calculate_activation_statistics(activations_pred) + mu2, sigma2 = fid_calculate_activation_statistics(activations_target) + + diff = mu1 - mu2 + + # Product might be almost singular + covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) + if not np.isfinite(covmean).all(): + offset = np.eye(sigma1.shape[0]) * eps + covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) + + # Numerical error might give slight imaginary component + if np.iscomplexobj(covmean): + # if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3): + if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2): + m = np.max(np.abs(covmean.imag)) + raise ValueError('Imaginary component {}'.format(m)) + covmean = covmean.real + + tr_covmean = np.trace(covmean) + + return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) + - 2 * tr_covmean) + + +class FIDScore(torch.nn.Module): + + def __init__(self, dims=2048, eps=1e-6): + super().__init__() + if getattr(FIDScore, '_MODEL', None) is None: + block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims] + FIDScore._MODEL = InceptionV3([block_idx]).eval() + self.model = FIDScore._MODEL + self.eps = eps + self.reset() + + def forward(self, pred_batch, target_batch, mask=None): + activations_pred = self._get_activations(pred_batch) + activations_target = self._get_activations(target_batch) + + self.activations_pred.append(activations_pred.detach().cpu()) + self.activations_target.append(activations_target.detach().cpu()) + + def get_value(self): + activations_pred, activations_target = (self.activations_pred, + self.activations_target) + activations_pred = torch.cat(activations_pred).cpu().numpy() + activations_target = torch.cat(activations_target).cpu().numpy() + + total_distance = calculate_frechet_distance( + activations_pred, activations_target, eps=self.eps) + + self.reset() + return total_distance + + def reset(self): + self.activations_pred = [] + self.activations_target = [] + + def _get_activations(self, batch): + activations = self.model(batch)[0] + if activations.shape[2] != 1 or activations.shape[3] != 1: + assert False, \ + 'We should not have got here, because Inception always scales inputs to 299x299' + activations = activations.squeeze(-1).squeeze(-1) + return activations + + +class SSIM(torch.nn.Module): + """SSIM. 
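For reference, `calculate_frechet_distance` above evaluates the closed-form Fréchet distance d² = ‖μ₁ − μ₂‖² + Tr(Σ₁ + Σ₂ − 2(Σ₁Σ₂)^{1/2}) between Gaussians fitted to the two activation sets. A quick sanity check, assuming the function is imported from the new metric module; random 8-d activations stand in for Inception features:

```python
# Identical activation sets give ~0; shifting every one of the 8 dimensions
# by 1 changes only the mean term, so the distance becomes ~8.
import numpy as np
from modelscope.metrics.image_inpainting_metric import calculate_frechet_distance

act = np.random.RandomState(0).randn(64, 8)
print(round(calculate_frechet_distance(act, act), 6))        # ~0.0
print(round(calculate_frechet_distance(act, act + 1.0), 6))  # ~8.0
```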
Modified from: + https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py + """ + + def __init__(self, window_size=11, size_average=True): + super().__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.register_buffer('window', + self._create_window(window_size, self.channel)) + + def forward(self, img1, img2): + assert len(img1.shape) == 4 + + channel = img1.size()[1] + + if channel == self.channel and self.window.data.type( + ) == img1.data.type(): + window = self.window + else: + window = self._create_window(self.window_size, channel) + + window = window.type_as(img1) + + self.window = window + self.channel = channel + + return self._ssim(img1, img2, window, self.window_size, channel, + self.size_average) + + def _gaussian(self, window_size, sigma): + gauss = torch.Tensor([ + np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2)) + for x in range(window_size) + ]) + return gauss / gauss.sum() + + def _create_window(self, window_size, channel): + _1D_window = self._gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm( + _1D_window.t()).float().unsqueeze(0).unsqueeze(0) + return _2D_window.expand(channel, 1, window_size, + window_size).contiguous() + + def _ssim(self, + img1, + img2, + window, + window_size, + channel, + size_average=True): + mu1 = F.conv2d( + img1, window, padding=(window_size // 2), groups=channel) + mu2 = F.conv2d( + img2, window, padding=(window_size // 2), groups=channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1 * mu2 + + sigma1_sq = F.conv2d( + img1 * img1, window, padding=(window_size // 2), + groups=channel) - mu1_sq + sigma2_sq = F.conv2d( + img2 * img2, window, padding=(window_size // 2), + groups=channel) - mu2_sq + sigma12 = F.conv2d( + img1 * img2, window, padding=(window_size // 2), + groups=channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \ + ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + + return ssim_map.mean(1).mean(1).mean(1) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + return + + +@METRICS.register_module( + group_key=default_group, module_name=Metrics.image_inpainting_metric) +class ImageInpaintingMetric(Metric): + """The metric computation class for image inpainting classes. + """ + + def __init__(self): + self.preds = [] + self.targets = [] + self.SSIM = SSIM(window_size=11, size_average=False).eval() + device = 'cuda' if torch.cuda.is_available() else 'cpu' + self.FID = FIDScore().to(device) + + def add(self, outputs: Dict, inputs: Dict): + pred = outputs['inpainted'] + target = inputs['image'] + self.preds.append(torch_nested_detach(pred)) + self.targets.append(torch_nested_detach(target)) + + def evaluate(self): + ssim_list = [] + for (pred, target) in zip(self.preds, self.targets): + ssim_list.append(self.SSIM(pred, target)) + self.FID(pred, target) + ssim_list = torch_nested_numpify(ssim_list) + fid = self.FID.get_value() + return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid} diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index f2798b59..ba7b03c5 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -5,13 +5,14 @@ from . 
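A quick way to see the SSIM module above behaving as expected, assuming N×C×H×W inputs in [0, 1] and that the class is importable from the new metric module; the values shown are approximate:

```python
# SSIM of an image with itself is exactly 1; against its inverse it drops sharply.
import torch
from modelscope.metrics.image_inpainting_metric import SSIM

ssim = SSIM(window_size=11, size_average=True).eval()
img = torch.rand(1, 3, 64, 64)
print(float(ssim(img, img)))        # ~1.0
print(float(ssim(img, 1.0 - img)))  # well below 1.0
```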
import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, face_generation, image_classification, image_color_enhance, - image_colorization, image_denoise, image_instance_segmentation, - image_panoptic_segmentation, image_portrait_enhancement, - image_reid_person, image_semantic_segmentation, - image_to_image_generation, image_to_image_translation, - movie_scene_segmentation, object_detection, - product_retrieval_embedding, realtime_object_detection, - salient_detection, shop_segmentation, super_resolution, - video_single_object_tracking, video_summarization, virual_tryon) + image_colorization, image_denoise, image_inpainting, + image_instance_segmentation, image_panoptic_segmentation, + image_portrait_enhancement, image_reid_person, + image_semantic_segmentation, image_to_image_generation, + image_to_image_translation, movie_scene_segmentation, + object_detection, product_retrieval_embedding, + realtime_object_detection, salient_detection, shop_segmentation, + super_resolution, video_single_object_tracking, + video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/crowd_counting/cc_model.py b/modelscope/models/cv/crowd_counting/cc_model.py index 582b26f4..16fbc261 100644 --- a/modelscope/models/cv/crowd_counting/cc_model.py +++ b/modelscope/models/cv/crowd_counting/cc_model.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, Optional, Union diff --git a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py index 982ba939..0d1bd3ca 100644 --- a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py +++ b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py @@ -1,10 +1,10 @@ -# ------------------------------------------------------------------------------ -# Copyright (c) Microsoft -# Licensed under the MIT License. -# Written by Bin Xiao (Bin.Xiao@microsoft.com) -# Modified by Ke Sun (sunk@mail.ustc.edu.cn) -# https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py -# ------------------------------------------------------------------------------ +""" +Copyright (c) Microsoft +Licensed under the MIT License. +Written by Bin Xiao (Bin.Xiao@microsoft.com) +Modified by Ke Sun (sunk@mail.ustc.edu.cn) +https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py +""" import functools import logging diff --git a/modelscope/models/cv/image_inpainting/__init__.py b/modelscope/models/cv/image_inpainting/__init__.py new file mode 100644 index 00000000..e7c63cd4 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
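With `image_inpainting` added to the `modelscope.models.cv` import tuple above, the subpackage (and the model class exposed through its lazy `__init__`, shown next) is importable in the usual way; a usage note rather than part of the patch:

```python
# The lazy-import __init__ keeps package import cheap; heavy torch/cv
# dependencies are only pulled in when the symbol is actually touched.
from modelscope.models.cv import image_inpainting  # noqa: F401
from modelscope.models.cv.image_inpainting import FFTInpainting
```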
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .model import FFTInpainting + +else: + _import_structure = { + 'model': ['FFTInpainting'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_inpainting/base.py b/modelscope/models/cv/image_inpainting/base.py new file mode 100644 index 00000000..04e73630 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/base.py @@ -0,0 +1,75 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger +from .modules.adversarial import NonSaturatingWithR1 +from .modules.ffc import FFCResNetGenerator +from .modules.perceptual import ResNetPL +from .modules.pix2pixhd import NLayerDiscriminator + +LOGGER = get_logger() + + +class BaseInpaintingTrainingModule(nn.Module): + + def __init__(self, + model_dir='', + use_ddp=True, + predict_only=False, + visualize_each_iters=100, + average_generator=False, + generator_avg_beta=0.999, + average_generator_start_step=30000, + average_generator_period=10, + store_discr_outputs_for_vis=False, + **kwargs): + super().__init__() + LOGGER.info( + f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}' + ) + + self.generator = FFCResNetGenerator() + self.use_ddp = use_ddp + + if not predict_only: + self.discriminator = NLayerDiscriminator() + self.adversarial_loss = NonSaturatingWithR1( + weight=10, + gp_coef=0.001, + mask_as_fake_target=True, + allow_scale_mask=True) + + self.average_generator = average_generator + self.generator_avg_beta = generator_avg_beta + self.average_generator_start_step = average_generator_start_step + self.average_generator_period = average_generator_period + self.generator_average = None + self.last_generator_averaging_step = -1 + self.store_discr_outputs_for_vis = store_discr_outputs_for_vis + + self.loss_l1 = nn.L1Loss(reduction='none') + + self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir) + + self.visualize_each_iters = visualize_each_iters + LOGGER.info('BaseInpaintingTrainingModule init done') + + def forward(self, batch: Dict[str, + torch.Tensor]) -> Dict[str, torch.Tensor]: + """Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys""" + raise NotImplementedError() + + def generator_loss(self, + batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + raise NotImplementedError() + + def discriminator_loss( + self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + raise NotImplementedError() diff --git a/modelscope/models/cv/image_inpainting/default.py b/modelscope/models/cv/image_inpainting/default.py new file mode 100644 index 00000000..5f57d63f --- /dev/null +++ b/modelscope/models/cv/image_inpainting/default.py @@ -0,0 +1,210 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import bisect + +import torch +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger +from .base import BaseInpaintingTrainingModule +from .modules.feature_matching import feature_matching_loss, masked_l1_loss + +LOGGER = get_logger() + + +def set_requires_grad(module, 
value): + for param in module.parameters(): + param.requires_grad = value + + +def add_prefix_to_keys(dct, prefix): + return {prefix + k: v for k, v in dct.items()} + + +class LinearRamp: + + def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): + self.start_value = start_value + self.end_value = end_value + self.start_iter = start_iter + self.end_iter = end_iter + + def __call__(self, i): + if i < self.start_iter: + return self.start_value + if i >= self.end_iter: + return self.end_value + part = (i - self.start_iter) / (self.end_iter - self.start_iter) + return self.start_value * (1 - part) + self.end_value * part + + +class LadderRamp: + + def __init__(self, start_iters, values): + self.start_iters = start_iters + self.values = values + assert len(values) == len(start_iters) + 1, (len(values), + len(start_iters)) + + def __call__(self, i): + segment_i = bisect.bisect_right(self.start_iters, i) + return self.values[segment_i] + + +def get_ramp(kind='ladder', **kwargs): + if kind == 'linear': + return LinearRamp(**kwargs) + if kind == 'ladder': + return LadderRamp(**kwargs) + raise ValueError(f'Unexpected ramp kind: {kind}') + + +class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule): + + def __init__(self, + model_dir='', + predict_only=False, + concat_mask=True, + rescale_scheduler_kwargs=None, + image_to_discriminator='predicted_image', + add_noise_kwargs=None, + noise_fill_hole=False, + const_area_crop_kwargs=None, + distance_weighter_kwargs=None, + distance_weighted_mask_for_discr=False, + fake_fakes_proba=0, + fake_fakes_generator_kwargs=None, + **kwargs): + super().__init__(model_dir=model_dir, predict_only=predict_only) + self.concat_mask = concat_mask + self.rescale_size_getter = get_ramp( + **rescale_scheduler_kwargs + ) if rescale_scheduler_kwargs is not None else None + self.image_to_discriminator = image_to_discriminator + self.add_noise_kwargs = add_noise_kwargs + self.noise_fill_hole = noise_fill_hole + self.const_area_crop_kwargs = const_area_crop_kwargs + self.refine_mask_for_losses = None + self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr + + self.feature_matching_weight = 100 + self.losses_l1_weight_known = 10 + self.losses_l1_weight_missing = 0 + self.fake_fakes_proba = fake_fakes_proba + + def forward(self, batch): + img = batch['image'] + mask = batch['mask'] + + masked_img = img * (1 - mask) + + if self.concat_mask: + masked_img = torch.cat([masked_img, mask], dim=1) + + batch['predicted_image'] = self.generator(masked_img) + batch['inpainted'] = mask * batch['predicted_image'] + ( + 1 - mask) * batch['image'] + + batch['mask_for_losses'] = mask + + return batch + + def generator_loss(self, batch): + img = batch['image'] + predicted_img = batch[self.image_to_discriminator] + original_mask = batch['mask'] + supervised_mask = batch['mask_for_losses'] + + # L1 + l1_value = masked_l1_loss(predicted_img, img, supervised_mask, + self.losses_l1_weight_known, + self.losses_l1_weight_missing) + + total_loss = l1_value + metrics = dict(gen_l1=l1_value) + + # discriminator + # adversarial_loss calls backward by itself + mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask + self.adversarial_loss.pre_generator_step( + real_batch=img, + fake_batch=predicted_img, + generator=self.generator, + discriminator=self.discriminator) + discr_real_pred, discr_real_features = self.discriminator(img) + discr_fake_pred, discr_fake_features = self.discriminator( + predicted_img) + adv_gen_loss, 
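The `LinearRamp`/`LadderRamp` helpers above are small iteration-indexed schedulers, and `get_ramp` selects one by kind. A quick illustration of the two behaviours; the concrete values are assumptions for the example only:

```python
# LinearRamp interpolates between start_value and end_value over the iteration
# window; LadderRamp is piecewise constant with thresholds in start_iters.
from modelscope.models.cv.image_inpainting.default import LadderRamp, LinearRamp

linear = LinearRamp(start_value=0.0, end_value=1.0, start_iter=0, end_iter=10)
print([linear(i) for i in (0, 5, 10)])        # [0.0, 0.5, 1.0]

ladder = LadderRamp(start_iters=[1000, 5000], values=[0.0, 0.5, 1.0])
print(ladder(0), ladder(2000), ladder(7000))  # 0.0 0.5 1.0
```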
adv_metrics = self.adversarial_loss.generator_loss( + real_batch=img, + fake_batch=predicted_img, + discr_real_pred=discr_real_pred, + discr_fake_pred=discr_fake_pred, + mask=mask_for_discr) + total_loss = total_loss + adv_gen_loss + metrics['gen_adv'] = adv_gen_loss + metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) + + # feature matching + if self.feature_matching_weight > 0: + need_mask_in_fm = False + mask_for_fm = supervised_mask if need_mask_in_fm else None + fm_value = feature_matching_loss( + discr_fake_features, discr_real_features, + mask=mask_for_fm) * self.feature_matching_weight + total_loss = total_loss + fm_value + metrics['gen_fm'] = fm_value + + if self.loss_resnet_pl is not None: + resnet_pl_value = self.loss_resnet_pl(predicted_img, img) + total_loss = total_loss + resnet_pl_value + metrics['gen_resnet_pl'] = resnet_pl_value + + return total_loss, metrics + + def discriminator_loss(self, batch): + total_loss = 0 + metrics = {} + + predicted_img = batch[self.image_to_discriminator].detach() + self.adversarial_loss.pre_discriminator_step( + real_batch=batch['image'], + fake_batch=predicted_img, + generator=self.generator, + discriminator=self.discriminator) + discr_real_pred, discr_real_features = self.discriminator( + batch['image']) + discr_fake_pred, discr_fake_features = self.discriminator( + predicted_img) + adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss( + real_batch=batch['image'], + fake_batch=predicted_img, + discr_real_pred=discr_real_pred, + discr_fake_pred=discr_fake_pred, + mask=batch['mask']) + + total_loss = (total_loss + adv_discr_loss) * 0.1 + metrics['discr_adv'] = adv_discr_loss + metrics.update(add_prefix_to_keys(adv_metrics, 'adv_')) + + return total_loss, metrics + + def _do_step(self, batch, optimizer_idx=None): + if optimizer_idx == 0: # step for generator + set_requires_grad(self.generator, True) + set_requires_grad(self.discriminator, False) + elif optimizer_idx == 1: # step for discriminator + set_requires_grad(self.generator, False) + set_requires_grad(self.discriminator, True) + + batch = self(batch) + total_loss = 0 + if optimizer_idx is None or optimizer_idx == 0: # step for generator + total_loss, metrics = self.generator_loss(batch) + + elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator + total_loss, metrics = self.discriminator_loss(batch) + + result = dict(loss=total_loss) + return result diff --git a/modelscope/models/cv/image_inpainting/model.py b/modelscope/models/cv/image_inpainting/model.py new file mode 100644 index 00000000..b12f6edd --- /dev/null +++ b/modelscope/models/cv/image_inpainting/model.py @@ -0,0 +1,36 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
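The `_do_step` method above alternates between a generator step (`optimizer_idx=0`, discriminator frozen) and a discriminator step (`optimizer_idx=1`, generator frozen). A minimal sketch of how a trainer might drive it with two optimizers; the optimizer choice and learning rates are assumptions, not taken from the patch:

```python
# One alternating GAN iteration over _do_step; hyper-parameters are illustrative.
import torch

def gan_train_step(module, batch, opt_g, opt_d):
    # Generator update: _do_step(optimizer_idx=0) freezes the discriminator.
    opt_g.zero_grad()
    g_loss = module._do_step(batch, optimizer_idx=0)['loss']
    g_loss.backward()
    opt_g.step()

    # Discriminator update: _do_step(optimizer_idx=1) freezes the generator.
    opt_d.zero_grad()
    d_loss = module._do_step(batch, optimizer_idx=1)['loss']
    d_loss.backward()
    opt_d.step()
    return float(g_loss), float(d_loss)

# e.g. opt_g = torch.optim.Adam(module.generator.parameters(), lr=1e-3)
#      opt_d = torch.optim.Adam(module.discriminator.parameters(), lr=1e-4)
```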
+import os +from typing import Any, Dict, Optional, Union + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +LOGGER = get_logger() + + +@MODELS.register_module( + Tasks.image_inpainting, module_name=Models.image_inpainting) +class FFTInpainting(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + super().__init__(model_dir, **kwargs) + + from .default import DefaultInpaintingTrainingModule + pretrained = kwargs.get('pretrained', True) + predict_only = kwargs.get('predict_only', False) + net = DefaultInpaintingTrainingModule( + model_dir=model_dir, predict_only=predict_only) + if pretrained: + path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE) + LOGGER.info(f'loading pretrained model from {path}') + state = torch.load(path, map_location='cpu') + net.load_state_dict(state, strict=False) + self.model = net + + def forward(self, inputs): + return self.model(inputs) diff --git a/modelscope/models/cv/image_inpainting/modules/__init__.py b/modelscope/models/cv/image_inpainting/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py b/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py new file mode 100644 index 00000000..89c3e293 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .base import ModelBuilder diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/base.py b/modelscope/models/cv/image_inpainting/modules/ade20k/base.py new file mode 100644 index 00000000..02bd3cc4 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/base.py @@ -0,0 +1,380 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules import BatchNorm2d + +from . import resnet + +NUM_CLASS = 150 + + +# Model Builder +class ModelBuilder: + # custom weights initialization + @staticmethod + def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + nn.init.kaiming_normal_(m.weight.data) + elif classname.find('BatchNorm') != -1: + m.weight.data.fill_(1.) 
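`FFTInpainting` above wraps the training module and loads `ModelFile.TORCH_MODEL_FILE` from `model_dir`. A hedged usage sketch: the local path is a placeholder, the 512×512 shapes are only an example, and the mask uses 1 for the region to fill, matching the `forward()` shown earlier:

```python
# Usage sketch for FFTInpainting; model_dir must contain the released weights.
import torch
from modelscope.models.cv.image_inpainting.model import FFTInpainting

model = FFTInpainting('/path/to/model_dir', predict_only=True).eval()
batch = {
    'image': torch.rand(1, 3, 512, 512),                 # RGB in [0, 1]
    'mask': (torch.rand(1, 1, 512, 512) > 0.9).float(),  # 1 = hole to fill
}
with torch.no_grad():
    out = model(batch)
print(out['inpainted'].shape)  # torch.Size([1, 3, 512, 512])
```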
+ m.bias.data.fill_(1e-4) + + @staticmethod + def build_encoder(arch='resnet50dilated', + fc_dim=512, + weights='', + model_dir=''): + pretrained = True if len(weights) == 0 else False + arch = arch.lower() + if arch == 'resnet50dilated': + orig_resnet = resnet.__dict__['resnet50']( + pretrained=pretrained, model_dir=model_dir) + net_encoder = ResnetDilated(orig_resnet, dilate_scale=8) + elif arch == 'resnet50': + orig_resnet = resnet.__dict__['resnet50']( + pretrained=pretrained, model_dir=model_dir) + net_encoder = Resnet(orig_resnet) + else: + raise Exception('Architecture undefined!') + + # encoders are usually pretrained + # net_encoder.apply(ModelBuilder.weights_init) + if len(weights) > 0: + print('Loading weights for net_encoder') + net_encoder.load_state_dict( + torch.load(weights, map_location=lambda storage, loc: storage), + strict=False) + return net_encoder + + @staticmethod + def build_decoder(arch='ppm_deepsup', + fc_dim=512, + num_class=NUM_CLASS, + weights='', + use_softmax=False, + drop_last_conv=False): + arch = arch.lower() + if arch == 'ppm_deepsup': + net_decoder = PPMDeepsup( + num_class=num_class, + fc_dim=fc_dim, + use_softmax=use_softmax, + drop_last_conv=drop_last_conv) + elif arch == 'c1_deepsup': + net_decoder = C1DeepSup( + num_class=num_class, + fc_dim=fc_dim, + use_softmax=use_softmax, + drop_last_conv=drop_last_conv) + else: + raise Exception('Architecture undefined!') + + net_decoder.apply(ModelBuilder.weights_init) + if len(weights) > 0: + print('Loading weights for net_decoder') + net_decoder.load_state_dict( + torch.load(weights, map_location=lambda storage, loc: storage), + strict=False) + return net_decoder + + @staticmethod + def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim, + drop_last_conv, *arts, **kwargs): + path = os.path.join( + weights_path, 'ade20k', + f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth') + return ModelBuilder.build_decoder( + arch=arch_decoder, + fc_dim=fc_dim, + weights=path, + use_softmax=True, + drop_last_conv=drop_last_conv) + + @staticmethod + def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim, + segmentation, *arts, **kwargs): + if segmentation: + path = os.path.join( + weights_path, 'ade20k', + f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth') + else: + path = '' + return ModelBuilder.build_encoder( + arch=arch_encoder, + fc_dim=fc_dim, + weights=path, + model_dir=weights_path) + + +def conv3x3_bn_relu(in_planes, out_planes, stride=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False), + BatchNorm2d(out_planes), + nn.ReLU(inplace=True), + ) + + +# pyramid pooling, deep supervision +class PPMDeepsup(nn.Module): + + def __init__(self, + num_class=NUM_CLASS, + fc_dim=4096, + use_softmax=False, + pool_scales=(1, 2, 3, 6), + drop_last_conv=False): + super().__init__() + self.use_softmax = use_softmax + self.drop_last_conv = drop_last_conv + + self.ppm = [] + for scale in pool_scales: + self.ppm.append( + nn.Sequential( + nn.AdaptiveAvgPool2d(scale), + nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False), + BatchNorm2d(512), nn.ReLU(inplace=True))) + self.ppm = nn.ModuleList(self.ppm) + self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) + + self.conv_last = nn.Sequential( + nn.Conv2d( + fc_dim + len(pool_scales) * 512, + 512, + kernel_size=3, + padding=1, + bias=False), BatchNorm2d(512), nn.ReLU(inplace=True), + nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1)) + 
self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + self.dropout_deepsup = nn.Dropout2d(0.1) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + + input_size = conv5.size() + ppm_out = [conv5] + for pool_scale in self.ppm: + ppm_out.append( + nn.functional.interpolate( + pool_scale(conv5), (input_size[2], input_size[3]), + mode='bilinear', + align_corners=False)) + ppm_out = torch.cat(ppm_out, 1) + + if self.drop_last_conv: + return ppm_out + else: + x = self.conv_last(ppm_out) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + return x + + # deep sup + conv4 = conv_out[-2] + _ = self.cbr_deepsup(conv4) + _ = self.dropout_deepsup(_) + _ = self.conv_last_deepsup(_) + + x = nn.functional.log_softmax(x, dim=1) + _ = nn.functional.log_softmax(_, dim=1) + + return (x, _) + + +class Resnet(nn.Module): + + def __init__(self, orig_resnet): + super(Resnet, self).__init__() + + # take pretrained resnet, except AvgPool and FC + self.conv1 = orig_resnet.conv1 + self.bn1 = orig_resnet.bn1 + self.relu1 = orig_resnet.relu1 + self.conv2 = orig_resnet.conv2 + self.bn2 = orig_resnet.bn2 + self.relu2 = orig_resnet.relu2 + self.conv3 = orig_resnet.conv3 + self.bn3 = orig_resnet.bn3 + self.relu3 = orig_resnet.relu3 + self.maxpool = orig_resnet.maxpool + self.layer1 = orig_resnet.layer1 + self.layer2 = orig_resnet.layer2 + self.layer3 = orig_resnet.layer3 + self.layer4 = orig_resnet.layer4 + + def forward(self, x, return_feature_maps=False): + conv_out = [] + + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + conv_out.append(x) + x = self.layer2(x) + conv_out.append(x) + x = self.layer3(x) + conv_out.append(x) + x = self.layer4(x) + conv_out.append(x) + + if return_feature_maps: + return conv_out + return [x] + + +# Resnet Dilated +class ResnetDilated(nn.Module): + + def __init__(self, orig_resnet, dilate_scale=8): + super().__init__() + from functools import partial + + if dilate_scale == 8: + orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2)) + orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4)) + elif dilate_scale == 16: + orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2)) + + # take pretrained resnet, except AvgPool and FC + self.conv1 = orig_resnet.conv1 + self.bn1 = orig_resnet.bn1 + self.relu1 = orig_resnet.relu1 + self.conv2 = orig_resnet.conv2 + self.bn2 = orig_resnet.bn2 + self.relu2 = orig_resnet.relu2 + self.conv3 = orig_resnet.conv3 + self.bn3 = orig_resnet.bn3 + self.relu3 = orig_resnet.relu3 + self.maxpool = orig_resnet.maxpool + self.layer1 = orig_resnet.layer1 + self.layer2 = orig_resnet.layer2 + self.layer3 = orig_resnet.layer3 + self.layer4 = orig_resnet.layer4 + + def _nostride_dilate(self, m, dilate): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + # the convolution with stride + if m.stride == (2, 2): + m.stride = (1, 1) + if m.kernel_size == (3, 3): + m.dilation = (dilate // 2, dilate // 2) + m.padding = (dilate // 2, dilate // 2) + # other convoluions + else: + if m.kernel_size == (3, 3): + m.dilation = (dilate, dilate) + m.padding = (dilate, dilate) + + def forward(self, x, return_feature_maps=False): + conv_out = [] + + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = 
self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + conv_out.append(x) + x = self.layer2(x) + conv_out.append(x) + x = self.layer3(x) + conv_out.append(x) + x = self.layer4(x) + conv_out.append(x) + + if return_feature_maps: + return conv_out + return [x] + + +# last conv, deep supervision +class C1DeepSup(nn.Module): + + def __init__(self, + num_class=150, + fc_dim=2048, + use_softmax=False, + drop_last_conv=False): + super(C1DeepSup, self).__init__() + self.use_softmax = use_softmax + self.drop_last_conv = drop_last_conv + + self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) + self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1) + + # last conv + self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + + x = self.cbr(conv5) + + if self.drop_last_conv: + return x + else: + x = self.conv_last(x) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + return x + + # deep sup + conv4 = conv_out[-2] + _ = self.cbr_deepsup(conv4) + _ = self.conv_last_deepsup(_) + + x = nn.functional.log_softmax(x, dim=1) + _ = nn.functional.log_softmax(_, dim=1) + + return (x, _) + + +# last conv +class C1(nn.Module): + + def __init__(self, num_class=150, fc_dim=2048, use_softmax=False): + super(C1, self).__init__() + self.use_softmax = use_softmax + + self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1) + + # last conv + self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0) + + def forward(self, conv_out, segSize=None): + conv5 = conv_out[-1] + x = self.cbr(conv5) + x = self.conv_last(x) + + if self.use_softmax: # is True during inference + x = nn.functional.interpolate( + x, size=segSize, mode='bilinear', align_corners=False) + x = nn.functional.softmax(x, dim=1) + else: + x = nn.functional.log_softmax(x, dim=1) + + return x diff --git a/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py b/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py new file mode 100644 index 00000000..7da9ff07 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py @@ -0,0 +1,183 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import math +import os + +import torch +import torch.nn as nn +from torch.nn import BatchNorm2d + +__all__ = ['ResNet', 'resnet50'] + + +def conv3x3(in_planes, out_planes, stride=1): + '3x3 convolution with padding' + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, 
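The `_nostride_dilate` hook above converts the stride-2 convolutions of layer3/layer4 into dilated ones, so `ResnetDilated` keeps deep features at 1/8 of the input resolution instead of 1/32. A small check of the resulting feature-map sizes, assuming a 256×256 input and randomly initialised weights (`pretrained=False`):

```python
# Feature widths from ResnetDilated(dilate_scale=8): layer1 stays at 1/4,
# and layer2-layer4 are all held at 1/8 of the 256-pixel input.
import torch
from modelscope.models.cv.image_inpainting.modules.ade20k import resnet
from modelscope.models.cv.image_inpainting.modules.ade20k.base import ResnetDilated

backbone = ResnetDilated(resnet.resnet50(pretrained=False), dilate_scale=8)
feats = backbone(torch.rand(1, 3, 256, 256), return_feature_maps=True)
print([f.shape[-1] for f in feats])  # [64, 32, 32, 32]
```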
inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + self.bn2 = BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, block, layers, num_classes=1000): + self.inplanes = 128 + super(ResNet, self).__init__() + self.conv1 = conv3x3(3, 64, stride=2) + self.bn1 = BatchNorm2d(64) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = conv3x3(64, 64) + self.bn2 = BatchNorm2d(64) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = conv3x3(64, 128) + self.bn3 = BatchNorm2d(128) + self.relu3 = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def resnet50(pretrained=False, model_dir='', **kwargs): + """Constructs a ResNet-50 model. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + cached_file = os.path.join(model_dir, 'resnet50-imagenet.pth') + model.load_state_dict( + torch.load(cached_file, map_location='cpu'), strict=False) + return model diff --git a/modelscope/models/cv/image_inpainting/modules/adversarial.py b/modelscope/models/cv/image_inpainting/modules/adversarial.py new file mode 100644 index 00000000..b183876b --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/adversarial.py @@ -0,0 +1,167 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BaseAdversarialLoss: + + def pre_generator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + """ + Prepare for generator step + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param generator: + :param discriminator: + :return: None + """ + + def pre_discriminator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + """ + Prepare for discriminator step + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param generator: + :param discriminator: + :return: None + """ + + def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask: Optional[torch.Tensor] = None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Calculate generator loss + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param discr_real_pred: Tensor, discriminator output for real_batch + :param discr_fake_pred: Tensor, discriminator output for fake_batch + :param mask: Tensor, actual mask, which was at input of generator when making fake_batch + :return: total generator loss along with some values that might be interesting to log + """ + raise NotImplementedError + + def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask: Optional[torch.Tensor] = None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Calculate discriminator loss and call .backward() on it + :param real_batch: Tensor, a batch of real samples + :param fake_batch: Tensor, a batch of samples produced by generator + :param discr_real_pred: Tensor, discriminator output for real_batch + :param discr_fake_pred: Tensor, discriminator output for fake_batch + :param mask: Tensor, actual mask, which was at input of generator when making fake_batch + :return: total discriminator loss along with some values that might be interesting to log + """ + raise NotImplementedError + + def interpolate_mask(self, mask, shape): + assert mask is not None + assert self.allow_scale_mask or shape == mask.shape[-2:] + if shape != mask.shape[-2:] and self.allow_scale_mask: + if self.mask_scale_mode == 'maxpool': + mask = F.adaptive_max_pool2d(mask, shape) + else: + mask = F.interpolate( + mask, size=shape, mode=self.mask_scale_mode) + return mask + + +def make_r1_gp(discr_real_pred, 
real_batch): + if torch.is_grad_enabled(): + grad_real = torch.autograd.grad( + outputs=discr_real_pred.sum(), + inputs=real_batch, + create_graph=True)[0] + grad_penalty = (grad_real.view(grad_real.shape[0], + -1).norm(2, dim=1)**2).mean() + else: + grad_penalty = 0 + real_batch.requires_grad = False + + return grad_penalty + + +class NonSaturatingWithR1(BaseAdversarialLoss): + + def __init__(self, + gp_coef=5, + weight=1, + mask_as_fake_target=False, + allow_scale_mask=False, + mask_scale_mode='nearest', + extra_mask_weight_for_gen=0, + use_unmasked_for_gen=True, + use_unmasked_for_discr=True): + self.gp_coef = gp_coef + self.weight = weight + # use for discr => use for gen; + # otherwise we teach only the discr to pay attention to very small difference + assert use_unmasked_for_gen or (not use_unmasked_for_discr) + # mask as target => use unmasked for discr: + # if we don't care about unmasked regions at all + # then it doesn't matter if the value of mask_as_fake_target is true or false + assert use_unmasked_for_discr or (not mask_as_fake_target) + self.use_unmasked_for_gen = use_unmasked_for_gen + self.use_unmasked_for_discr = use_unmasked_for_discr + self.mask_as_fake_target = mask_as_fake_target + self.allow_scale_mask = allow_scale_mask + self.mask_scale_mode = mask_scale_mode + self.extra_mask_weight_for_gen = extra_mask_weight_for_gen + + def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask=None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + fake_loss = F.softplus(-discr_fake_pred) + if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \ + not self.use_unmasked_for_gen: # == if masked region should be treated differently + mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) + if not self.use_unmasked_for_gen: + fake_loss = fake_loss * mask + else: + pixel_weights = 1 + mask * self.extra_mask_weight_for_gen + fake_loss = fake_loss * pixel_weights + + return fake_loss.mean() * self.weight, dict() + + def pre_discriminator_step(self, real_batch: torch.Tensor, + fake_batch: torch.Tensor, generator: nn.Module, + discriminator: nn.Module): + real_batch.requires_grad = True + + def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor, + discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor, + mask=None) \ + -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + + real_loss = F.softplus(-discr_real_pred) + grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef + fake_loss = F.softplus(discr_fake_pred) + + if not self.use_unmasked_for_discr or self.mask_as_fake_target: + # == if masked region should be treated differently + mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:]) + # use_unmasked_for_discr=False only makes sense for fakes; + # for reals there is no difference beetween two regions + fake_loss = fake_loss * mask + if self.mask_as_fake_target: + fake_loss = fake_loss + (1 + - mask) * F.softplus(-discr_fake_pred) + + sum_discr_loss = real_loss + grad_penalty + fake_loss + metrics = dict( + discr_real_out=discr_real_pred.mean(), + discr_fake_out=discr_fake_pred.mean(), + discr_real_gp=grad_penalty) + return sum_discr_loss.mean(), metrics diff --git a/modelscope/models/cv/image_inpainting/modules/feature_matching.py b/modelscope/models/cv/image_inpainting/modules/feature_matching.py new file mode 100644 index 00000000..c2effb20 --- /dev/null +++ 
b/modelscope/models/cv/image_inpainting/modules/feature_matching.py @@ -0,0 +1,45 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +from typing import List + +import torch +import torch.nn.functional as F + + +def masked_l2_loss(pred, target, mask, weight_known, weight_missing): + per_pixel_l2 = F.mse_loss(pred, target, reduction='none') + pixel_weights = mask * weight_missing + (1 - mask) * weight_known + return (pixel_weights * per_pixel_l2).mean() + + +def masked_l1_loss(pred, target, mask, weight_known, weight_missing): + per_pixel_l1 = F.l1_loss(pred, target, reduction='none') + pixel_weights = mask * weight_missing + (1 - mask) * weight_known + return (pixel_weights * per_pixel_l1).mean() + + +def feature_matching_loss(fake_features: List[torch.Tensor], + target_features: List[torch.Tensor], + mask=None): + if mask is None: + res = torch.stack([ + F.mse_loss(fake_feat, target_feat) + for fake_feat, target_feat in zip(fake_features, target_features) + ]).mean() + else: + res = 0 + norm = 0 + for fake_feat, target_feat in zip(fake_features, target_features): + cur_mask = F.interpolate( + mask, + size=fake_feat.shape[-2:], + mode='bilinear', + align_corners=False) + error_weights = 1 - cur_mask + cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean() + res = res + cur_val + norm += 1 + res = res / norm + return res diff --git a/modelscope/models/cv/image_inpainting/modules/ffc.py b/modelscope/models/cv/image_inpainting/modules/ffc.py new file mode 100644 index 00000000..c74425e3 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/ffc.py @@ -0,0 +1,588 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from kornia.geometry.transform import rotate + + +def get_activation(kind='tanh'): + if kind == 'tanh': + return nn.Tanh() + if kind == 'sigmoid': + return nn.Sigmoid() + if kind is False: + return nn.Identity() + raise ValueError(f'Unknown activation kind {kind}') + + +class SELayer(nn.Module): + + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid()) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + res = x * y.expand_as(x) + return res + + +class FourierUnit(nn.Module): + + def __init__(self, + in_channels, + out_channels, + groups=1, + spatial_scale_factor=None, + spatial_scale_mode='bilinear', + spectral_pos_encoding=False, + use_se=False, + se_kwargs=None, + ffc3d=False, + fft_norm='ortho'): + # bn_layer not used + super(FourierUnit, self).__init__() + self.groups = groups + + self.conv_layer = torch.nn.Conv2d( + in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), + out_channels=out_channels * 2, + kernel_size=1, + stride=1, + padding=0, + groups=self.groups, + bias=False) + self.bn = torch.nn.BatchNorm2d(out_channels * 2) + self.relu = torch.nn.ReLU(inplace=True) + + # squeeze and excitation block + self.use_se = use_se + if use_se: + if se_kwargs is None: + se_kwargs = {} + self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) + + self.spatial_scale_factor = 
spatial_scale_factor + self.spatial_scale_mode = spatial_scale_mode + self.spectral_pos_encoding = spectral_pos_encoding + self.ffc3d = ffc3d + self.fft_norm = fft_norm + + def forward(self, x): + batch = x.shape[0] + + if self.spatial_scale_factor is not None: + orig_size = x.shape[-2:] + x = F.interpolate( + x, + scale_factor=self.spatial_scale_factor, + mode=self.spatial_scale_mode, + align_corners=False) + + # (batch, c, h, w/2+1, 2) + fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) + ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) + ffted = torch.stack((ffted.real, ffted.imag), dim=-1) + ffted = ffted.permute(0, 1, 4, 2, + 3).contiguous() # (batch, c, 2, h, w/2+1) + ffted = ffted.view(( + batch, + -1, + ) + ffted.size()[3:]) + + if self.spectral_pos_encoding: + height, width = ffted.shape[-2:] + coords_vert = torch.linspace(0, 1, + height)[None, None, :, None].expand( + batch, 1, height, width).to(ffted) + coords_hor = torch.linspace(0, 1, + width)[None, None, None, :].expand( + batch, 1, height, width).to(ffted) + ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) + + if self.use_se: + ffted = self.se(ffted) + + ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1) + ffted = self.relu(self.bn(ffted)) + + ffted = ffted.view(( + batch, + -1, + 2, + ) + ffted.size()[2:]).permute( + 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2) + ffted = torch.complex(ffted[..., 0], ffted[..., 1]) + + ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] + output = torch.fft.irfftn( + ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm) + + if self.spatial_scale_factor is not None: + output = F.interpolate( + output, + size=orig_size, + mode=self.spatial_scale_mode, + align_corners=False) + + return output + + +class SpectralTransform(nn.Module): + + def __init__(self, + in_channels, + out_channels, + stride=1, + groups=1, + enable_lfu=True, + **fu_kwargs): + # bn_layer not used + super(SpectralTransform, self).__init__() + self.enable_lfu = enable_lfu + if stride == 2: + self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) + else: + self.downsample = nn.Identity() + + self.stride = stride + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels // 2, + kernel_size=1, + groups=groups, + bias=False), nn.BatchNorm2d(out_channels // 2), + nn.ReLU(inplace=True)) + self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups, + **fu_kwargs) + if self.enable_lfu: + self.lfu = FourierUnit(out_channels // 2, out_channels // 2, + groups) + self.conv2 = torch.nn.Conv2d( + out_channels // 2, + out_channels, + kernel_size=1, + groups=groups, + bias=False) + + def forward(self, x): + + x = self.downsample(x) + x = self.conv1(x) + output = self.fu(x) + + if self.enable_lfu: + n, c, h, w = x.shape + split_no = 2 + split_s = h // split_no + xs = torch.cat( + torch.split(x[:, :c // 4], split_s, dim=-2), + dim=1).contiguous() + xs = torch.cat( + torch.split(xs, split_s, dim=-1), dim=1).contiguous() + xs = self.lfu(xs) + xs = xs.repeat(1, 1, split_no, split_no).contiguous() + else: + xs = 0 + + output = self.conv2(x + output + xs) + + return output + + +class LearnableSpatialTransformWrapper(nn.Module): + + def __init__(self, + impl, + pad_coef=0.5, + angle_init_range=80, + train_angle=True): + super().__init__() + self.impl = impl + self.angle = torch.rand(1) * angle_init_range + if train_angle: + self.angle = nn.Parameter(self.angle, requires_grad=True) + self.pad_coef = pad_coef + + def forward(self, x): + if torch.is_tensor(x): + return 
self.inverse_transform(self.impl(self.transform(x)), x) + elif isinstance(x, tuple): + x_trans = tuple(self.transform(elem) for elem in x) + y_trans = self.impl(x_trans) + return tuple( + self.inverse_transform(elem, orig_x) + for elem, orig_x in zip(y_trans, x)) + else: + raise ValueError(f'Unexpected input type {type(x)}') + + def transform(self, x): + height, width = x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) + x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect') + x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded)) + return x_padded_rotated + + def inverse_transform(self, y_padded_rotated, orig_x): + height, width = orig_x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) + + y_padded = rotate( + y_padded_rotated, angle=-self.angle.to(y_padded_rotated)) + y_height, y_width = y_padded.shape[2:] + y = y_padded[:, :, pad_h:y_height - pad_h, pad_w:y_width - pad_w] + return y + + +class FFC(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + enable_lfu=True, + padding_type='reflect', + gated=False, + **spectral_kwargs): + super(FFC, self).__init__() + + assert stride == 1 or stride == 2, 'Stride should be 1 or 2.' + self.stride = stride + + in_cg = int(in_channels * ratio_gin) + in_cl = in_channels - in_cg + out_cg = int(out_channels * ratio_gout) + out_cl = out_channels - out_cg + + self.ratio_gin = ratio_gin + self.ratio_gout = ratio_gout + self.global_in_num = in_cg + + module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d + self.convl2l = module( + in_cl, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d + self.convl2g = module( + in_cl, + out_cg, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d + self.convg2l = module( + in_cg, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type) + module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform + self.convg2g = module(in_cg, out_cg, stride, + 1 if groups == 1 else groups // 2, enable_lfu, + **spectral_kwargs) + + self.gated = gated + module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d + self.gate = module(in_channels, 2, 1) + + def forward(self, x): + x_l, x_g = x if type(x) is tuple else (x, 0) + out_xl, out_xg = 0, 0 + + if self.gated: + total_input_parts = [x_l] + if torch.is_tensor(x_g): + total_input_parts.append(x_g) + total_input = torch.cat(total_input_parts, dim=1) + + gates = torch.sigmoid(self.gate(total_input)) + g2l_gate, l2g_gate = gates.chunk(2, dim=1) + else: + g2l_gate, l2g_gate = 1, 1 + + if self.ratio_gout != 1: + out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate + if self.ratio_gout != 0: + out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g) + + return out_xl, out_xg + + +class FFC_BN_ACT(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + norm_layer=nn.BatchNorm2d, + activation_layer=nn.Identity, + padding_type='reflect', + enable_lfu=True, + **kwargs): + super(FFC_BN_ACT, self).__init__() + self.ffc = FFC( + in_channels, + 
out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride, + padding, + dilation, + groups, + bias, + enable_lfu, + padding_type=padding_type, + **kwargs) + lnorm = nn.Identity if ratio_gout == 1 else norm_layer + gnorm = nn.Identity if ratio_gout == 0 else norm_layer + global_channels = int(out_channels * ratio_gout) + self.bn_l = lnorm(out_channels - global_channels) + self.bn_g = gnorm(global_channels) + + lact = nn.Identity if ratio_gout == 1 else activation_layer + gact = nn.Identity if ratio_gout == 0 else activation_layer + self.act_l = lact(inplace=True) + self.act_g = gact(inplace=True) + + def forward(self, x): + x_l, x_g = self.ffc(x) + x_l = self.act_l(self.bn_l(x_l)) + x_g = self.act_g(self.bn_g(x_g)) + return x_l, x_g + + +class FFCResnetBlock(nn.Module): + + def __init__(self, + dim, + padding_type, + norm_layer, + activation_layer=nn.ReLU, + dilation=1, + spatial_transform_kwargs=None, + inline=False, + **conv_kwargs): + super().__init__() + self.conv1 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs) + self.conv2 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs) + if spatial_transform_kwargs is not None: + self.conv1 = LearnableSpatialTransformWrapper( + self.conv1, **spatial_transform_kwargs) + self.conv2 = LearnableSpatialTransformWrapper( + self.conv2, **spatial_transform_kwargs) + self.inline = inline + + def forward(self, x): + if self.inline: + x_l, x_g = x[:, :-self.conv1.ffc. + global_in_num], x[:, -self.conv1.ffc.global_in_num:] + else: + x_l, x_g = x if type(x) is tuple else (x, 0) + + id_l, id_g = x_l, x_g + + x_l, x_g = self.conv1((x_l, x_g)) + x_l, x_g = self.conv2((x_l, x_g)) + + x_l, x_g = id_l + x_l, id_g + x_g + out = x_l, x_g + if self.inline: + out = torch.cat(out, dim=1) + return out + + +class ConcatTupleLayer(nn.Module): + + def forward(self, x): + assert isinstance(x, tuple) + x_l, x_g = x + assert torch.is_tensor(x_l) or torch.is_tensor(x_g) + if not torch.is_tensor(x_g): + return x_l + return torch.cat(x, dim=1) + + +class FFCResNetGenerator(nn.Module): + + def __init__(self, + input_nc=4, + output_nc=3, + ngf=64, + n_downsampling=3, + n_blocks=18, + norm_layer=nn.BatchNorm2d, + padding_type='reflect', + activation_layer=nn.ReLU, + up_norm_layer=nn.BatchNorm2d, + up_activation=nn.ReLU(True), + init_conv_kwargs={ + 'ratio_gin': 0, + 'ratio_gout': 0, + 'enable_lfu': False + }, + downsample_conv_kwargs={ + 'ratio_gin': 0, + 'ratio_gout': 0, + 'enable_lfu': False + }, + resnet_conv_kwargs={ + 'ratio_gin': 0.75, + 'ratio_gout': 0.75, + 'enable_lfu': False + }, + spatial_transform_layers=None, + spatial_transform_kwargs={}, + add_out_act='sigmoid', + max_features=1024, + out_ffc=False, + out_ffc_kwargs={}): + assert (n_blocks >= 0) + super().__init__() + + model = [ + nn.ReflectionPad2d(3), + FFC_BN_ACT( + input_nc, + ngf, + kernel_size=7, + padding=0, + norm_layer=norm_layer, + activation_layer=activation_layer, + **init_conv_kwargs) + ] + + # downsample + for i in range(n_downsampling): + mult = 2**i + if i == n_downsampling - 1: + cur_conv_kwargs = dict(downsample_conv_kwargs) + cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get( + 'ratio_gin', 0) + else: + cur_conv_kwargs = downsample_conv_kwargs + model += [ + FFC_BN_ACT( + min(max_features, ngf * mult), + 
min(max_features, ngf * mult * 2), + kernel_size=3, + stride=2, + padding=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + **cur_conv_kwargs) + ] + + mult = 2**n_downsampling + feats_num_bottleneck = min(max_features, ngf * mult) + + # resnet blocks + for i in range(n_blocks): + cur_resblock = FFCResnetBlock( + feats_num_bottleneck, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + **resnet_conv_kwargs) + if spatial_transform_layers is not None and i in spatial_transform_layers: + cur_resblock = LearnableSpatialTransformWrapper( + cur_resblock, **spatial_transform_kwargs) + model += [cur_resblock] + + model += [ConcatTupleLayer()] + + # upsample + for i in range(n_downsampling): + mult = 2**(n_downsampling - i) + model += [ + nn.ConvTranspose2d( + min(max_features, ngf * mult), + min(max_features, int(ngf * mult / 2)), + kernel_size=3, + stride=2, + padding=1, + output_padding=1), + up_norm_layer(min(max_features, int(ngf * mult / 2))), + up_activation + ] + + if out_ffc: + model += [ + FFCResnetBlock( + ngf, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + inline=True, + **out_ffc_kwargs) + ] + + model += [ + nn.ReflectionPad2d(3), + nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0) + ] + if add_out_act: + model.append( + get_activation('tanh' if add_out_act is True else add_out_act)) + self.model = nn.Sequential(*model) + + def forward(self, input): + return self.model(input) diff --git a/modelscope/models/cv/image_inpainting/modules/inception.py b/modelscope/models/cv/image_inpainting/modules/inception.py new file mode 100644 index 00000000..5070533d --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/inception.py @@ -0,0 +1,324 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import models + +from modelscope.utils.logger import get_logger + +try: + from torchvision.models.utils import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url + +# Inception weights ported to Pytorch from +# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz +FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \ + 'fid_weights/pt_inception-2015-12-05-6726825d.pth' + +LOGGER = get_logger() + + +class InceptionV3(nn.Module): + """Pretrained InceptionV3 network returning feature maps""" + + # Index of default block of inception to return, + # corresponds to output of final average pooling + DEFAULT_BLOCK_INDEX = 3 + + # Maps feature dimensionality to their output blocks indices + BLOCK_INDEX_BY_DIM = { + 64: 0, # First max pooling features + 192: 1, # Second max pooling featurs + 768: 2, # Pre-aux classifier features + 2048: 3 # Final average pooling features + } + + def __init__(self, + output_blocks=[DEFAULT_BLOCK_INDEX], + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True): + """Build pretrained InceptionV3 + + Parameters + ---------- + output_blocks : list of int + Indices of blocks to return features of. 
Possible values are: + - 0: corresponds to output of first max pooling + - 1: corresponds to output of second max pooling + - 2: corresponds to output which is fed to aux classifier + - 3: corresponds to output of final average pooling + resize_input : bool + If true, bilinearly resizes input to width and height 299 before + feeding input to model. As the network without fully connected + layers is fully convolutional, it should be able to handle inputs + of arbitrary size, so resizing might not be strictly needed + normalize_input : bool + If true, scales the input from range (0, 1) to the range the + pretrained Inception network expects, namely (-1, 1) + requires_grad : bool + If true, parameters of the model require gradients. Possibly useful + for finetuning the network + use_fid_inception : bool + If true, uses the pretrained Inception model used in Tensorflow's + FID implementation. If false, uses the pretrained Inception model + available in torchvision. The FID Inception model has different + weights and a slightly different structure from torchvision's + Inception model. If you want to compute FID scores, you are + strongly advised to set this parameter to true to get comparable + results. + """ + super(InceptionV3, self).__init__() + + self.resize_input = resize_input + self.normalize_input = normalize_input + self.output_blocks = sorted(output_blocks) + self.last_needed_block = max(output_blocks) + + assert self.last_needed_block <= 3, \ + 'Last possible output block index is 3' + + self.blocks = nn.ModuleList() + + if use_fid_inception: + inception = fid_inception_v3() + else: + inception = models.inception_v3(pretrained=True) + + # Block 0: input to maxpool1 + block0 = [ + inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3, + inception.Conv2d_2b_3x3, + nn.MaxPool2d(kernel_size=3, stride=2) + ] + self.blocks.append(nn.Sequential(*block0)) + + # Block 1: maxpool1 to maxpool2 + if self.last_needed_block >= 1: + block1 = [ + inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3, + nn.MaxPool2d(kernel_size=3, stride=2) + ] + self.blocks.append(nn.Sequential(*block1)) + + # Block 2: maxpool2 to aux classifier + if self.last_needed_block >= 2: + block2 = [ + inception.Mixed_5b, + inception.Mixed_5c, + inception.Mixed_5d, + inception.Mixed_6a, + inception.Mixed_6b, + inception.Mixed_6c, + inception.Mixed_6d, + inception.Mixed_6e, + ] + self.blocks.append(nn.Sequential(*block2)) + + # Block 3: aux classifier to final avgpool + if self.last_needed_block >= 3: + block3 = [ + inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c, + nn.AdaptiveAvgPool2d(output_size=(1, 1)) + ] + self.blocks.append(nn.Sequential(*block3)) + + for param in self.parameters(): + param.requires_grad = requires_grad + + def forward(self, inp): + """Get Inception feature maps + + Parameters + ---------- + inp : torch.autograd.Variable + Input tensor of shape Bx3xHxW. 
Values are expected to be in + range (0, 1) + + Returns + ------- + List of torch.autograd.Variable, corresponding to the selected output + block, sorted ascending by index + """ + outp = [] + x = inp + + if self.resize_input: + x = F.interpolate( + x, size=(299, 299), mode='bilinear', align_corners=False) + + if self.normalize_input: + x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) + + for idx, block in enumerate(self.blocks): + x = block(x) + if idx in self.output_blocks: + outp.append(x) + + if idx == self.last_needed_block: + break + + return outp + + +def fid_inception_v3(): + """Build pretrained Inception model for FID computation + + The Inception model for FID computation uses a different set of weights + and has a slightly different structure than torchvision's Inception. + + This method first constructs torchvision's Inception and then patches the + necessary parts that are different in the FID Inception model. + """ + LOGGER.info('fid_inception_v3 called') + inception = models.inception_v3( + num_classes=1008, aux_logits=False, pretrained=False) + LOGGER.info('models.inception_v3 done') + inception.Mixed_5b = FIDInceptionA(192, pool_features=32) + inception.Mixed_5c = FIDInceptionA(256, pool_features=64) + inception.Mixed_5d = FIDInceptionA(288, pool_features=64) + inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128) + inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160) + inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160) + inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192) + inception.Mixed_7b = FIDInceptionE_1(1280) + inception.Mixed_7c = FIDInceptionE_2(2048) + + LOGGER.info('fid_inception_v3 patching done') + + state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True) + LOGGER.info('fid_inception_v3 weights downloaded') + + inception.load_state_dict(state_dict) + LOGGER.info('fid_inception_v3 weights loaded into model') + + return inception + + +class FIDInceptionA(models.inception.InceptionA): + """InceptionA block patched for FID computation""" + + def __init__(self, in_channels, pool_features): + super(FIDInceptionA, self).__init__(in_channels, pool_features) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionC(models.inception.InceptionC): + """InceptionC block patched for FID computation""" + + def __init__(self, in_channels, channels_7x7): + super(FIDInceptionC, self).__init__(in_channels, channels_7x7) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = 
F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionE_1(models.inception.InceptionE): + """First InceptionE block patched for FID computation""" + + def __init__(self, in_channels): + super(FIDInceptionE_1, self).__init__(in_channels) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + # Patch: Tensorflow's average pool does not use the padded zero's in + # its average calculation + branch_pool = F.avg_pool2d( + x, kernel_size=3, stride=1, padding=1, count_include_pad=False) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) + + +class FIDInceptionE_2(models.inception.InceptionE): + """Second InceptionE block patched for FID computation""" + + def __init__(self, in_channels): + super(FIDInceptionE_2, self).__init__(in_channels) + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + # Patch: The FID Inception model uses max pooling instead of average + # pooling. This is likely an error in this specific Inception + # implementation, as other Inception models use average pooling here + # (which matches the description in the paper). 
+ branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return torch.cat(outputs, 1) diff --git a/modelscope/models/cv/image_inpainting/modules/perceptual.py b/modelscope/models/cv/image_inpainting/modules/perceptual.py new file mode 100644 index 00000000..80fe2b96 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/perceptual.py @@ -0,0 +1,47 @@ +""" +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision + +from .ade20k import ModelBuilder + +IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None] +IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None] + + +class ResNetPL(nn.Module): + + def __init__(self, + weight=1, + weights_path=None, + arch_encoder='resnet50dilated', + segmentation=True): + super().__init__() + self.impl = ModelBuilder.get_encoder( + weights_path=weights_path, + arch_encoder=arch_encoder, + arch_decoder='ppm_deepsup', + fc_dim=2048, + segmentation=segmentation) + self.impl.eval() + for w in self.impl.parameters(): + w.requires_grad_(False) + + self.weight = weight + + def forward(self, pred, target): + pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred) + target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target) + + pred_feats = self.impl(pred, return_feature_maps=True) + target_feats = self.impl(target, return_feature_maps=True) + + result = torch.stack([ + F.mse_loss(cur_pred, cur_target) + for cur_pred, cur_target in zip(pred_feats, target_feats) + ]).sum() * self.weight + return result diff --git a/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py b/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py new file mode 100644 index 00000000..32e18f3e --- /dev/null +++ b/modelscope/models/cv/image_inpainting/modules/pix2pixhd.py @@ -0,0 +1,75 @@ +""" +The implementation is adopted from +https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py +""" +import collections +import functools +import logging +from collections import defaultdict +from functools import partial + +import numpy as np +import torch.nn as nn + + +# Defines the PatchGAN discriminator with the specified arguments. 
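+# Note: forward() returns a tuple (patch_logits, intermediate_activations); the
+# intermediate activations are typically consumed by the feature-matching loss
+# (see feature_matching.py in this package). Rough usage sketch, shapes illustrative:
+#
+#   disc = NLayerDiscriminator(input_nc=3)
+#   logits, feats = disc(torch.randn(1, 3, 256, 256))
+#   # logits: (1, 1, h', w') patch map; feats: 5 intermediate feature tensors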
+class NLayerDiscriminator(nn.Module): + + def __init__( + self, + input_nc=3, + ndf=64, + n_layers=4, + norm_layer=nn.BatchNorm2d, + ): + super().__init__() + self.n_layers = n_layers + + kw = 4 + padw = int(np.ceil((kw - 1.0) / 2)) + sequence = [[ + nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), + nn.LeakyReLU(0.2, True) + ]] + + nf = ndf + for n in range(1, n_layers): + nf_prev = nf + nf = min(nf * 2, 512) + + cur_model = [] + cur_model += [ + nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw), + norm_layer(nf), + nn.LeakyReLU(0.2, True) + ] + sequence.append(cur_model) + + nf_prev = nf + nf = min(nf * 2, 512) + + cur_model = [] + cur_model += [ + nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw), + norm_layer(nf), + nn.LeakyReLU(0.2, True) + ] + sequence.append(cur_model) + + sequence += [[ + nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw) + ]] + + for n in range(len(sequence)): + setattr(self, 'model' + str(n), nn.Sequential(*sequence[n])) + + def get_all_activations(self, x): + res = [x] + for n in range(self.n_layers + 2): + model = getattr(self, 'model' + str(n)) + res.append(model(res[-1])) + return res[1:] + + def forward(self, x): + act = self.get_all_activations(x) + return act[-1], act[:-1] diff --git a/modelscope/models/cv/image_inpainting/refinement.py b/modelscope/models/cv/image_inpainting/refinement.py new file mode 100644 index 00000000..662d8a05 --- /dev/null +++ b/modelscope/models/cv/image_inpainting/refinement.py @@ -0,0 +1,393 @@ +''' +Part of the implementation is borrowed and modified from LaMa, publicly available at +https://github.com/saic-mdal/lama +''' +import cv2 +import numpy as np +import torch +import torch.nn as nn +from kornia.filters import gaussian_blur2d +from kornia.geometry.transform import resize +from kornia.morphology import erosion +from torch.nn import functional as F +from torch.optim import SGD, Adam +from tqdm import tqdm + +from .modules.ffc import FFCResnetBlock + + +def move_to_device(obj, device): + if isinstance(obj, nn.Module): + return obj.to(device) + if torch.is_tensor(obj): + return obj.to(device) + if isinstance(obj, (tuple, list)): + return [move_to_device(el, device) for el in obj] + if isinstance(obj, dict): + return {name: move_to_device(val, device) for name, val in obj.items()} + raise ValueError(f'Unexpected type {type(obj)}') + + +def ceil_modulo(x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + +def pad_tensor_to_modulo(img, mod): + batch_size, channels, height, width = img.shape + out_height = ceil_modulo(height, mod) + out_width = ceil_modulo(width, mod) + return F.pad( + img, + pad=(0, out_width - width, 0, out_height - height), + mode='reflect') + + +def _pyrdown(im: torch.Tensor, downsize: tuple = None): + """downscale the image""" + if downsize is None: + downsize = (im.shape[2] // 2, im.shape[3] // 2) + assert im.shape[ + 1] == 3, 'Expected shape for the input to be (n,3,height,width)' + im = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0)) + im = F.interpolate(im, size=downsize, mode='bilinear', align_corners=False) + return im + + +def _pyrdown_mask(mask: torch.Tensor, + downsize: tuple = None, + eps: float = 1e-8, + blur_mask: bool = True, + round_up: bool = True): + """downscale the mask tensor + + Parameters + ---------- + mask : torch.Tensor + mask of size (B, 1, H, W) + downsize : tuple, optional + size to downscale to. 
If None, image is downscaled to half, by default None + eps : float, optional + threshold value for binarizing the mask, by default 1e-8 + blur_mask : bool, optional + if True, apply gaussian filter before downscaling, by default True + round_up : bool, optional + if True, values above eps are marked 1, else, values below 1-eps are marked 0, by default True + + Returns + ------- + torch.Tensor + downscaled mask + """ + + if downsize is None: + downsize = (mask.shape[2] // 2, mask.shape[3] // 2) + assert mask.shape[ + 1] == 1, 'Expected shape for the input to be (n,1,height,width)' + if blur_mask is True: + mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0)) + mask = F.interpolate( + mask, size=downsize, mode='bilinear', align_corners=False) + else: + mask = F.interpolate( + mask, size=downsize, mode='bilinear', align_corners=False) + if round_up: + mask[mask >= eps] = 1 + mask[mask < eps] = 0 + else: + mask[mask >= 1.0 - eps] = 1 + mask[mask < 1.0 - eps] = 0 + return mask + + +def _erode_mask(mask: torch.Tensor, + ekernel: torch.Tensor = None, + eps: float = 1e-8): + """erode the mask, and set gray pixels to 0""" + if ekernel is not None: + mask = erosion(mask, ekernel) + mask[mask >= 1.0 - eps] = 1 + mask[mask < 1.0 - eps] = 0 + return mask + + +def _l1_loss(pred: torch.Tensor, + pred_downscaled: torch.Tensor, + ref: torch.Tensor, + mask: torch.Tensor, + mask_downscaled: torch.Tensor, + image: torch.Tensor, + on_pred: bool = True): + """l1 loss on src pixels, and downscaled predictions if on_pred=True""" + loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8])) + if on_pred: + loss += torch.mean( + torch.abs(pred_downscaled[mask_downscaled >= 1e-8] + - ref[mask_downscaled >= 1e-8])) + return loss + + +def _infer(image: torch.Tensor, + mask: torch.Tensor, + forward_front: nn.Module, + forward_rears: nn.Module, + ref_lower_res: torch.Tensor, + orig_shape: tuple, + devices: list, + scale_ind: int, + n_iters: int = 15, + lr: float = 0.002): + """Performs inference with refinement at a given scale. 
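+    When a coarser-scale reference is available, the front features (z1, z2) are
+    treated as free variables and optimized with Adam so that the downscaled
+    prediction stays consistent with the inpainting from the previous scale.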
+ + Parameters + ---------- + image : torch.Tensor + input image to be inpainted, of size (1,3,H,W) + mask : torch.Tensor + input inpainting mask, of size (1,1,H,W) + forward_front : nn.Module + the front part of the inpainting network + forward_rears : nn.Module + the rear part of the inpainting network + ref_lower_res : torch.Tensor + the inpainting at previous scale, used as reference image + orig_shape : tuple + shape of the original input image before padding + devices : list + list of available devices + scale_ind : int + the scale index + n_iters : int, optional + number of iterations of refinement, by default 15 + lr : float, optional + learning rate, by default 0.002 + + Returns + ------- + torch.Tensor + inpainted image + """ + masked_image = image * (1 - mask) + masked_image = torch.cat([masked_image, mask], dim=1) + + mask = mask.repeat(1, 3, 1, 1) + if ref_lower_res is not None: + ref_lower_res = ref_lower_res.detach() + with torch.no_grad(): + z1, z2 = forward_front(masked_image) + # Inference + mask = mask.to(devices[-1]) + ekernel = torch.from_numpy( + cv2.getStructuringElement(cv2.MORPH_ELLIPSE, + (15, 15)).astype(bool)).float() + ekernel = ekernel.to(devices[-1]) + image = image.to(devices[-1]) + z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0]) + z1.requires_grad, z2.requires_grad = True, True + + optimizer = Adam([z1, z2], lr=lr) + + pbar = tqdm(range(n_iters), leave=False) + for idi in pbar: + optimizer.zero_grad() + input_feat = (z1, z2) + for idd, forward_rear in enumerate(forward_rears): + output_feat = forward_rear(input_feat) + if idd < len(devices) - 1: + midz1, midz2 = output_feat + midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to( + devices[idd + 1]) + input_feat = (midz1, midz2) + else: + pred = output_feat + + if ref_lower_res is None: + break + losses = {} + # scaled loss with downsampler + pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]]) + mask_downscaled = _pyrdown_mask( + mask[:, :1, :orig_shape[0], :orig_shape[1]], + blur_mask=False, + round_up=False) + mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel) + mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1) + losses['ms_l1'] = _l1_loss( + pred, + pred_downscaled, + ref_lower_res, + mask, + mask_downscaled, + image, + on_pred=True) + + loss = sum(losses.values()) + pbar.set_description( + 'Refining scale {} using scale {} ...current loss: {:.4f}'.format( + scale_ind + 1, scale_ind, loss.item())) + if idi < n_iters - 1: + loss.backward() + optimizer.step() + del pred_downscaled + del loss + del pred + # "pred" is the prediction after Plug-n-Play module + inpainted = mask * pred + (1 - mask) * image + inpainted = inpainted.detach().cpu() + return inpainted + + +def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int, + px_budget: int): + """Build the image mask pyramid + + Parameters + ---------- + batch : dict + batch containing image, mask, etc + min_side : int + minimum side length to limit the number of scales of the pyramid + max_scales : int + maximum number of scales allowed + px_budget : int + the product H*W cannot exceed this budget, because of resource constraints + + Returns + ------- + tuple + image-mask pyramid in the form of list of images and list of masks + """ + + assert batch['image'].shape[ + 0] == 1, 'refiner works on only batches of size 1!' 
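+    # Steps below: crop away the padding recorded in 'unpad_to_size', downscale
+    # if H * W exceeds px_budget, then build roughly log2(min(H, W) / min_side)
+    # pyramid levels (capped at max_scales) via repeated _pyrdown / _pyrdown_mask.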
+ + h, w = batch['unpad_to_size'] + h, w = h[0].item(), w[0].item() + + image = batch['image'][..., :h, :w] + mask = batch['mask'][..., :h, :w] + if h * w > px_budget: + # resize + ratio = np.sqrt(px_budget / float(h * w)) + h_orig, w_orig = h, w + h, w = int(h * ratio), int(w * ratio) + print( + f'Original image too large for refinement! Resizing {(h_orig,w_orig)} to {(h,w)}...' + ) + image = resize( + image, (h, w), interpolation='bilinear', align_corners=False) + mask = resize( + mask, (h, w), interpolation='bilinear', align_corners=False) + mask[mask > 1e-8] = 1 + breadth = min(h, w) + n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))), + max_scales) + ls_images = [] + ls_masks = [] + + ls_images.append(image) + ls_masks.append(mask) + + for _ in range(n_scales - 1): + image_p = _pyrdown(ls_images[-1]) + mask_p = _pyrdown_mask(ls_masks[-1]) + ls_images.append(image_p) + ls_masks.append(mask_p) + # reverse the lists because we want the lowest resolution image as index 0 + return ls_images[::-1], ls_masks[::-1] + + +def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str, + modulo: int, n_iters: int, lr: float, min_side: int, + max_scales: int, px_budget: int): + """Refines the inpainting of the network + + Parameters + ---------- + batch : dict + image-mask batch, currently we assume the batchsize to be 1 + inpainter : nn.Module + the inpainting neural network + gpu_ids : str + the GPU ids of the machine to use. If only single GPU, use: "0," + modulo : int + pad the image to ensure dimension % modulo == 0 + n_iters : int + number of iterations of refinement for each scale + lr : float + learning rate + min_side : int + all sides of image on all scales should be >= min_side / sqrt(2) + max_scales : int + max number of downscaling scales for the image-mask pyramid + px_budget : int + pixels budget. 
Any image will be resized to satisfy height*width <= px_budget + + Returns + ------- + torch.Tensor + inpainted image of size (1,3,H,W) + """ + inpainter = inpainter.model + assert not inpainter.training + assert not inpainter.add_noise_kwargs + assert inpainter.concat_mask + + gpu_ids = [ + f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',') + if gpuid.isdigit() + ] + n_resnet_blocks = 0 + first_resblock_ind = 0 + found_first_resblock = False + for idl in range(len(inpainter.generator.model)): + if isinstance(inpainter.generator.model[idl], FFCResnetBlock): + n_resnet_blocks += 1 + found_first_resblock = True + elif not found_first_resblock: + first_resblock_ind += 1 + resblocks_per_gpu = n_resnet_blocks // len(gpu_ids) + + devices = [torch.device(gpu_id) for gpu_id in gpu_ids] + + # split the model into front, and rear parts + forward_front = inpainter.generator.model[0:first_resblock_ind] + forward_front.to(devices[0]) + forward_rears = [] + for idd in range(len(gpu_ids)): + if idd < len(gpu_ids) - 1: + forward_rears.append( + inpainter.generator.model[first_resblock_ind + + resblocks_per_gpu + * (idd):first_resblock_ind + + resblocks_per_gpu * (idd + 1)]) + else: + forward_rears.append( + inpainter.generator.model[first_resblock_ind + + resblocks_per_gpu * (idd):]) + forward_rears[idd].to(devices[idd]) + + ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales, + px_budget) + image_inpainted = None + + for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)): + orig_shape = image.shape[2:] + image = pad_tensor_to_modulo(image, modulo) + mask = pad_tensor_to_modulo(mask, modulo) + mask[mask >= 1e-8] = 1.0 + mask[mask < 1e-8] = 0.0 + image, mask = move_to_device(image, devices[0]), move_to_device( + mask, devices[0]) + if image_inpainted is not None: + image_inpainted = move_to_device(image_inpainted, devices[-1]) + image_inpainted = _infer(image, mask, forward_front, forward_rears, + image_inpainted, orig_shape, devices, ids, + n_iters, lr) + image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]] + # detach everything to save resources + image = image.detach().cpu() + mask = mask.detach().cpu() + + return image_inpainted diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index e2bf5bc1..35c060f0 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from .image_instance_segmentation_coco_dataset import ImageInstanceSegmentationCocoDataset from .movie_scene_segmentation import MovieSceneSegmentationDataset from .video_summarization_dataset import VideoSummarizationDataset + from .image_inpainting import ImageInpaintingDataset from .passage_ranking_dataset import PassageRankingDataset else: @@ -24,6 +25,7 @@ else: ['ImageInstanceSegmentationCocoDataset'], 'video_summarization_dataset': ['VideoSummarizationDataset'], 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], + 'image_inpainting': ['ImageInpaintingDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py b/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py new file mode 100644 index 00000000..732a1bd7 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from .image_inpainting_dataset import ImageInpaintingDataset diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/aug.py b/modelscope/msdatasets/task_datasets/image_inpainting/aug.py new file mode 100644 index 00000000..445bb9b4 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/aug.py @@ -0,0 +1,100 @@ +""" +The implementation is borrowed from LaMa, +publicly available at https://github.com/saic-mdal/lama +""" +import imgaug.augmenters as iaa +from albumentations import DualIAATransform, to_tuple + + +class IAAAffine2(DualIAATransform): + """Place a regular grid of points on the input and randomly move the neighbourhood of these point around + via affine transformations. + + Note: This class introduce interpolation artifacts to mask if it has values other than {0;1} + + Args: + p (float): probability of applying the transform. Default: 0.5. + + Targets: + image, mask + """ + + def __init__( + self, + scale=(0.7, 1.3), + translate_percent=None, + translate_px=None, + rotate=0.0, + shear=(-0.1, 0.1), + order=1, + cval=0, + mode='reflect', + always_apply=False, + p=0.5, + ): + super(IAAAffine2, self).__init__(always_apply, p) + self.scale = dict(x=scale, y=scale) + self.translate_percent = to_tuple(translate_percent, 0) + self.translate_px = to_tuple(translate_px, 0) + self.rotate = to_tuple(rotate) + self.shear = dict(x=shear, y=shear) + self.order = order + self.cval = cval + self.mode = mode + + @property + def processor(self): + return iaa.Affine( + self.scale, + self.translate_percent, + self.translate_px, + self.rotate, + self.shear, + self.order, + self.cval, + self.mode, + ) + + def get_transform_init_args_names(self): + return ('scale', 'translate_percent', 'translate_px', 'rotate', + 'shear', 'order', 'cval', 'mode') + + +class IAAPerspective2(DualIAATransform): + """Perform a random four point perspective transform of the input. + + Note: This class introduce interpolation artifacts to mask if it has values other than {0;1} + + Args: + scale ((float, float): standard deviation of the normal distributions. These are used to sample + the random distances of the subimage's corners from the full image's corners. Default: (0.05, 0.1). + p (float): probability of applying the transform. Default: 0.5. 
+ + Targets: + image, mask + """ + + def __init__(self, + scale=(0.05, 0.1), + keep_size=True, + always_apply=False, + p=0.5, + order=1, + cval=0, + mode='replicate'): + super(IAAPerspective2, self).__init__(always_apply, p) + self.scale = to_tuple(scale, 1.0) + self.keep_size = keep_size + self.cval = cval + self.mode = mode + + @property + def processor(self): + return iaa.PerspectiveTransform( + self.scale, + keep_size=self.keep_size, + mode=self.mode, + cval=self.cval) + + def get_transform_init_args_names(self): + return ('scale', 'keep_size') diff --git a/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py b/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py new file mode 100644 index 00000000..057b8f88 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/image_inpainting/image_inpainting_dataset.py @@ -0,0 +1,337 @@ +""" +Part of the implementation is borrowed and modified from LaMa, +publicly available at https://github.com/saic-mdal/lama +""" +import glob +import os +import os.path as osp +from enum import Enum + +import albumentations as A +import cv2 +import json +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .aug import IAAAffine2, IAAPerspective2 + +LOGGER = get_logger() + + +class LinearRamp: + + def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0): + self.start_value = start_value + self.end_value = end_value + self.start_iter = start_iter + self.end_iter = end_iter + + def __call__(self, i): + if i < self.start_iter: + return self.start_value + if i >= self.end_iter: + return self.end_value + part = (i - self.start_iter) / (self.end_iter - self.start_iter) + return self.start_value * (1 - part) + self.end_value * part + + +class DrawMethod(Enum): + LINE = 'line' + CIRCLE = 'circle' + SQUARE = 'square' + + +def make_random_superres_mask(shape, + min_step=2, + max_step=4, + min_width=1, + max_width=3): + height, width = shape + mask = np.zeros((height, width), np.float32) + step_x = np.random.randint(min_step, max_step + 1) + width_x = np.random.randint(min_width, min(step_x, max_width + 1)) + offset_x = np.random.randint(0, step_x) + + step_y = np.random.randint(min_step, max_step + 1) + width_y = np.random.randint(min_width, min(step_y, max_width + 1)) + offset_y = np.random.randint(0, step_y) + + for dy in range(width_y): + mask[offset_y + dy::step_y] = 1 + for dx in range(width_x): + mask[:, offset_x + dx::step_x] = 1 + return mask[None, ...] 
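+# Illustrative note: each mask helper in this file returns a float32 array of
+# shape (1, H, W) with ones marking the pixels to be inpainted, e.g.
+#
+#   mask = make_random_superres_mask((256, 256))
+#   assert mask.shape == (1, 256, 256) and mask.dtype == np.float32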
+ + +class RandomSuperresMaskGenerator: + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def __call__(self, img, iter_i=None): + return make_random_superres_mask(img.shape[1:], **self.kwargs) + + +def make_random_rectangle_mask(shape, + margin=10, + bbox_min_size=30, + bbox_max_size=100, + min_times=0, + max_times=3): + height, width = shape + mask = np.zeros((height, width), np.float32) + bbox_max_size = min(bbox_max_size, height - margin * 2, width - margin * 2) + times = np.random.randint(min_times, max_times + 1) + for i in range(times): + box_width = np.random.randint(bbox_min_size, bbox_max_size) + box_height = np.random.randint(bbox_min_size, bbox_max_size) + start_x = np.random.randint(margin, width - margin - box_width + 1) + start_y = np.random.randint(margin, height - margin - box_height + 1) + mask[start_y:start_y + box_height, start_x:start_x + box_width] = 1 + return mask[None, ...] + + +class RandomRectangleMaskGenerator: + + def __init__(self, + margin=10, + bbox_min_size=30, + bbox_max_size=100, + min_times=0, + max_times=3, + ramp_kwargs=None): + self.margin = margin + self.bbox_min_size = bbox_min_size + self.bbox_max_size = bbox_max_size + self.min_times = min_times + self.max_times = max_times + self.ramp = LinearRamp( + **ramp_kwargs) if ramp_kwargs is not None else None + + def __call__(self, img, iter_i=None, raw_image=None): + coef = self.ramp(iter_i) if (self.ramp is not None) and ( + iter_i is not None) else 1 + cur_bbox_max_size = int(self.bbox_min_size + 1 + + (self.bbox_max_size - self.bbox_min_size) + * coef) + cur_max_times = int(self.min_times + + (self.max_times - self.min_times) * coef) + return make_random_rectangle_mask( + img.shape[1:], + margin=self.margin, + bbox_min_size=self.bbox_min_size, + bbox_max_size=cur_bbox_max_size, + min_times=self.min_times, + max_times=cur_max_times) + + +def make_random_irregular_mask(shape, + max_angle=4, + max_len=60, + max_width=20, + min_times=0, + max_times=10, + draw_method=DrawMethod.LINE): + draw_method = DrawMethod(draw_method) + + height, width = shape + mask = np.zeros((height, width), np.float32) + times = np.random.randint(min_times, max_times + 1) + for i in range(times): + start_x = np.random.randint(width) + start_y = np.random.randint(height) + for j in range(1 + np.random.randint(5)): + angle = 0.01 + np.random.randint(max_angle) + if i % 2 == 0: + angle = 2 * 3.1415926 - angle + length = 10 + np.random.randint(max_len) + brush_w = 5 + np.random.randint(max_width) + end_x = np.clip( + (start_x + length * np.sin(angle)).astype(np.int32), 0, width) + end_y = np.clip( + (start_y + length * np.cos(angle)).astype(np.int32), 0, height) + if draw_method == DrawMethod.LINE: + cv2.line(mask, (start_x, start_y), (end_x, end_y), 1.0, + brush_w) + elif draw_method == DrawMethod.CIRCLE: + cv2.circle( + mask, (start_x, start_y), + radius=brush_w, + color=1., + thickness=-1) + elif draw_method == DrawMethod.SQUARE: + radius = brush_w // 2 + mask[start_y - radius:start_y + radius, + start_x - radius:start_x + radius] = 1 + start_x, start_y = end_x, end_y + return mask[None, ...] 
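+# Illustrative example (parameter values are arbitrary, not the training defaults):
+# thick random brush strokes drawn as connected line segments.
+#
+#   mask = make_random_irregular_mask((512, 512), max_len=120, max_width=40,
+#                                     max_times=5, draw_method=DrawMethod.LINE)
+#   # -> float32 array of shape (1, 512, 512), with 1s along the drawn strokes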
+ + +class RandomIrregularMaskGenerator: + + def __init__(self, + max_angle=4, + max_len=60, + max_width=20, + min_times=0, + max_times=10, + ramp_kwargs=None, + draw_method=DrawMethod.LINE): + self.max_angle = max_angle + self.max_len = max_len + self.max_width = max_width + self.min_times = min_times + self.max_times = max_times + self.draw_method = draw_method + self.ramp = LinearRamp( + **ramp_kwargs) if ramp_kwargs is not None else None + + def __call__(self, img, iter_i=None, raw_image=None): + coef = self.ramp(iter_i) if (self.ramp is not None) and ( + iter_i is not None) else 1 + cur_max_len = int(max(1, self.max_len * coef)) + cur_max_width = int(max(1, self.max_width * coef)) + cur_max_times = int(self.min_times + 1 + + (self.max_times - self.min_times) * coef) + return make_random_irregular_mask( + img.shape[1:], + max_angle=self.max_angle, + max_len=cur_max_len, + max_width=cur_max_width, + min_times=self.min_times, + max_times=cur_max_times, + draw_method=self.draw_method) + + +class MixedMaskGenerator: + + def __init__(self, + irregular_proba=1 / 3, + irregular_kwargs=None, + box_proba=1 / 3, + box_kwargs=None, + segm_proba=1 / 3, + segm_kwargs=None, + squares_proba=0, + squares_kwargs=None, + superres_proba=0, + superres_kwargs=None, + outpainting_proba=0, + outpainting_kwargs=None, + invert_proba=0): + self.probas = [] + self.gens = [] + + if irregular_proba > 0: + self.probas.append(irregular_proba) + if irregular_kwargs is None: + irregular_kwargs = {} + else: + irregular_kwargs = dict(irregular_kwargs) + irregular_kwargs['draw_method'] = DrawMethod.LINE + self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs)) + + if box_proba > 0: + self.probas.append(box_proba) + if box_kwargs is None: + box_kwargs = {} + self.gens.append(RandomRectangleMaskGenerator(**box_kwargs)) + + if squares_proba > 0: + self.probas.append(squares_proba) + if squares_kwargs is None: + squares_kwargs = {} + else: + squares_kwargs = dict(squares_kwargs) + squares_kwargs['draw_method'] = DrawMethod.SQUARE + self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs)) + + if superres_proba > 0: + self.probas.append(superres_proba) + if superres_kwargs is None: + superres_kwargs = {} + self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs)) + + self.probas = np.array(self.probas, dtype='float32') + self.probas /= self.probas.sum() + self.invert_proba = invert_proba + + def __call__(self, img, iter_i=None, raw_image=None): + kind = np.random.choice(len(self.probas), p=self.probas) + gen = self.gens[kind] + result = gen(img, iter_i=iter_i, raw_image=raw_image) + if self.invert_proba > 0 and random.random() < self.invert_proba: + result = 1 - result + return result + + +def get_transforms(test_mode, out_size): + if not test_mode: + transform = A.Compose([ + IAAPerspective2(scale=(0.0, 0.06)), + IAAAffine2(scale=(0.7, 1.3), rotate=(-40, 40), shear=(-0.1, 0.1)), + A.PadIfNeeded(min_height=out_size, min_width=out_size), + A.OpticalDistortion(), + A.RandomCrop(height=out_size, width=out_size), + A.HorizontalFlip(), + A.CLAHE(), + A.RandomBrightnessContrast( + brightness_limit=0.2, contrast_limit=0.2), + A.HueSaturationValue( + hue_shift_limit=5, sat_shift_limit=30, val_shift_limit=5), + A.ToFloat() + ]) + else: + transform = A.Compose([ + A.PadIfNeeded(min_height=out_size, min_width=out_size), + A.CenterCrop(height=out_size, width=out_size), + A.ToFloat() + ]) + return transform + + +@TASK_DATASETS.register_module( + Tasks.image_inpainting, module_name=Models.image_inpainting) +class 
ImageInpaintingDataset(TorchTaskDataset): + + def __init__(self, **kwargs): + split_config = kwargs['split_config'] + LOGGER.info(kwargs) + mode = kwargs.get('test_mode', False) + + self.data_root = next(iter(split_config.values())) + if not osp.exists(self.data_root): + self.data_root = osp.dirname(self.data_root) + assert osp.exists(self.data_root) + mask_gen_kwargs = kwargs.get('mask_gen_kwargs', {}) + out_size = kwargs.get('out_size', 256) + self.mask_generator = MixedMaskGenerator(**mask_gen_kwargs) + self.transform = get_transforms(mode, out_size) + self.in_files = sorted( + list( + glob.glob( + osp.join(self.data_root, '**', '*.jpg'), recursive=True)) + + list( + glob.glob( + osp.join(self.data_root, '**', '*.png'), recursive=True))) + self.iter_i = 0 + + def __len__(self): + return len(self.in_files) + + def __getitem__(self, index): + path = self.in_files[index] + img = cv2.imread(path) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = self.transform(image=img)['image'] + img = np.transpose(img, (2, 0, 1)) + # TODO: maybe generate mask before augmentations? slower, but better for segmentation-based masks + mask = self.mask_generator(img, iter_i=self.iter_i) + self.iter_i += 1 + return dict(image=img, mask=mask) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 07a14191..dd59d6fb 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -177,6 +177,7 @@ TASK_OUTPUTS = { Tasks.image_denoising: [OutputKeys.OUTPUT_IMG], Tasks.image_portrait_enhancement: [OutputKeys.OUTPUT_IMG], Tasks.crowd_counting: [OutputKeys.SCORES, OutputKeys.OUTPUT_IMG], + Tasks.image_inpainting: [OutputKeys.OUTPUT_IMG], # image generation task result for a single image # {"output_img": np.array with shape (h, w, 3)} diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index c9a70d14..b18d4465 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -181,6 +181,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_resnet50-bert_video-scene-segmentation_movienet'), Tasks.shop_segmentation: (Pipelines.shop_segmentation, 'damo/cv_vitb16_segmentation_shop-seg'), + Tasks.image_inpainting: (Pipelines.image_inpainting, + 'damo/cv_fft_inpainting_lama'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), Tasks.hand_static: (Pipelines.hand_static, diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 55bad09a..118eaf17 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: from .image_super_resolution_pipeline import ImageSuperResolutionPipeline from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline + from .image_inpainting_pipeline import ImageInpaintingPipeline from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline from .realtime_object_detection_pipeline import RealtimeObjectDetectionPipeline from .live_category_pipeline import LiveCategoryPipeline @@ -99,6 +100,7 @@ else: 'live_category_pipeline': ['LiveCategoryPipeline'], 'image_to_image_generation_pipeline': ['Image2ImageGenerationPipeline'], + 'image_inpainting_pipeline': ['ImageInpaintingPipeline'], 'ocr_detection_pipeline': ['OCRDetectionPipeline'], 'ocr_recognition_pipeline': ['OCRRecognitionPipeline'], 'skin_retouching_pipeline': ['SkinRetouchingPipeline'], diff --git a/modelscope/pipelines/cv/image_inpainting_pipeline.py 
b/modelscope/pipelines/cv/image_inpainting_pipeline.py new file mode 100644 index 00000000..6ae0d63e --- /dev/null +++ b/modelscope/pipelines/cv/image_inpainting_pipeline.py @@ -0,0 +1,146 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import torch +import torch.nn as nn +from torch.utils.data._utils.collate import default_collate + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_inpainting import FFTInpainting +from modelscope.models.cv.image_inpainting.refinement import refine_predict +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.image import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_inpainting, module_name=Pipelines.image_inpainting) +class ImageInpaintingPipeline(Pipeline): + + def __init__(self, + model: str, + pad_out_to_modulo=8, + refine=False, + **kwargs): + """ + model: model id on modelscope hub. + """ + assert isinstance(model, str), 'model must be a single str' + super().__init__(model=model, auto_collate=False, **kwargs) + self.refine = refine + logger.info(f'loading model from dir {model}') + self.infer_model = FFTInpainting(model, predict_only=True) + if not self.refine: + self.infer_model.to(self.device) + self.infer_model.eval() + logger.info(f'loading model done, refinement is set to {self.refine}') + self.pad_out_to_modulo = pad_out_to_modulo + + def move_to_device(self, obj, device): + if isinstance(obj, nn.Module): + return obj.to(device) + if torch.is_tensor(obj): + return obj.to(device) + if isinstance(obj, (tuple, list)): + return [self.move_to_device(el, device) for el in obj] + if isinstance(obj, dict): + return { + name: self.move_to_device(val, device) + for name, val in obj.items() + } + raise ValueError(f'Unexpected type {type(obj)}') + + def transforms(self, img): + if img.ndim == 3: + img = np.transpose(img, (2, 0, 1)) + out_img = img.astype('float32') / 255 + return out_img + + def ceil_modulo(self, x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + def pad_img_to_modulo(self, img, mod): + channels, height, width = img.shape + out_height = self.ceil_modulo(height, mod) + out_width = self.ceil_modulo(width, mod) + return np.pad( + img, ((0, 0), (0, out_height - height), (0, out_width - width)), + mode='symmetric') + + def preprocess(self, input: Input) -> Dict[str, Any]: + if isinstance(input, str): + image_name, mask_name = input.split('+') + img = LoadImage.convert_to_ndarray(image_name) + img = self.transforms(img) + mask = np.array(LoadImage(mode='L')(mask_name)['img']) + mask = self.transforms(mask) + elif isinstance(input, PIL.Image.Image): + img = input.crop((0, 0, int(input.width / 2), input.height)) + img = self.transforms(np.array(img)) + mask = input.crop((int(input.width / 2), 0, input.width, + input.height)).convert('L') + mask = self.transforms(np.array(mask)) + else: + raise TypeError('input should be either str or PIL.Image') + result = dict(image=img, mask=mask[None, ...]) + + if self.pad_out_to_modulo is not None and self.pad_out_to_modulo > 1: + result['unpad_to_size'] = result['image'].shape[1:] + result['image'] = self.pad_img_to_modulo(result['image'], + self.pad_out_to_modulo) + result['mask'] = self.pad_img_to_modulo(result['mask'], + 
self.pad_out_to_modulo) + + # Since Pipeline use default torch.no_grad() for performing forward func. + # We conduct inference here in case of doing training for refinement. + result = self.perform_inference(result) + return result + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + return {OutputKeys.OUTPUT_IMG: input} + + def perform_inference(self, data): + batch = default_collate([data]) + if self.refine: + assert 'unpad_to_size' in batch, 'Unpadded size is required for the refinement' + assert 'cuda' in str(self.device), 'GPU is required for refinement' + gpu_ids = str(self.device).split(':')[-1] + cur_res = refine_predict( + batch, + self.infer_model, + gpu_ids=gpu_ids, + modulo=self.pad_out_to_modulo, + n_iters=15, + lr=0.002, + min_side=512, + max_scales=3, + px_budget=900000) + cur_res = cur_res[0].permute(1, 2, 0).detach().cpu().numpy() + else: + with torch.no_grad(): + batch = self.move_to_device(batch, self.device) + batch['mask'] = (batch['mask'] > 0) * 1 + batch = self.infer_model(batch) + cur_res = batch['inpainted'][0].permute( + 1, 2, 0).detach().cpu().numpy() + unpad_to_size = batch.get('unpad_to_size', None) + if unpad_to_size is not None: + orig_height, orig_width = unpad_to_size + cur_res = cur_res[:orig_height, :orig_width] + + cur_res = np.clip(cur_res * 255, 0, 255).astype('uint8') + cur_res = cv2.cvtColor(cur_res, cv2.COLOR_RGB2BGR) + return cur_res + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/trainers/__init__.py b/modelscope/trainers/__init__.py index a632642a..86917261 100644 --- a/modelscope/trainers/__init__.py +++ b/modelscope/trainers/__init__.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from .builder import build_trainer from .cv import (ImageInstanceSegmentationTrainer, ImagePortraitEnhancementTrainer, - MovieSceneSegmentationTrainer) + MovieSceneSegmentationTrainer, ImageInpaintingTrainer) from .multi_modal import CLIPTrainer from .nlp import SequenceClassificationTrainer, PassageRankingTrainer from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer @@ -22,7 +22,8 @@ else: 'builder': ['build_trainer'], 'cv': [ 'ImageInstanceSegmentationTrainer', - 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer' + 'ImagePortraitEnhancementTrainer', 'MovieSceneSegmentationTrainer', + 'ImageInpaintingTrainer' ], 'multi_modal': ['CLIPTrainer'], 'nlp': ['SequenceClassificationTrainer', 'PassageRankingTrainer'], diff --git a/modelscope/trainers/cv/__init__.py b/modelscope/trainers/cv/__init__.py index 4c65870e..d09fd75c 100644 --- a/modelscope/trainers/cv/__init__.py +++ b/modelscope/trainers/cv/__init__.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: ImageInstanceSegmentationTrainer from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer + from .image_inpainting_trainer import ImageInpaintingTrainer else: _import_structure = { @@ -15,7 +16,8 @@ else: ['ImageInstanceSegmentationTrainer'], 'image_portrait_enhancement_trainer': ['ImagePortraitEnhancementTrainer'], - 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'] + 'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'], + 'image_inpainting_trainer': ['ImageInpaintingTrainer'] } import sys diff --git a/modelscope/trainers/cv/image_inpainting_trainer.py b/modelscope/trainers/cv/image_inpainting_trainer.py new file mode 100644 index 00000000..74d1ed9f --- /dev/null +++ 
b/modelscope/trainers/cv/image_inpainting_trainer.py @@ -0,0 +1,111 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import time +from collections.abc import Mapping + +from torch import distributed as dist + +from modelscope.metainfo import Trainers +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.trainer import EpochBasedTrainer +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, ConfigFields, + ConfigKeys, Hubs, ModeKeys, ModelFile, + Tasks, TrainerStages) +from modelscope.utils.data_utils import to_device +from modelscope.utils.file_utils import func_receive_dict_inputs + + +@TRAINERS.register_module(module_name=Trainers.image_inpainting) +class ImageInpaintingTrainer(EpochBasedTrainer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def train(self, *args, **kwargs): + super().train(*args, **kwargs) + + def evaluate(self, *args, **kwargs): + metric_values = super().evaluate(*args, **kwargs) + return metric_values + + def prediction_step(self, model, inputs): + pass + + def train_loop(self, data_loader): + """ Training loop used by `EpochBasedTrainer.train()` + """ + self.invoke_hook(TrainerStages.before_run) + self._epoch = 0 + self.model.train() + for _ in range(self._epoch, self._max_epochs): + self.invoke_hook(TrainerStages.before_train_epoch) + for i, data_batch in enumerate(data_loader): + data_batch = to_device(data_batch, self.device) + self.data_batch = data_batch + self._inner_iter = i + for idx in range(2): + self.invoke_hook(TrainerStages.before_train_iter) + self.train_step(self.model, data_batch, idx) + self.invoke_hook(TrainerStages.after_train_iter) + del self.data_batch + self._iter += 1 + self._mode = ModeKeys.TRAIN + + if i + 1 >= self.iters_per_epoch: + break + + self.invoke_hook(TrainerStages.after_train_epoch) + self._epoch += 1 + + self.invoke_hook(TrainerStages.after_run) + + def train_step(self, model, inputs, idx): + """ Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`TorchModel`): The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `torch.Tensor`: The tensor with training loss on this batch. 
+ """ + # EvaluationHook will do evaluate and change mode to val, return to train mode + # TODO: find more pretty way to change mode + model.train() + self._mode = ModeKeys.TRAIN + # call model forward but not __call__ to skip postprocess + if isinstance(inputs, + Mapping) and not func_receive_dict_inputs(model.forward): + train_outputs = model.model._do_step(**inputs, optimizer_idx=idx) + else: + train_outputs = model.model._do_step(inputs, optimizer_idx=idx) + + if not isinstance(train_outputs, dict): + raise TypeError('"model.forward()" must return a dict') + + # add model output info to log + if 'log_vars' not in train_outputs: + default_keys_pattern = ['loss'] + match_keys = set([]) + for key_p in default_keys_pattern: + match_keys.update( + [key for key in train_outputs.keys() if key_p in key]) + + log_vars = {} + for key in match_keys: + value = train_outputs.get(key, None) + if value is not None: + if dist.is_available() and dist.is_initialized(): + value = value.data.clone() + dist.all_reduce(value.div_(dist.get_world_size())) + log_vars.update({key: value.item()}) + self.log_buffer.update(log_vars) + else: + self.log_buffer.update(train_outputs['log_vars']) + + self.train_outputs = train_outputs diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2331dc85..2a5ac694 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -47,6 +47,8 @@ class CVTasks(object): face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' + crowd_counting = 'crowd-counting' + # image editing skin_retouching = 'skin-retouching' image_super_resolution = 'image-super-resolution' @@ -54,6 +56,7 @@ class CVTasks(object): image_color_enhancement = 'image-color-enhancement' image_denoising = 'image-denoising' image_portrait_enhancement = 'image-portrait-enhancement' + image_inpainting = 'image-inpainting' # image generation image_to_image_translation = 'image-to-image-translation' @@ -72,7 +75,6 @@ class CVTasks(object): video_category = 'video-category' video_embedding = 'video-embedding' virtual_try_on = 'virtual-try-on' - crowd_counting = 'crowd-counting' movie_scene_segmentation = 'movie-scene-segmentation' # video editing diff --git a/requirements/cv.txt b/requirements/cv.txt index f907256d..e6ffb5ff 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -7,6 +7,8 @@ ffmpeg-python>=0.2.0 ftfy imageio>=2.9.0 imageio-ffmpeg>=0.4.2 +imgaug>=0.4.0 +kornia>=0.5.0 lmdb lpips ml_collections diff --git a/tests/pipelines/test_image_inpainting.py b/tests/pipelines/test_image_inpainting.py new file mode 100644 index 00000000..b89ce399 --- /dev/null +++ b/tests/pipelines/test_image_inpainting.py @@ -0,0 +1,77 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import unittest + +import cv2 +import torch +from PIL import Image + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageInpaintingTest(unittest.TestCase): + + def setUp(self) -> None: + self.input_location = 'data/test/images/image_inpainting/image_inpainting.png' + self.input_mask_location = 'data/test/images/image_inpainting/image_inpainting_mask.png' + self.model_id = 'damo/cv_fft_inpainting_lama' + + def save_result(self, result): + vis_img = result[OutputKeys.OUTPUT_IMG] + cv2.imwrite('result.png', vis_img) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_inpainting(self): + inpainting = pipeline(Tasks.image_inpainting, model=self.model_id) + result = inpainting(self.input_location + '+' + + self.input_mask_location) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + @unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') + def test_inpainting_with_refinement(self): + # if input image is HR, set refine=True is more better + inpainting = pipeline( + Tasks.image_inpainting, model=self.model_id, refine=True) + result = inpainting(self.input_location + '+' + + self.input_mask_location) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_inpainting_with_image(self): + inpainting = pipeline(Tasks.image_inpainting, model=self.model_id) + img = Image.open(self.input_location).convert('RGB') + mask = Image.open(self.input_mask_location).convert('RGB') + img_new = Image.new('RGB', (img.width + mask.width, img.height)) + img_new.paste(img, (0, 0)) + img_new.paste(mask, (img.width, 0)) + result = inpainting(img_new) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_inpainting_with_default_task(self): + inpainting = pipeline(Tasks.image_inpainting) + result = inpainting(self.input_location + '+' + + self.input_mask_location) + if result: + self.save_result(result) + else: + raise ValueError('process error') + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/run_config.yaml b/tests/run_config.yaml index 4c571b7f..b4149dc9 100644 --- a/tests/run_config.yaml +++ b/tests/run_config.yaml @@ -10,6 +10,7 @@ isolated: # test cases that may require excessive anmount of GPU memory, which - test_easycv_trainer.py - test_segformer.py - test_segmentation_pipeline.py + - test_image_inpainting.py envs: default: # default env, case not in other env will in default, pytorch. diff --git a/tests/trainers/test_image_inpainting_trainer.py b/tests/trainers/test_image_inpainting_trainer.py new file mode 100644 index 00000000..807fe64f --- /dev/null +++ b/tests/trainers/test_image_inpainting_trainer.py @@ -0,0 +1,84 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
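# Editorial sketch (hypothetical values, single process, no torch.distributed):
# how ImageInpaintingTrainer.train_step added earlier in this patch folds every
# output key containing 'loss' into log_vars when the model does not return an
# explicit 'log_vars' entry. In distributed runs the real code additionally
# all_reduces each value and divides by the world size before calling .item().
import torch

train_outputs = {                      # hypothetical _do_step() output
    'loss': torch.tensor(0.731),
    'gen_loss': torch.tensor(0.512),
    'discr_loss': torch.tensor(0.219),
    'psnr': 24.9,                      # not a loss key, so it is not logged here
}
match_keys = {k for k in train_outputs if 'loss' in k}
log_vars = {k: train_outputs[k].item() for k in match_keys}
print(sorted(log_vars))                # ['discr_loss', 'gen_loss', 'loss']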
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.models.cv.image_inpainting import FFTInpainting +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config, ConfigDict +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class ImageInpaintingTrainerTest(unittest.TestCase): + + def setUp(self): + print(('Testing %s.%s' % (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + self.model_id = 'damo/cv_fft_inpainting_lama' + self.cache_path = snapshot_download(self.model_id) + cfg = Config.from_file( + os.path.join(self.cache_path, ModelFile.CONFIGURATION)) + + train_data_cfg = ConfigDict( + name='PlacesToydataset', + split='train', + mask_gen_kwargs=cfg.dataset.mask_gen_kwargs, + out_size=cfg.dataset.train_out_size, + test_mode=False) + + test_data_cfg = ConfigDict( + name='PlacesToydataset', + split='test', + mask_gen_kwargs=cfg.dataset.mask_gen_kwargs, + out_size=cfg.dataset.val_out_size, + test_mode=True) + + self.train_dataset = MsDataset.load( + dataset_name=train_data_cfg.name, + split=train_data_cfg.split, + mask_gen_kwargs=train_data_cfg.mask_gen_kwargs, + out_size=train_data_cfg.out_size, + test_mode=train_data_cfg.test_mode) + assert next( + iter(self.train_dataset.config_kwargs['split_config'].values())) + + self.test_dataset = MsDataset.load( + dataset_name=test_data_cfg.name, + split=test_data_cfg.split, + mask_gen_kwargs=test_data_cfg.mask_gen_kwargs, + out_size=test_data_cfg.out_size, + test_mode=test_data_cfg.test_mode) + assert next( + iter(self.test_dataset.config_kwargs['split_config'].values())) + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_trainer(self): + kwargs = dict( + model=self.model_id, + train_dataset=self.train_dataset, + eval_dataset=self.test_dataset) + + trainer = build_trainer( + name=Trainers.image_inpainting, default_args=kwargs) + trainer.train() + results_files = os.listdir(trainer.work_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + + +if __name__ == '__main__': + unittest.main() From 2bfdbbc9d0d77372ccfeb85745c2bcf8c736b534 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Tue, 11 Oct 2022 22:23:36 +0800 Subject: [PATCH 12/57] [to #42322933]update fer to satisfy demo service requirements Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10357094 --- .../models/cv/face_detection/mogface/models/detectors.py | 2 ++ .../pipelines/cv/facial_expression_recognition_pipeline.py | 5 ++++- modelscope/utils/cv/image_utils.py | 7 +------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/modelscope/models/cv/face_detection/mogface/models/detectors.py b/modelscope/models/cv/face_detection/mogface/models/detectors.py index 5ae67104..8c1d9150 100644 --- a/modelscope/models/cv/face_detection/mogface/models/detectors.py +++ b/modelscope/models/cv/face_detection/mogface/models/detectors.py @@ -1,3 +1,5 @@ +# The implementation is based on MogFace, available at +# https://github.com/damo-cv/MogFace import os import cv2 diff --git 
a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index 1b1f13d1..b598a457 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -45,6 +45,9 @@ class FacialExpressionRecognitionPipeline(Pipeline): # face detect pipeline det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + self.map_list = [ + 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' + ] self.face_detection = pipeline( Tasks.face_detection, model=det_model_id) @@ -122,7 +125,7 @@ class FacialExpressionRecognitionPipeline(Pipeline): labels = result[1].tolist() return { OutputKeys.SCORES: scores, - OutputKeys.LABELS: labels, + OutputKeys.LABELS: self.map_list[labels] } def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 98ba533e..ad0d6c8e 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -113,12 +113,7 @@ def draw_face_detection_no_lm_result(img_path, detection_result): def draw_facial_expression_result(img_path, facial_expression_result): - label_idx = facial_expression_result[OutputKeys.LABELS] - map_list = [ - 'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral' - ] - label = map_list[label_idx] - + label = facial_expression_result[OutputKeys.LABELS] img = cv2.imread(img_path) assert img is not None, f"Can't read img: {img_path}" cv2.putText( From 0d97f8959d2095ecfd4b43bb4eb607534474a44f Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Tue, 11 Oct 2022 22:24:19 +0800 Subject: [PATCH 13/57] [to #42322933] test: unify kws pipeline input type to AUDIO Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10362437 --- .../test_key_word_spotting_farfield.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tests/pipelines/test_key_word_spotting_farfield.py b/tests/pipelines/test_key_word_spotting_farfield.py index f8c167de..bf61c9e7 100644 --- a/tests/pipelines/test_key_word_spotting_farfield.py +++ b/tests/pipelines/test_key_word_spotting_farfield.py @@ -22,18 +22,14 @@ class KWSFarfieldTest(unittest.TestCase): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_normal(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = {'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE)} - result = kws(inputs) + result = kws(os.path.join(os.getcwd(), TEST_SPEECH_FILE)) self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_mono(self): kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = { - 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO) - } - result = kws(inputs) + result = kws(os.path.join(os.getcwd(), TEST_SPEECH_FILE_MONO)) self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) @@ -44,17 +40,6 @@ class KWSFarfieldTest(unittest.TestCase): self.assertEqual(len(result['kws_list']), 5) print(result['kws_list'][-1]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') - def test_output(self): - kws = pipeline(Tasks.keyword_spotting, model=self.model_id) - inputs = { - 'input_file': os.path.join(os.getcwd(), TEST_SPEECH_FILE), - 'output_file': 'output.wav' - } - result = kws(inputs) - 
self.assertEqual(len(result['kws_list']), 5) - print(result['kws_list'][-1]) - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_input_bytes(self): with open(os.path.join(os.getcwd(), TEST_SPEECH_FILE), 'rb') as f: From da5d5cd10bf8bb75ff9fde2df3f0112308c35cc7 Mon Sep 17 00:00:00 2001 From: "xixing.tj" Date: Tue, 11 Oct 2022 22:37:57 +0800 Subject: [PATCH 14/57] [to #42322933]add copyright info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加ocr部分代码的copyright信息 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10342392 --- .../cv/ocr_utils/model_convnext_transformer.py | 1 + .../model_resnet_mutex_v4_linewithchar.py | 2 ++ .../pipelines/cv/ocr_utils/ocr_modules/convnext.py | 10 ++-------- .../cv/ocr_utils/ocr_modules/timm_tinyc.py | 6 ++---- .../pipelines/cv/ocr_utils/ocr_modules/vitstr.py | 9 ++------- modelscope/pipelines/cv/ocr_utils/ops.py | 2 ++ modelscope/pipelines/cv/ocr_utils/resnet18_v1.py | 14 ++++++++++++++ modelscope/pipelines/cv/ocr_utils/resnet_utils.py | 14 ++++++++++++++ modelscope/pipelines/cv/ocr_utils/utils.py | 1 + 9 files changed, 40 insertions(+), 19 deletions(-) diff --git a/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py b/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py index cf5e2fe1..6ecff7ef 100644 --- a/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py +++ b/modelscope/pipelines/cv/ocr_utils/model_convnext_transformer.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import torch import torch.nn as nn diff --git a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py index d03ff405..2c2d5b00 100644 --- a/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py +++ b/modelscope/pipelines/cv/ocr_utils/model_resnet_mutex_v4_linewithchar.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from SegLink, +# publicly available at https://github.com/bgshih/seglink import tensorflow as tf from . import ops, resnet18_v1, resnet_utils diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py index c2059107..c0e30616 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/convnext.py @@ -1,11 +1,5 @@ -""" Contains various versions of ConvNext Networks. -ConvNext Networks (ConvNext) were proposed in: - Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell and Saining Xie - A ConvNet for the 2020s. CVPR 2022. -Compared to https://github.com/facebookresearch/ConvNeXt, -we obtain different ConvNext variants by changing the network depth, width, -feature number, and downsample ratio. -""" +# Part of the implementation is borrowed and modified from ConvNext, +# publicly available at https://github.com/facebookresearch/ConvNeXt import torch import torch.nn as nn import torch.nn.functional as F diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py index f54c0e78..555b1e42 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/timm_tinyc.py @@ -1,7 +1,5 @@ -'''Referenced from rwightman's pytorch-image-models(timm). 
-Github: https://github.com/rwightman/pytorch-image-models -We use some modules and modify the parameters according to our network. -''' +# Part of the implementation is borrowed and modified from timm, +# publicly available at https://github.com/rwightman/pytorch-image-models import collections.abc import logging import math diff --git a/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py b/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py index e7d96574..5ce3aeca 100644 --- a/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py +++ b/modelscope/pipelines/cv/ocr_utils/ocr_modules/vitstr.py @@ -1,10 +1,5 @@ -""" Contains various versions of ViTSTR. -ViTSTR were proposed in: - Rowel Atienza - Vision transformer for fast and efficient scene text recognition. ICDAR 2021. -Compared to https://github.com/roatienza/deep-text-recognition-benchmark, -we obtain different ViTSTR variants by changing the network patch_size and in_chans. -""" +# Part of the implementation is borrowed and modified from ViTSTR, +# publicly available at https://github.com/roatienza/deep-text-recognition-benchmark from __future__ import absolute_import, division, print_function import logging from copy import deepcopy diff --git a/modelscope/pipelines/cv/ocr_utils/ops.py b/modelscope/pipelines/cv/ocr_utils/ops.py index 09807b10..a36838a6 100644 --- a/modelscope/pipelines/cv/ocr_utils/ops.py +++ b/modelscope/pipelines/cv/ocr_utils/ops.py @@ -1,3 +1,5 @@ +# Part of the implementation is borrowed and modified from SegLink, +# publicly available at https://github.com/bgshih/seglink import math import os import shutil diff --git a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py index 7930c5a3..85f9faca 100644 --- a/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py +++ b/modelscope/pipelines/cv/ocr_utils/resnet18_v1.py @@ -1,3 +1,17 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Contains definitions for the original form of Residual Networks. The 'v1' residual networks (ResNets) implemented in this module were proposed by: diff --git a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py index 0a9af224..2ccbd038 100644 --- a/modelscope/pipelines/cv/ocr_utils/resnet_utils.py +++ b/modelscope/pipelines/cv/ocr_utils/resnet_utils.py @@ -1,3 +1,17 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== """Contains building blocks for various versions of Residual Networks. Residual networks (ResNets) were proposed in: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py index be8e3371..1d0fb297 100644 --- a/modelscope/pipelines/cv/ocr_utils/utils.py +++ b/modelscope/pipelines/cv/ocr_utils/utils.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. import cv2 import numpy as np From 922f4c589b8da4111e787cd38d0a53518aaa8ada Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Tue, 11 Oct 2022 22:46:30 +0800 Subject: [PATCH 15/57] =?UTF-8?q?[to=20#42322933]=E5=9B=BE=E5=83=8F?= =?UTF-8?q?=E5=8E=BB=E5=99=AAusing=20msdataset=20to=20load=20dataset=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20Link:=20https://code.alibaba-inc.com/Ali-M?= =?UTF-8?q?aaS/MaaS-lib/codereview/10338265?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modelscope/metrics/image_denoise_metric.py | 140 +++++++++++++++- .../cv/image_denoise/nafnet/NAFNet_arch.py | 5 + .../cv/image_denoise/nafnet/arch_util.py | 5 + .../image_denoise/nafnet_for_image_denoise.py | 1 + .../image_denoise_data/data_utils.py | 152 ------------------ .../image_denoise_dataset.py | 78 --------- .../sidd_image_denoising}/__init__.py | 4 +- .../sidd_image_denoising/data_utils.py | 46 ++++++ .../sidd_image_denoising_dataset.py | 62 +++++++ .../sidd_image_denoising}/transforms.py | 0 .../pipelines/cv/image_denoise_pipeline.py | 2 +- tests/pipelines/test_image_denoise.py | 25 ++- tests/trainers/test_image_denoise_trainer.py | 24 ++- 13 files changed, 284 insertions(+), 260 deletions(-) delete mode 100644 modelscope/msdatasets/image_denoise_data/data_utils.py delete mode 100644 modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py rename modelscope/msdatasets/{image_denoise_data => task_datasets/sidd_image_denoising}/__init__.py (73%) create mode 100644 modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py create mode 100644 modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py rename modelscope/msdatasets/{image_denoise_data => task_datasets/sidd_image_denoising}/transforms.py (100%) diff --git a/modelscope/metrics/image_denoise_metric.py b/modelscope/metrics/image_denoise_metric.py index 94ec9dc7..c6df8df1 100644 --- a/modelscope/metrics/image_denoise_metric.py +++ b/modelscope/metrics/image_denoise_metric.py @@ -1,7 +1,9 @@ +# The code is modified based on BasicSR metrics: +# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py from typing import Dict +import cv2 import numpy as np -from skimage.metrics import peak_signal_noise_ratio, structural_similarity from modelscope.metainfo import Metrics from modelscope.utils.registry import default_group @@ -34,12 +36,138 @@ class ImageDenoiseMetric(Metric): def evaluate(self): psnr_list, ssim_list = [], [] for (pred, label) in zip(self.preds, self.labels): - psnr_list.append( - peak_signal_noise_ratio(label[0], pred[0], data_range=255)) - ssim_list.append( - structural_similarity( - label[0], pred[0], multichannel=True, data_range=255)) + psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0)) + ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0)) return { 
MetricKeys.PSNR: np.mean(psnr_list), MetricKeys.SSIM: np.mean(ssim_list) } + + +def reorder_image(img, input_order='HWC'): + """Reorder images to 'HWC' order. + If the input_order is (h, w), return (h, w, 1); + If the input_order is (c, h, w), return (h, w, c); + If the input_order is (h, w, c), return as it is. + Args: + img (ndarray): Input image. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + If the input image shape is (h, w), input_order will not have + effects. Default: 'HWC'. + Returns: + ndarray: reordered image. + """ + + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'" + ) + if len(img.shape) == 2: + img = img[..., None] + if input_order == 'CHW': + img = img.transpose(1, 2, 0) + return img + + +def calculate_psnr(img, img2, crop_border, input_order='HWC', **kwargs): + """Calculate PSNR (Peak Signal-to-Noise Ratio). + Reference: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + Args: + img (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + Returns: + float: PSNR result. + """ + + assert img.shape == img2.shape, ( + f'Image shapes are different: {img.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' + ) + img = reorder_image(img, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + img = img.astype(np.float64) + img2 = img2.astype(np.float64) + + mse = np.mean((img - img2)**2) + if mse == 0: + return float('inf') + return 10. * np.log10(255. * 255. / mse) + + +def calculate_ssim(img, img2, crop_border, input_order='HWC', **kwargs): + """Calculate SSIM (structural similarity). + ``Paper: Image quality assessment: From error visibility to structural similarity`` + The results are the same as that of the official released MATLAB code in + https://ece.uwaterloo.ca/~z70wang/research/ssim/. + For three-channel images, SSIM is calculated for each channel and then + averaged. + Args: + img (ndarray): Images with range [0, 255]. + img2 (ndarray): Images with range [0, 255]. + crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + Returns: + float: SSIM result. + """ + + assert img.shape == img2.shape, ( + f'Image shapes are different: {img.shape}, {img2.shape}.') + if input_order not in ['HWC', 'CHW']: + raise ValueError( + f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' + ) + img = reorder_image(img, input_order=input_order) + img2 = reorder_image(img2, input_order=input_order) + + if crop_border != 0: + img = img[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] 
+ + img = img.astype(np.float64) + img2 = img2.astype(np.float64) + + ssims = [] + for i in range(img.shape[2]): + ssims.append(_ssim(img[..., i], img2[..., i])) + return np.array(ssims).mean() + + +def _ssim(img, img2): + """Calculate SSIM (structural similarity) for one channel images. + It is called by func:`calculate_ssim`. + Args: + img (ndarray): Images with range [0, 255] with order 'HWC'. + img2 (ndarray): Images with range [0, 255] with order 'HWC'. + Returns: + float: SSIM result. + """ + + c1 = (0.01 * 255)**2 + c2 = (0.03 * 255)**2 + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + + mu1 = cv2.filter2D(img, -1, window)[5:-5, + 5:-5] # valid mode for window size 11 + mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq + sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq + sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 + + tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) + tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ssim_map = tmp1 / tmp2 + return ssim_map.mean() diff --git a/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py b/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py index 5b4e8ce1..c4de0729 100644 --- a/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py +++ b/modelscope/models/cv/image_denoise/nafnet/NAFNet_arch.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------ +# Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ + import numpy as np import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_denoise/nafnet/arch_util.py b/modelscope/models/cv/image_denoise/nafnet/arch_util.py index df394dd5..2d406141 100644 --- a/modelscope/models/cv/image_denoise/nafnet/arch_util.py +++ b/modelscope/models/cv/image_denoise/nafnet/arch_util.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + import torch import torch.nn as nn diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index c484b37b..a6fbf22f 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -1,3 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
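# Editorial sketch: a worked check of the metric helpers above. The import path
# follows the file added in this patch (modelscope/metrics/image_denoise_metric.py).
import numpy as np

from modelscope.metrics.image_denoise_metric import (calculate_psnr,
                                                     calculate_ssim)

gt = np.full((64, 64, 3), 100, dtype=np.uint8)   # HWC image, range [0, 255]
noisy = gt + 16                                  # constant error of 16 per pixel
# MSE = 16 ** 2 = 256, so PSNR = 10 * log10(255 ** 2 / 256) ~= 24.05 dB
print(calculate_psnr(gt, noisy, crop_border=0, input_order='HWC'))
# SSIM is computed per channel with an 11x11 Gaussian window, then averaged.
print(calculate_ssim(gt, noisy, crop_border=0, input_order='HWC'))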
import os from copy import deepcopy from typing import Any, Dict, Union diff --git a/modelscope/msdatasets/image_denoise_data/data_utils.py b/modelscope/msdatasets/image_denoise_data/data_utils.py deleted file mode 100644 index dd735830..00000000 --- a/modelscope/msdatasets/image_denoise_data/data_utils.py +++ /dev/null @@ -1,152 +0,0 @@ -# ------------------------------------------------------------------------ -# Modified from BasicSR (https://github.com/xinntao/BasicSR) -# Copyright 2018-2020 BasicSR Authors -# ------------------------------------------------------------------------ -import os -from os import path as osp - -import cv2 -import numpy as np -import torch - -from .transforms import mod_crop - - -def img2tensor(imgs, bgr2rgb=True, float32=True): - """Numpy array to tensor. - Args: - imgs (list[ndarray] | ndarray): Input images. - bgr2rgb (bool): Whether to change bgr to rgb. - float32 (bool): Whether to change to float32. - Returns: - list[tensor] | tensor: Tensor images. If returned results only have - one element, just return tensor. - """ - - def _totensor(img, bgr2rgb, float32): - if img.shape[2] == 3 and bgr2rgb: - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - img = torch.from_numpy(img.transpose(2, 0, 1)) - if float32: - img = img.float() - return img - - if isinstance(imgs, list): - return [_totensor(img, bgr2rgb, float32) for img in imgs] - else: - return _totensor(imgs, bgr2rgb, float32) - - -def scandir(dir_path, keyword=None, recursive=False, full_path=False): - """Scan a directory to find the interested files. - Args: - dir_path (str): Path of the directory. - keyword (str | tuple(str), optional): File keyword that we are - interested in. Default: None. - recursive (bool, optional): If set to True, recursively scan the - directory. Default: False. - full_path (bool, optional): If set to True, include the dir_path. - Default: False. - Returns: - A generator for all the interested files with relative pathes. - """ - - if (keyword is not None) and not isinstance(keyword, (str, tuple)): - raise TypeError('"suffix" must be a string or tuple of strings') - - root = dir_path - - def _scandir(dir_path, keyword, recursive): - for entry in os.scandir(dir_path): - if not entry.name.startswith('.') and entry.is_file(): - if full_path: - return_path = entry.path - else: - return_path = osp.relpath(entry.path, root) - - if keyword is None: - yield return_path - elif keyword in return_path: - yield return_path - else: - if recursive: - yield from _scandir( - entry.path, keyword=keyword, recursive=recursive) - else: - continue - - return _scandir(dir_path, keyword=keyword, recursive=recursive) - - -def padding(img_lq, img_gt, gt_size): - h, w, _ = img_lq.shape - - h_pad = max(0, gt_size - h) - w_pad = max(0, gt_size - w) - - if h_pad == 0 and w_pad == 0: - return img_lq, img_gt - - img_lq = cv2.copyMakeBorder(img_lq, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) - img_gt = cv2.copyMakeBorder(img_gt, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) - return img_lq, img_gt - - -def read_img_seq(path, require_mod_crop=False, scale=1): - """Read a sequence of images from a given folder path. - Args: - path (list[str] | str): List of image paths or image folder path. - require_mod_crop (bool): Require mod crop for each image. - Default: False. - scale (int): Scale factor for mod_crop. Default: 1. - Returns: - Tensor: size (t, c, h, w), RGB, [0, 1]. 
- """ - if isinstance(path, list): - img_paths = path - else: - img_paths = sorted(list(scandir(path, full_path=True))) - imgs = [cv2.imread(v).astype(np.float32) / 255. for v in img_paths] - if require_mod_crop: - imgs = [mod_crop(img, scale) for img in imgs] - imgs = img2tensor(imgs, bgr2rgb=True, float32=True) - imgs = torch.stack(imgs, dim=0) - return imgs - - -def paired_paths_from_folder(folders, keys, filename_tmpl): - """Generate paired paths from folders. - Args: - folders (list[str]): A list of folder path. The order of list should - be [input_folder, gt_folder]. - keys (list[str]): A list of keys identifying folders. The order should - be in consistent with folders, e.g., ['lq', 'gt']. - filename_tmpl (str): Template for each filename. Note that the - template excludes the file extension. Usually the filename_tmpl is - for files in the input folder. - Returns: - list[str]: Returned path list. - """ - assert len(folders) == 2, ( - 'The len of folders should be 2 with [input_folder, gt_folder]. ' - f'But got {len(folders)}') - assert len(keys) == 2, ( - 'The len of keys should be 2 with [input_key, gt_key]. ' - f'But got {len(keys)}') - input_folder, gt_folder = folders - input_key, gt_key = keys - - input_paths = list(scandir(input_folder, keyword='NOISY', recursive=True)) - gt_paths = list(scandir(gt_folder, keyword='GT', recursive=True)) - assert len(input_paths) == len(gt_paths), ( - f'{input_key} and {gt_key} datasets have different number of images: ' - f'{len(input_paths)}, {len(gt_paths)}.') - paths = [] - for idx in range(len(gt_paths)): - gt_path = os.path.join(gt_folder, gt_paths[idx]) - input_path = os.path.join(input_folder, gt_path.replace('GT', 'NOISY')) - - paths.append( - dict([(f'{input_key}_path', input_path), - (f'{gt_key}_path', gt_path)])) - return paths diff --git a/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py b/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py deleted file mode 100644 index 96b777e6..00000000 --- a/modelscope/msdatasets/image_denoise_data/image_denoise_dataset.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -from typing import Callable, List, Optional, Tuple, Union - -import cv2 -import numpy as np -from torch.utils import data - -from .data_utils import img2tensor, padding, paired_paths_from_folder -from .transforms import augment, paired_random_crop - - -def default_loader(path): - return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 - - -class PairedImageDataset(data.Dataset): - """Paired image dataset for image restoration. - """ - - def __init__(self, opt, root, is_train): - super(PairedImageDataset, self).__init__() - self.opt = opt - self.is_train = is_train - self.gt_folder, self.lq_folder = os.path.join( - root, opt.dataroot_gt), os.path.join(root, opt.dataroot_lq) - - if opt.filename_tmpl is not None: - self.filename_tmpl = opt.filename_tmpl - else: - self.filename_tmpl = '{}' - self.paths = paired_paths_from_folder([self.lq_folder, self.gt_folder], - ['lq', 'gt'], self.filename_tmpl) - - def __getitem__(self, index): - scale = self.opt.scale - - # Load gt and lq images. Dimension order: HWC; channel order: BGR; - # image range: [0, 1], float32. 
- gt_path = self.paths[index]['gt_path'] - img_gt = default_loader(gt_path) - lq_path = self.paths[index]['lq_path'] - img_lq = default_loader(lq_path) - - # augmentation for training - # if self.is_train: - gt_size = self.opt.gt_size - # padding - img_gt, img_lq = padding(img_gt, img_lq, gt_size) - - # random crop - img_gt, img_lq = paired_random_crop(img_gt, img_lq, gt_size, scale) - - # flip, rotation - img_gt, img_lq = augment([img_gt, img_lq], self.opt.use_flip, - self.opt.use_rot) - - # BGR to RGB, HWC to CHW, numpy to tensor - img_gt, img_lq = img2tensor([img_gt, img_lq], - bgr2rgb=True, - float32=True) - - return { - 'input': img_lq, - 'target': img_gt, - 'input_path': lq_path, - 'target_path': gt_path - } - - def __len__(self): - return len(self.paths) - - def to_torch_dataset( - self, - columns: Union[str, List[str]] = None, - preprocessors: Union[Callable, List[Callable]] = None, - **format_kwargs, - ): - return self diff --git a/modelscope/msdatasets/image_denoise_data/__init__.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py similarity index 73% rename from modelscope/msdatasets/image_denoise_data/__init__.py rename to modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py index ba1d2df8..5376cd7c 100644 --- a/modelscope/msdatasets/image_denoise_data/__init__.py +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/__init__.py @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .image_denoise_dataset import PairedImageDataset + from .sidd_image_denoising_dataset import SiddImageDenoisingDataset else: _import_structure = { - 'image_denoise_dataset': ['PairedImageDataset'], + 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], } import sys diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py new file mode 100644 index 00000000..33fce4c8 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/data_utils.py @@ -0,0 +1,46 @@ +# ------------------------------------------------------------------------ +# Modified from BasicSR (https://github.com/xinntao/BasicSR) +# Copyright 2018-2020 BasicSR Authors +# ------------------------------------------------------------------------ + +import cv2 +import torch + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. 
+ """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def padding(img_lq, img_gt, gt_size): + h, w, _ = img_lq.shape + + h_pad = max(0, gt_size - h) + w_pad = max(0, gt_size - w) + + if h_pad == 0 and w_pad == 0: + return img_lq, img_gt + + img_lq = cv2.copyMakeBorder(img_lq, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) + img_gt = cv2.copyMakeBorder(img_gt, 0, h_pad, 0, w_pad, cv2.BORDER_REFLECT) + return img_lq, img_gt diff --git a/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py new file mode 100644 index 00000000..3f0cdae0 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/sidd_image_denoising/sidd_image_denoising_dataset.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import cv2 +import numpy as np + +from modelscope.metainfo import Models +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.msdatasets.task_datasets.torch_base_dataset import \ + TorchTaskDataset +from modelscope.utils.constant import Tasks +from .data_utils import img2tensor, padding +from .transforms import augment, paired_random_crop + + +def default_loader(path): + return cv2.imread(path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255.0 + + +@TASK_DATASETS.register_module( + Tasks.image_denoising, module_name=Models.nafnet) +class SiddImageDenoisingDataset(TorchTaskDataset): + """Paired image dataset for image restoration. + """ + + def __init__(self, dataset, opt, is_train): + self.dataset = dataset + self.opt = opt + self.is_train = is_train + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, index): + + # Load gt and lq images. Dimension order: HWC; channel order: BGR; + # image range: [0, 1], float32. 
+ item_dict = self.dataset[index] + gt_path = item_dict['Clean Image:FILE'] + img_gt = default_loader(gt_path) + lq_path = item_dict['Noisy Image:FILE'] + img_lq = default_loader(lq_path) + + # augmentation for training + if self.is_train: + gt_size = self.opt.gt_size + # padding + img_gt, img_lq = padding(img_gt, img_lq, gt_size) + + # random crop + img_gt, img_lq = paired_random_crop( + img_gt, img_lq, gt_size, scale=1) + + # flip, rotation + img_gt, img_lq = augment([img_gt, img_lq], self.opt.use_flip, + self.opt.use_rot) + + # BGR to RGB, HWC to CHW, numpy to tensor + img_gt, img_lq = img2tensor([img_gt, img_lq], + bgr2rgb=True, + float32=True) + + return {'input': img_lq, 'target': img_gt} diff --git a/modelscope/msdatasets/image_denoise_data/transforms.py b/modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py similarity index 100% rename from modelscope/msdatasets/image_denoise_data/transforms.py rename to modelscope/msdatasets/task_datasets/sidd_image_denoising/transforms.py diff --git a/modelscope/pipelines/cv/image_denoise_pipeline.py b/modelscope/pipelines/cv/image_denoise_pipeline.py index a11abf36..34ac1e81 100644 --- a/modelscope/pipelines/cv/image_denoise_pipeline.py +++ b/modelscope/pipelines/cv/image_denoise_pipeline.py @@ -105,4 +105,4 @@ class ImageDenoisePipeline(Pipeline): def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]: output_img = (input['output_tensor'].squeeze(0) * 255).cpu().permute( 1, 2, 0).numpy().astype('uint8') - return {OutputKeys.OUTPUT_IMG: output_img} + return {OutputKeys.OUTPUT_IMG: output_img[:, :, ::-1]} diff --git a/tests/pipelines/test_image_denoise.py b/tests/pipelines/test_image_denoise.py index bf8cfd0f..d95dd343 100644 --- a/tests/pipelines/test_image_denoise.py +++ b/tests/pipelines/test_image_denoise.py @@ -2,8 +2,6 @@ import unittest -from PIL import Image - from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model from modelscope.outputs import OutputKeys @@ -20,16 +18,16 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): self.task = Tasks.image_denoising self.model_id = 'damo/cv_nafnet_image-denoise_sidd' - demo_image_path = 'data/test/images/noisy-demo-1.png' + demo_image_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/noisy-demo-0.png' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) pipeline = ImageDenoisePipeline(cache_path) + pipeline.group_key = self.task denoise_img = pipeline( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -37,9 +35,8 @@ class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): model = Model.from_pretrained(self.model_id) pipeline_ins = pipeline(task=Tasks.image_denoising, model=model) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') @@ -47,18 +44,16 @@ 
class ImageDenoiseTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline( task=Tasks.image_denoising, model=self.model_id) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_default_model(self): pipeline_ins = pipeline(task=Tasks.image_denoising) denoise_img = pipeline_ins( - input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] - denoise_img = Image.fromarray(denoise_img) - w, h = denoise_img.size + input=self.demo_image_path)[OutputKeys.OUTPUT_IMG] # BGR + h, w = denoise_img.shape[:2] print('pipeline: the shape of output_img is {}x{}'.format(h, w)) @unittest.skip('demo compatibility test is only enabled on a needed-basis') diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 261ee4ed..0bcb8930 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -6,10 +6,12 @@ import unittest from modelscope.hub.snapshot_download import snapshot_download from modelscope.models.cv.image_denoise import NAFNetForImageDenoise -from modelscope.msdatasets.image_denoise_data import PairedImageDataset +from modelscope.msdatasets import MsDataset +from modelscope.msdatasets.task_datasets.sidd_image_denoising import \ + SiddImageDenoisingDataset from modelscope.trainers import build_trainer from modelscope.utils.config import Config -from modelscope.utils.constant import ModelFile +from modelscope.utils.constant import DownloadMode, ModelFile from modelscope.utils.logger import get_logger from modelscope.utils.test_utils import test_level @@ -28,10 +30,20 @@ class ImageDenoiseTrainerTest(unittest.TestCase): self.cache_path = snapshot_download(self.model_id) self.config = Config.from_file( os.path.join(self.cache_path, ModelFile.CONFIGURATION)) - self.dataset_train = PairedImageDataset( - self.config.dataset, self.cache_path, is_train=True) - self.dataset_val = PairedImageDataset( - self.config.dataset, self.cache_path, is_train=False) + dataset_train = MsDataset.load( + 'SIDD', + namespace='huizheng', + split='validation', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + dataset_val = MsDataset.load( + 'SIDD', + namespace='huizheng', + split='test', + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds + self.dataset_train = SiddImageDenoisingDataset( + dataset_train, self.config.dataset, is_train=True) + self.dataset_val = SiddImageDenoisingDataset( + dataset_val, self.config.dataset, is_train=False) def tearDown(self): shutil.rmtree(self.tmp_dir, ignore_errors=True) From 800588b8a6f4a867f8fd58cc05206f780e13dc9c Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 10:53:47 +0800 Subject: [PATCH 16/57] [to #42322933]add licence on MogFace Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10335569 From 42be514bac5f985d6d8ce710646e2a79e3d81d39 Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 15:17:11 +0800 Subject: [PATCH 17/57] [to #42322933]update fer to satisfy demo service requirements Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10372291 --- .../pipelines/cv/facial_expression_recognition_pipeline.py | 6 +----- modelscope/utils/cv/image_utils.py | 4 +++- 2 files 
changed, 4 insertions(+), 6 deletions(-) diff --git a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py index b598a457..3c85ae62 100644 --- a/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py +++ b/modelscope/pipelines/cv/facial_expression_recognition_pipeline.py @@ -122,11 +122,7 @@ class FacialExpressionRecognitionPipeline(Pipeline): result = self.fer(input) assert result is not None scores = result[0].tolist() - labels = result[1].tolist() - return { - OutputKeys.SCORES: scores, - OutputKeys.LABELS: self.map_list[labels] - } + return {OutputKeys.SCORES: scores, OutputKeys.LABELS: self.map_list} def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index ad0d6c8e..eab74688 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -113,7 +113,9 @@ def draw_face_detection_no_lm_result(img_path, detection_result): def draw_facial_expression_result(img_path, facial_expression_result): - label = facial_expression_result[OutputKeys.LABELS] + scores = facial_expression_result[OutputKeys.SCORES] + labels = facial_expression_result[OutputKeys.LABELS] + label = labels[np.argmax(scores)] img = cv2.imread(img_path) assert img is not None, f"Can't read img: {img_path}" cv2.putText( From 71459900544438b3d44bf0e922cdda64ac4d5701 Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Wed, 12 Oct 2022 15:18:35 +0800 Subject: [PATCH 18/57] [to #42322933] reivse model problem and remove history sql for demo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 相比于master上的tableqa,做出了如下修复: 1. 修复了schema linking中的问题。 2. 同时设置了有history sql和没有history sql的两种输入 3. 
增加了sqlite执行逻辑,可以返回sql执行结果 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10365114 --- .../models/nlp/table_question_answering.py | 3 +- modelscope/outputs.py | 1 + .../nlp/table_question_answering_pipeline.py | 61 +++++++++--- .../preprocessors/star3/fields/database.py | 53 ++++++++++- .../preprocessors/star3/fields/schema_link.py | 33 +++++-- .../table_question_answering_preprocessor.py | 5 +- modelscope/utils/nlp/nlp_utils.py | 17 +--- .../test_table_question_answering.py | 94 +++++++++++++++---- 8 files changed, 206 insertions(+), 61 deletions(-) diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py index 3c91a518..c6a03ef3 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/table_question_answering.py @@ -3,9 +3,11 @@ import os from typing import Dict +import json import numpy import torch import torch.nn.functional as F +import tqdm from transformers import BertTokenizer from modelscope.metainfo import Models @@ -82,7 +84,6 @@ class TableQuestionAnswering(Model): if ntok.startswith('##'): ntok = ntok.replace('##', '') - tok = nlu1[idx:idx + 1].lower() if ntok == tok: conv_dict[i] = [idx, idx + 1] diff --git a/modelscope/outputs.py b/modelscope/outputs.py index dd59d6fb..0f353d3d 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -37,6 +37,7 @@ class OutputKeys(object): WORD = 'word' KWS_LIST = 'kws_list' HISTORY = 'history' + QUERT_RESULT = 'query_result' TIMESTAMPS = 'timestamps' SHOT_NUM = 'shot_num' SCENE_NUM = 'scene_num' diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 96bfbc34..e1b2b07b 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -2,6 +2,8 @@ import os from typing import Any, Dict, Union +import json +import torch from transformers import BertTokenizer from modelscope.metainfo import Pipelines @@ -230,14 +232,16 @@ class TableQuestionAnsweringPipeline(Pipeline): str_sel_list.append(header_name) sql_sel_list.append(header_id) else: - str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' - + header_name + ' )') - sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '( ' - + header_id + ' )') + str_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_name + ')') + sql_sel_list.append(self.agg_ops[sql['agg'][idx]] + '(' + + header_id + ')') str_cond_list, sql_cond_list = [], [] for cond in sql['conds']: header_name = header_names[cond[0]] + if header_name == '空列': + continue header_id = '`%s`.`%s`' % (table['table_id'], header_ids[cond[0]]) op = self.cond_ops[cond[1]] value = cond[2] @@ -248,12 +252,17 @@ class TableQuestionAnsweringPipeline(Pipeline): cond = ' ' + self.cond_conn_ops[sql['cond_conn_op']] + ' ' - final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join(str_sel_list), - table['table_name'], - cond.join(str_cond_list)) - final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join(sql_sel_list), - table['table_id'], - cond.join(sql_cond_list)) + if len(str_cond_list) != 0: + final_str = 'SELECT %s FROM %s WHERE %s' % (', '.join( + str_sel_list), table['table_name'], cond.join(str_cond_list)) + final_sql = 'SELECT %s FROM `%s` WHERE %s' % (', '.join( + sql_sel_list), table['table_id'], cond.join(sql_cond_list)) + else: + final_str = 'SELECT %s FROM %s' % (', '.join(str_sel_list), + table['table_name']) + final_sql = 'SELECT %s FROM `%s`' % 
(', '.join(sql_sel_list), + table['table_id']) + sql = SQLQuery( string=final_str, query=final_sql, sql_result=result['sql']) @@ -274,9 +283,39 @@ class TableQuestionAnsweringPipeline(Pipeline): history_sql=history_sql, result=result, table=self.db.tables[result['table_id']]) + result['sql']['from'] = [result['table_id']] sql = self.sql_dict_to_str( result=result, table=self.db.tables[result['table_id']]) - output = {OutputKeys.OUTPUT: sql, OutputKeys.HISTORY: result['sql']} + + # add sqlite + if self.db.is_use_sqlite: + try: + cursor = self.db.connection_obj.cursor().execute(sql.query) + names = [{ + 'name': + description[0], + 'label': + self.db.tables[result['table_id']]['headerid2name'].get( + description[0], description[0]) + } for description in cursor.description] + cells = [] + for res in cursor.fetchall(): + row = {} + for name, cell in zip(names, res): + row[name['name']] = cell + cells.append(row) + tabledata = {'headers': names, 'cells': cells} + except Exception: + tabledata = {'headers': [], 'cells': []} + else: + tabledata = {'headers': [], 'cells': []} + + output = { + OutputKeys.OUTPUT: sql, + OutputKeys.HISTORY: result['sql'], + OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False), + } + return output def _collate_fn(self, data): diff --git a/modelscope/preprocessors/star3/fields/database.py b/modelscope/preprocessors/star3/fields/database.py index a99800cf..3d3a1f8d 100644 --- a/modelscope/preprocessors/star3/fields/database.py +++ b/modelscope/preprocessors/star3/fields/database.py @@ -1,4 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import sqlite3 + import json import tqdm @@ -7,18 +9,38 @@ from modelscope.preprocessors.star3.fields.struct import Trie class Database: - def __init__(self, tokenizer, table_file_path, syn_dict_file_path): + def __init__(self, + tokenizer, + table_file_path, + syn_dict_file_path, + is_use_sqlite=False): self.tokenizer = tokenizer + self.is_use_sqlite = is_use_sqlite + if self.is_use_sqlite: + self.connection_obj = sqlite3.connect(':memory:') + self.type_dict = {'text': 'TEXT', 'number': 'INT', 'date': 'TEXT'} self.tables = self.init_tables(table_file_path=table_file_path) self.syn_dict = self.init_syn_dict( syn_dict_file_path=syn_dict_file_path) + def __del__(self): + if self.is_use_sqlite: + self.connection_obj.close() + def init_tables(self, table_file_path): tables = {} lines = [] - with open(table_file_path, 'r') as fo: - for line in fo: - lines.append(line) + if type(table_file_path) == str: + with open(table_file_path, 'r') as fo: + for line in fo: + lines.append(line) + elif type(table_file_path) == list: + for path in table_file_path: + with open(path, 'r') as fo: + for line in fo: + lines.append(line) + else: + raise ValueError() for line in tqdm.tqdm(lines, desc='Load Tables'): table = json.loads(line.strip()) @@ -34,6 +56,9 @@ class Database: headers_tokens.append(empty_column) table['tablelen'] = table_header_length table['header_tok'] = headers_tokens + table['headerid2name'] = {} + for hid, hname in zip(table['header_id'], table['header_name']): + table['headerid2name'][hid] = hname table['header_types'].append('null') table['header_units'] = [ @@ -51,6 +76,26 @@ class Database: trie_set[ii].insert(word, word) table['value_trie'] = trie_set + + # create sqlite + if self.is_use_sqlite: + cursor_obj = self.connection_obj.cursor() + cursor_obj.execute('DROP TABLE IF EXISTS %s' % + (table['table_id'])) + header_string = ', '.join([ + '%s %s' % + (name, self.type_dict[htype]) for name, htype in zip( + 
table['header_id'], table['header_types']) + ]) + create_table_string = 'CREATE TABLE %s (%s);' % ( + table['table_id'], header_string) + cursor_obj.execute(create_table_string) + for row in table['rows']: + value_string = ', '.join(['"%s"' % (val) for val in row]) + insert_row_string = 'INSERT INTO %s VALUES(%s)' % ( + table['table_id'], value_string) + cursor_obj.execute(insert_row_string) + tables[table['table_id']] = table return tables diff --git a/modelscope/preprocessors/star3/fields/schema_link.py b/modelscope/preprocessors/star3/fields/schema_link.py index 40613f78..7f483a1f 100644 --- a/modelscope/preprocessors/star3/fields/schema_link.py +++ b/modelscope/preprocessors/star3/fields/schema_link.py @@ -287,7 +287,13 @@ class SchemaLinker: return match_len / (len(nlu_t) + 0.1) - def get_entity_linking(self, tokenizer, nlu, nlu_t, tables, col_syn_dict): + def get_entity_linking(self, + tokenizer, + nlu, + nlu_t, + tables, + col_syn_dict, + history_sql=None): """ get linking between question and schema column """ @@ -305,8 +311,7 @@ class SchemaLinker: typeinfos = [] for ii, column in enumerate(table['header_name']): column = column.lower() - column_new = re.sub('(.*?)', '', column) - column_new = re.sub('(.*?)', '', column_new) + column_new = column cphrase, cscore = self.get_match_phrase( nlu.lower(), column_new) if cscore > 0.3 and cphrase.strip() != '': @@ -330,7 +335,6 @@ class SchemaLinker: for cell in ans.keys(): vphrase = cell vscore = 1.0 - # print("trie_set find:", cell, ans[cell]) phrase_tok = tokenizer.tokenize(vphrase) if len(phrase_tok) == 0 or len(vphrase) < 2: continue @@ -408,16 +412,25 @@ class SchemaLinker: match_score = self.get_table_match_score(nlu_t, schema_link) search_result = { - 'table_id': table['table_id'], - 'question_knowledge': final_question, - 'header_knowledge': final_header, - 'schema_link': schema_link, - 'match_score': match_score + 'table_id': + table['table_id'], + 'question_knowledge': + final_question, + 'header_knowledge': + final_header, + 'schema_link': + schema_link, + 'match_score': + match_score, + 'table_score': + int(table['table_id'] == history_sql['from'][0]) + if history_sql is not None else 0 } search_result_list.append(search_result) search_result_list = sorted( - search_result_list, key=lambda x: x['match_score'], + search_result_list, + key=lambda x: (x['match_score'], x['table_score']), reverse=True)[0:4] return search_result_list diff --git a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py index 163759a1..f98aa6d0 100644 --- a/modelscope/preprocessors/star3/table_question_answering_preprocessor.py +++ b/modelscope/preprocessors/star3/table_question_answering_preprocessor.py @@ -95,7 +95,7 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): # tokenize question question = data['question'] - history_sql = data['history_sql'] + history_sql = data.get('history_sql', None) nlu = question.lower() nlu_t = self.tokenizer.tokenize(nlu) @@ -105,7 +105,8 @@ class TableQuestionAnsweringPreprocessor(Preprocessor): nlu=nlu, nlu_t=nlu_t, tables=self.db.tables, - col_syn_dict=self.db.syn_dict) + col_syn_dict=self.db.syn_dict, + history_sql=history_sql) # collect data datas = self.construct_data( diff --git a/modelscope/utils/nlp/nlp_utils.py b/modelscope/utils/nlp/nlp_utils.py index eba12103..35b374f2 100644 --- a/modelscope/utils/nlp/nlp_utils.py +++ b/modelscope/utils/nlp/nlp_utils.py @@ -2,8 +2,7 @@ from typing import List from 
modelscope.outputs import OutputKeys from modelscope.pipelines.nlp import (ConversationalTextToSqlPipeline, - DialogStateTrackingPipeline, - TableQuestionAnsweringPipeline) + DialogStateTrackingPipeline) def text2sql_tracking_and_print_results( @@ -42,17 +41,3 @@ def tracking_and_print_dialog_states( print(json.dumps(result)) history_states.extend([result[OutputKeys.OUTPUT], {}]) - - -def tableqa_tracking_and_print_results( - test_case, pipelines: List[TableQuestionAnsweringPipeline]): - for pipeline in pipelines: - historical_queries = None - for question in test_case['utterance']: - output_dict = pipeline({ - 'question': question, - 'history_sql': historical_queries - }) - print('output_dict', output_dict['output'].string, - output_dict['output'].query) - historical_queries = output_dict['history'] diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 7ea28725..68e0564f 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os import unittest +from typing import List from transformers import BertTokenizer @@ -11,10 +12,60 @@ from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor from modelscope.preprocessors.star3.fields.database import Database from modelscope.utils.constant import ModelFile, Tasks -from modelscope.utils.nlp.nlp_utils import tableqa_tracking_and_print_results from modelscope.utils.test_utils import test_level +def tableqa_tracking_and_print_results_with_history( + pipelines: List[TableQuestionAnsweringPipeline]): + test_case = { + 'utterance': [ + '有哪些风险类型?', + '风险类型有多少种?', + '珠江流域的小(2)型水库的库容总量是多少?', + '那平均值是多少?', + '那水库的名称呢?', + '换成中型的呢?', + '枣庄营业厅的电话', + '那地址呢?', + '枣庄营业厅的电话和地址', + ] + } + for p in pipelines: + historical_queries = None + for question in test_case['utterance']: + output_dict = p({ + 'question': question, + 'history_sql': historical_queries + }) + print('question', question) + print('sql text:', output_dict['output'].string) + print('sql query:', output_dict['output'].query) + print('query result:', output_dict['query_result']) + print() + historical_queries = output_dict['history'] + + +def tableqa_tracking_and_print_results_without_history( + pipelines: List[TableQuestionAnsweringPipeline]): + test_case = { + 'utterance': [ + '有哪些风险类型?', + '风险类型有多少种?', + '珠江流域的小(2)型水库的库容总量是多少?', + '枣庄营业厅的电话', + '枣庄营业厅的电话和地址', + ] + } + for p in pipelines: + for question in test_case['utterance']: + output_dict = p({'question': question}) + print('question', question) + print('sql text:', output_dict['output'].string) + print('sql query:', output_dict['output'].query) + print('query result:', output_dict['query_result']) + print() + + class TableQuestionAnswering(unittest.TestCase): def setUp(self) -> None: @@ -22,20 +73,18 @@ class TableQuestionAnswering(unittest.TestCase): self.model_id = 'damo/nlp_convai_text2sql_pretrain_cn' model_id = 'damo/nlp_convai_text2sql_pretrain_cn' - test_case = { - 'utterance': - ['长江流域的小(2)型水库的库容总量是多少?', '那平均值是多少?', '那水库的名称呢?', '换成中型的呢?'] - } @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) preprocessor = TableQuestionAnsweringPreprocessor(model_dir=cache_path) pipelines = [ - TableQuestionAnsweringPipeline( - model=cache_path, 
preprocessor=preprocessor) + pipeline( + Tasks.table_question_answering, + model=cache_path, + preprocessor=preprocessor) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_model_from_modelhub(self): @@ -43,15 +92,17 @@ class TableQuestionAnswering(unittest.TestCase): preprocessor = TableQuestionAnsweringPreprocessor( model_dir=model.model_dir) pipelines = [ - TableQuestionAnsweringPipeline( - model=model, preprocessor=preprocessor) + pipeline( + Tasks.table_question_answering, + model=model, + preprocessor=preprocessor) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_model_from_task(self): pipelines = [pipeline(Tasks.table_question_answering, self.model_id)] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_with_model_from_modelhub_with_other_classes(self): @@ -60,15 +111,24 @@ class TableQuestionAnswering(unittest.TestCase): os.path.join(model.model_dir, ModelFile.VOCAB_FILE)) db = Database( tokenizer=self.tokenizer, - table_file_path=os.path.join(model.model_dir, 'table.json'), - syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt')) + table_file_path=[ + os.path.join(model.model_dir, 'databases', fname) + for fname in os.listdir( + os.path.join(model.model_dir, 'databases')) + ], + syn_dict_file_path=os.path.join(model.model_dir, 'synonym.txt'), + is_use_sqlite=True) preprocessor = TableQuestionAnsweringPreprocessor( model_dir=model.model_dir, db=db) pipelines = [ - TableQuestionAnsweringPipeline( - model=model, preprocessor=preprocessor, db=db) + pipeline( + Tasks.table_question_answering, + model=model, + preprocessor=preprocessor, + db=db) ] - tableqa_tracking_and_print_results(self.test_case, pipelines) + tableqa_tracking_and_print_results_without_history(pipelines) + tableqa_tracking_and_print_results_with_history(pipelines) if __name__ == '__main__': From 3edf30caa60af9bab70f8aea4217a79581bb473c Mon Sep 17 00:00:00 2001 From: ly261666 Date: Wed, 12 Oct 2022 15:19:12 +0800 Subject: [PATCH 19/57] [to #42322933]change the default model of face detection after discussion Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371469 --- modelscope/pipelines/builder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index b18d4465..1f563915 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -118,8 +118,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.hand_2d_keypoints: (Pipelines.hand_2d_keypoints, 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), - Tasks.face_detection: (Pipelines.face_detection, - 'damo/cv_resnet_facedetection_scrfd10gkps'), + Tasks.face_detection: + (Pipelines.face_detection, + 'damo/cv_resnet101_face-detection_cvpr22papermogface'), Tasks.face_recognition: (Pipelines.face_recognition, 'damo/cv_ir101_facerecognition_cfglint'), Tasks.facial_expression_recognition: From a26e6e38697a8795b99de4c7929b415baef78268 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 17:33:03 +0800 Subject: [PATCH 20/57] [to #45071449] fix 
setup error Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10196007 --- modelscope/models/audio/tts/models/datasets/__init__.py | 0 requirements/framework.txt | 1 + 2 files changed, 1 insertion(+) mode change 100644 => 100755 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100644 new mode 100755 diff --git a/requirements/framework.txt b/requirements/framework.txt index b51faeda..aae200da 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -15,6 +15,7 @@ pyyaml requests scipy setuptools +setuptools_scm tensorboard tqdm>=4.64.0 yapf From 8c91a4972e2ced9f1b40613ee88f4c5197bafa6e Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Wed, 12 Oct 2022 19:01:34 +0800 Subject: [PATCH 21/57] require pai-easycv 0.6.3.7 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10380097 --- requirements/cv.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/cv.txt b/requirements/cv.txt index e6ffb5ff..eb38beb1 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -17,7 +17,7 @@ mmdet>=2.25.0 networkx>=2.5 numba onnxruntime>=1.10 -pai-easycv>=0.6.3.6 +pai-easycv>=0.6.3.7 pandas psutil regex From 295fdd1a609def5e0c8b57783f1fca656e4cbcb0 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Wed, 12 Oct 2022 19:01:57 +0800 Subject: [PATCH 22/57] [to #45443331]fix: git config email with username bug Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10378571 --- modelscope/hub/git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index 486f8df3..a149ede1 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -138,8 +138,8 @@ class GitCommandWrapper(metaclass=Singleton): repo_base_dir, repo_name, user_name) response = self._run_git_command(*config_user_name_args.split(' ')) logger.debug(response.stdout.decode('utf8')) - config_user_email_args = '-C %s/%s config user.name %s' % ( - repo_base_dir, repo_name, user_name) + config_user_email_args = '-C %s/%s config user.email %s' % ( + repo_base_dir, repo_name, user_email) response = self._run_git_command( *config_user_email_args.split(' ')) logger.debug(response.stdout.decode('utf8')) From 4cb5f8a2cd104f89b765d56527d448b2df1be151 Mon Sep 17 00:00:00 2001 From: "shouzhou.bx" Date: Wed, 12 Oct 2022 19:53:14 +0800 Subject: [PATCH 23/57] [to #42322933] add human whole body model and image object detection auto model Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10319306 --- data/test/images/auto_demo.jpg | 3 + .../body_keypoints_detection.jpg | 3 - .../keypoints_detect/img_test_wholebody.jpg | 3 + modelscope/metainfo.py | 5 ++ modelscope/models/cv/__init__.py | 20 +++--- .../cv/human_wholebody_keypoint/__init__.py | 22 +++++++ .../human_wholebody_keypoint.py | 17 +++++ .../models/cv/object_detection/__init__.py | 2 +- .../models/cv/object_detection/yolox_pai.py | 3 + .../cv/human_wholebody_keypoint/__init__.py | 22 +++++++ .../human_wholebody_keypoint_dataset.py | 39 +++++++++++ modelscope/outputs.py | 19 +++++- modelscope/pipelines/builder.py | 8 ++- modelscope/pipelines/cv/__init__.py | 11 +++- .../cv/body_2d_keypoints_pipeline.py | 4 +- .../cv/body_3d_keypoints_pipeline.py | 2 +- .../pipelines/cv/easycv_pipelines/__init__.py | 5 +- .../cv/easycv_pipelines/detection_pipeline.py | 41 +++++++++++- .../human_wholebody_keypoint_pipeline.py 
| 65 +++++++++++++++++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 34 +++++++++- .../test_human_wholebody_keypoint.py | 40 ++++++++++++ tests/pipelines/test_object_detection.py | 12 ++++ 23 files changed, 353 insertions(+), 28 deletions(-) create mode 100644 data/test/images/auto_demo.jpg delete mode 100644 data/test/images/keypoints_detect/body_keypoints_detection.jpg create mode 100644 data/test/images/keypoints_detect/img_test_wholebody.jpg create mode 100644 modelscope/models/cv/human_wholebody_keypoint/__init__.py create mode 100644 modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py create mode 100644 modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py create mode 100644 modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py create mode 100644 modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py create mode 100644 tests/pipelines/test_human_wholebody_keypoint.py diff --git a/data/test/images/auto_demo.jpg b/data/test/images/auto_demo.jpg new file mode 100644 index 00000000..30393e53 --- /dev/null +++ b/data/test/images/auto_demo.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32 +size 53219 diff --git a/data/test/images/keypoints_detect/body_keypoints_detection.jpg b/data/test/images/keypoints_detect/body_keypoints_detection.jpg deleted file mode 100644 index 71ce7d7e..00000000 --- a/data/test/images/keypoints_detect/body_keypoints_detection.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d -size 1702339 diff --git a/data/test/images/keypoints_detect/img_test_wholebody.jpg b/data/test/images/keypoints_detect/img_test_wholebody.jpg new file mode 100644 index 00000000..40a9f3f8 --- /dev/null +++ b/data/test/images/keypoints_detect/img_test_wholebody.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd +size 167407 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index cae9d188..759f1688 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -40,6 +40,7 @@ class Models(object): mtcnn = 'mtcnn' ulfd = 'ulfd' video_inpainting = 'video-inpainting' + human_wholebody_keypoint = 'human-wholebody-keypoint' hand_static = 'hand-static' face_human_hand_detection = 'face-human-hand-detection' face_emotion = 'face-emotion' @@ -49,6 +50,7 @@ class Models(object): # EasyCV models yolox = 'YOLOX' segformer = 'Segformer' + image_object_detection_auto = 'image-object-detection-auto' # nlp models bert = 'bert' @@ -170,6 +172,7 @@ class Pipelines(object): ocr_recognition = 'convnextTiny-ocr-recognition' image_portrait_enhancement = 'gpen-image-portrait-enhancement' image_to_image_generation = 'image-to-image-generation' + image_object_detection_auto = 'yolox_image-object-detection-auto' skin_retouching = 'unet-skin-retouching' tinynas_classification = 'tinynas-classification' tinynas_detection = 'tinynas-detection' @@ -185,6 +188,7 @@ class Pipelines(object): movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation' shop_segmentation = 'shop-segmentation' video_inpainting = 'video-inpainting' + human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image' pst_action_recognition = 'patchshift-action-recognition' hand_static = 'hand-static' face_human_hand_detection = 
'face-human-hand-detection' @@ -427,6 +431,7 @@ class Datasets(object): """ ClsDataset = 'ClsDataset' Face2dKeypointsDataset = 'Face2dKeypointsDataset' + HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' DetImagesMixDataset = 'DetImagesMixDataset' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index ba7b03c5..fd950f4c 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -4,15 +4,15 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, crowd_counting, face_2d_keypoints, face_detection, - face_generation, image_classification, image_color_enhance, - image_colorization, image_denoise, image_inpainting, - image_instance_segmentation, image_panoptic_segmentation, - image_portrait_enhancement, image_reid_person, - image_semantic_segmentation, image_to_image_generation, - image_to_image_translation, movie_scene_segmentation, - object_detection, product_retrieval_embedding, - realtime_object_detection, salient_detection, shop_segmentation, - super_resolution, video_single_object_tracking, - video_summarization, virual_tryon) + face_generation, human_wholebody_keypoint, image_classification, + image_color_enhance, image_colorization, image_denoise, + image_inpainting, image_instance_segmentation, + image_panoptic_segmentation, image_portrait_enhancement, + image_reid_person, image_semantic_segmentation, + image_to_image_generation, image_to_image_translation, + movie_scene_segmentation, object_detection, + product_retrieval_embedding, realtime_object_detection, + salient_detection, shop_segmentation, super_resolution, + video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/human_wholebody_keypoint/__init__.py b/modelscope/models/cv/human_wholebody_keypoint/__init__.py new file mode 100644 index 00000000..30e23457 --- /dev/null +++ b/modelscope/models/cv/human_wholebody_keypoint/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_wholebody_keypoint import HumanWholeBodyKeypoint + +else: + _import_structure = { + 'human_wholebody_keypoint': ['HumanWholeBodyKeypoint'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py b/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py new file mode 100644 index 00000000..dd3c0290 --- /dev/null +++ b/modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py @@ -0,0 +1,17 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from easycv.models.pose.top_down import TopDown + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.human_wholebody_keypoint, + module_name=Models.human_wholebody_keypoint) +class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + TopDown.__init__(self, *args, **kwargs) diff --git a/modelscope/models/cv/object_detection/__init__.py b/modelscope/models/cv/object_detection/__init__.py index 974375ce..0c782d7b 100644 --- a/modelscope/models/cv/object_detection/__init__.py +++ b/modelscope/models/cv/object_detection/__init__.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: else: _import_structure = { 'mmdet_model': ['DetectionModel'], - 'yolox_pai': ['YOLOX'] + 'yolox_pai': ['YOLOX'], } import sys diff --git a/modelscope/models/cv/object_detection/yolox_pai.py b/modelscope/models/cv/object_detection/yolox_pai.py index 985cc136..46bd4e3c 100644 --- a/modelscope/models/cv/object_detection/yolox_pai.py +++ b/modelscope/models/cv/object_detection/yolox_pai.py @@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks @MODELS.register_module( group_key=Tasks.image_object_detection, module_name=Models.yolox) +@MODELS.register_module( + group_key=Tasks.image_object_detection, + module_name=Models.image_object_detection_auto) class YOLOX(EasyCVBaseModel, _YOLOX): def __init__(self, model_dir=None, *args, **kwargs): diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py b/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py new file mode 100644 index 00000000..472ed2d8 --- /dev/null +++ b/modelscope/msdatasets/cv/human_wholebody_keypoint/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_wholebody_keypoint_dataset import WholeBodyCocoTopDownDataset + +else: + _import_structure = { + 'human_wholebody_keypoint_dataset': ['WholeBodyCocoTopDownDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py b/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py new file mode 100644 index 00000000..fc9469f2 --- /dev/null +++ b/modelscope/msdatasets/cv/human_wholebody_keypoint/human_wholebody_keypoint_dataset.py @@ -0,0 +1,39 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.pose import \ + WholeBodyCocoTopDownDataset as _WholeBodyCocoTopDownDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.human_wholebody_keypoint, + module_name=Datasets.HumanWholeBodyKeypointDataset) +class WholeBodyCocoTopDownDataset(EasyCVBaseDataset, + _WholeBodyCocoTopDownDataset): + """EasyCV dataset for human whole body 2d keypoints. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. 
+ {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. + """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _WholeBodyCocoTopDownDataset.__init__(self, *args, **kwargs) diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 0f353d3d..ab3ea54a 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -203,7 +203,7 @@ TASK_OUTPUTS = { # human body keypoints detection result for single sample # { - # "poses": [ + # "keypoints": [ # [[x, y]*15], # [[x, y]*15], # [[x, y]*15] @@ -220,7 +220,7 @@ TASK_OUTPUTS = { # ] # } Tasks.body_2d_keypoints: - [OutputKeys.POSES, OutputKeys.SCORES, OutputKeys.BOXES], + [OutputKeys.KEYPOINTS, OutputKeys.SCORES, OutputKeys.BOXES], # 3D human body keypoints detection result for single sample # { @@ -339,6 +339,21 @@ TASK_OUTPUTS = { OutputKeys.SCENE_META_LIST ], + # human whole body keypoints detection result for single sample + # { + # "keypoints": [ + # [[x, y]*133], + # [[x, y]*133], + # [[x, y]*133] + # ] + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # } + Tasks.human_wholebody_keypoint: [OutputKeys.KEYPOINTS, OutputKeys.BOXES], + # video summarization result for a single video # { # "output": diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 1f563915..bc9073bc 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -75,8 +75,6 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/nlp_bart_text-error-correction_chinese'), Tasks.image_captioning: (Pipelines.image_captioning, 'damo/ofa_image-caption_coco_large_en'), - Tasks.image_body_reshaping: (Pipelines.image_body_reshaping, - 'damo/cv_flow-based-body-reshaping_damo'), Tasks.image_portrait_stylization: (Pipelines.person_image_cartoon, 'damo/cv_unet_person-image-cartoon_compound-models'), @@ -159,6 +157,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_classification: (Pipelines.daily_image_classification, 'damo/cv_vit-base_image-classification_Dailylife-labels'), + Tasks.image_object_detection: + (Pipelines.image_object_detection_auto, + 'damo/cv_yolox_image-object-detection-auto'), Tasks.ocr_recognition: (Pipelines.ocr_recognition, 'damo/cv_convnextTiny_ocr-recognition-general_damo'), @@ -186,6 +187,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_fft_inpainting_lama'), Tasks.video_inpainting: (Pipelines.video_inpainting, 'damo/cv_video-inpainting'), + Tasks.human_wholebody_keypoint: + (Pipelines.human_wholebody_keypoint, + 'damo/cv_hrnetw48_human-wholebody-keypoint_image'), Tasks.hand_static: (Pipelines.hand_static, 'damo/cv_mobileface_hand-static'), Tasks.face_human_hand_detection: diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 118eaf17..f84f5fe5 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -46,7 +46,10 @@ if TYPE_CHECKING: from .video_category_pipeline import VideoCategoryPipeline from .virtual_try_on_pipeline import VirtualTryonPipeline from .shop_segmentation_pipleline import ShopSegmentationPipeline - from .easycv_pipelines import EasyCVDetectionPipeline, EasyCVSegmentationPipeline, Face2DKeypointsPipeline + from 
.easycv_pipelines import (EasyCVDetectionPipeline, + EasyCVSegmentationPipeline, + Face2DKeypointsPipeline, + HumanWholebodyKeypointsPipeline) from .text_driven_segmentation_pipleline import TextDrivenSegmentationPipeline from .movie_scene_segmentation_pipeline import MovieSceneSegmentationPipeline from .mog_face_detection_pipeline import MogFaceDetectionPipeline @@ -109,8 +112,10 @@ else: 'virtual_try_on_pipeline': ['VirtualTryonPipeline'], 'shop_segmentation_pipleline': ['ShopSegmentationPipeline'], 'easycv_pipeline': [ - 'EasyCVDetectionPipeline', 'EasyCVSegmentationPipeline', - 'Face2DKeypointsPipeline' + 'EasyCVDetectionPipeline', + 'EasyCVSegmentationPipeline', + 'Face2DKeypointsPipeline', + 'HumanWholebodyKeypointsPipeline', ], 'text_driven_segmentation_pipeline': ['TextDrivenSegmentationPipeline'], diff --git a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py index d6afbae4..bc2e975d 100644 --- a/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_2d_keypoints_pipeline.py @@ -73,7 +73,7 @@ class Body2DKeypointsPipeline(Pipeline): if input[0] is None or input[1] is None: return { OutputKeys.BOXES: [], - OutputKeys.POSES: [], + OutputKeys.KEYPOINTS: [], OutputKeys.SCORES: [] } @@ -83,7 +83,7 @@ class Body2DKeypointsPipeline(Pipeline): result_boxes.append([box[0][0], box[0][1], box[1][0], box[1][1]]) return { OutputKeys.BOXES: result_boxes, - OutputKeys.POSES: poses, + OutputKeys.KEYPOINTS: poses, OutputKeys.SCORES: scores } diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index c3f4e8c1..3502915c 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -145,7 +145,7 @@ class Body3DKeypointsPipeline(Pipeline): kps_2d = self.human_body_2d_kps_detector(frame) box = kps_2d['boxes'][ 0] # box: [[[x1, y1], [x2, y2]]], N human boxes per frame, [0] represent using first detected bbox - pose = kps_2d['poses'][0] # keypoints: [15, 2] + pose = kps_2d['keypoints'][0] # keypoints: [15, 2] score = kps_2d['scores'][0] # keypoints: [15, 2] all_2d_poses.append(pose) all_boxes_with_socre.append( diff --git a/modelscope/pipelines/cv/easycv_pipelines/__init__.py b/modelscope/pipelines/cv/easycv_pipelines/__init__.py index 4f149130..e0209b85 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/__init__.py +++ b/modelscope/pipelines/cv/easycv_pipelines/__init__.py @@ -7,11 +7,14 @@ if TYPE_CHECKING: from .detection_pipeline import EasyCVDetectionPipeline from .segmentation_pipeline import EasyCVSegmentationPipeline from .face_2d_keypoints_pipeline import Face2DKeypointsPipeline + from .human_wholebody_keypoint_pipeline import HumanWholebodyKeypointsPipeline else: _import_structure = { 'detection_pipeline': ['EasyCVDetectionPipeline'], 'segmentation_pipeline': ['EasyCVSegmentationPipeline'], - 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'] + 'face_2d_keypoints_pipeline': ['Face2DKeypointsPipeline'], + 'human_wholebody_keypoint_pipeline': + ['HumanWholebodyKeypointsPipeline'], } import sys diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py index 32365102..0c2058d5 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -1,16 +1,28 @@ # Copyright (c) Alibaba, Inc. 
and its affiliates. +from typing import Any + from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.constant import Tasks +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.cv.image_utils import \ + show_image_object_detection_auto_result from .base import EasyCVPipeline @PIPELINES.register_module( Tasks.image_object_detection, module_name=Pipelines.easycv_detection) +@PIPELINES.register_module( + Tasks.image_object_detection, + module_name=Pipelines.image_object_detection_auto) class EasyCVDetectionPipeline(EasyCVPipeline): """Pipeline for easycv detection task.""" - def __init__(self, model: str, model_file_pattern='*.pt', *args, **kwargs): + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): """ model (str): model id on modelscope hub or local model path. model_file_pattern (str): model file pattern. @@ -21,3 +33,28 @@ class EasyCVDetectionPipeline(EasyCVPipeline): model_file_pattern=model_file_pattern, *args, **kwargs) + + def show_result(self, img_path, result, save_path=None): + show_image_object_detection_auto_result(img_path, result, save_path) + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + scores = [] + labels = [] + boxes = [] + for output in outputs: + for score, label, box in zip(output['detection_scores'], + output['detection_classes'], + output['detection_boxes']): + scores.append(score) + labels.append(self.cfg.CLASSES[label]) + boxes.append([b for b in box]) + + results = [{ + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + OutputKeys.BOXES: boxes + } for output in outputs] + + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py new file mode 100644 index 00000000..263f8225 --- /dev/null +++ b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path +from typing import Any + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import ModelFile, Tasks +from .base import EasyCVPipeline + + +@PIPELINES.register_module( + Tasks.human_wholebody_keypoint, + module_name=Pipelines.human_wholebody_keypoint) +class HumanWholebodyKeypointsPipeline(EasyCVPipeline): + """Pipeline for human wholebody 2d keypoints detection.""" + + def __init__(self, + model: str, + model_file_pattern=ModelFile.TORCH_MODEL_FILE, + *args, + **kwargs): + """ + model (str): model id on modelscope hub or local model path. + model_file_pattern (str): model file pattern. 
+ """ + self.model_dir = model + super(HumanWholebodyKeypointsPipeline, self).__init__( + model=model, + model_file_pattern=model_file_pattern, + *args, + **kwargs) + + def _build_predict_op(self, **kwargs): + """Build EasyCV predictor.""" + from easycv.predictors.builder import build_predictor + detection_predictor_type = self.cfg['DETECTION']['type'] + detection_model_path = os.path.join( + self.model_dir, self.cfg['DETECTION']['model_path']) + detection_cfg_file = os.path.join(self.model_dir, + self.cfg['DETECTION']['config_file']) + detection_score_threshold = self.cfg['DETECTION']['score_threshold'] + self.cfg.pipeline.predictor_config[ + 'detection_predictor_config'] = dict( + type=detection_predictor_type, + model_path=detection_model_path, + config_file=detection_cfg_file, + score_threshold=detection_score_threshold) + easycv_config = self._to_easycv_config() + pipeline_op = build_predictor(self.cfg.pipeline.predictor_config, { + 'model_path': self.model_path, + 'config_file': easycv_config, + **kwargs + }) + return pipeline_op + + def __call__(self, inputs) -> Any: + outputs = self.predict_op(inputs) + + results = [{ + OutputKeys.KEYPOINTS: output['keypoints'], + OutputKeys.BOXES: output['boxes'] + } for output in outputs] + + return results diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 2a5ac694..4fa3d766 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -29,6 +29,7 @@ class CVTasks(object): body_3d_keypoints = 'body-3d-keypoints' hand_2d_keypoints = 'hand-2d-keypoints' general_recognition = 'general-recognition' + human_wholebody_keypoint = 'human-wholebody-keypoint' image_classification = 'image-classification' image_multilabel_classification = 'image-multilabel-classification' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index eab74688..06a9bbaa 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -80,7 +80,7 @@ def realtime_object_detection_bbox_vis(image, bboxes): def draw_keypoints(output, original_image): - poses = np.array(output[OutputKeys.POSES]) + poses = np.array(output[OutputKeys.KEYPOINTS]) scores = np.array(output[OutputKeys.SCORES]) boxes = np.array(output[OutputKeys.BOXES]) assert len(poses) == len(scores) and len(poses) == len(boxes) @@ -234,3 +234,35 @@ def show_video_summarization_result(video_in_path, result, video_save_path): video_writer.write(frame) video_writer.release() cap.release() + + +def show_image_object_detection_auto_result(img_path, + detection_result, + save_path=None): + scores = detection_result[OutputKeys.SCORES] + labels = detection_result[OutputKeys.LABELS] + bboxes = detection_result[OutputKeys.BOXES] + img = cv2.imread(img_path) + assert img is not None, f"Can't read img: {img_path}" + + for (score, label, box) in zip(scores, labels, bboxes): + cv2.rectangle(img, (int(box[0]), int(box[1])), + (int(box[2]), int(box[3])), (0, 0, 255), 2) + cv2.putText( + img, + f'{score:.2f}', (int(box[0]), int(box[1])), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + cv2.putText( + img, + label, (int((box[0] + box[2]) * 0.5), int(box[1])), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + + if save_path is not None: + cv2.imwrite(save_path, img) + return img diff --git a/tests/pipelines/test_human_wholebody_keypoint.py b/tests/pipelines/test_human_wholebody_keypoint.py new file mode 100644 index 00000000..b214f4e1 --- /dev/null +++ b/tests/pipelines/test_human_wholebody_keypoint.py @@ -0,0 +1,40 @@ +# 
Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +import cv2 + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_human_wholebody_keypoint(self): + img_path = 'data/test/images/keypoints_detect/img_test_wholebody.jpg' + model_id = 'damo/cv_hrnetw48_human-wholebody-keypoint_image' + + human_wholebody_keypoint_pipeline = pipeline( + task=Tasks.human_wholebody_keypoint, model=model_id) + output = human_wholebody_keypoint_pipeline(img_path)[0] + + output_keypoints = output[OutputKeys.KEYPOINTS] + output_pose = output[OutputKeys.BOXES] + + human_wholebody_keypoint_pipeline.predict_op.show_result( + img_path, + output_keypoints, + output_pose, + scale=1, + save_path='human_wholebody_keypoint_ret.jpg') + + for keypoint in output_keypoints: + self.assertEqual(keypoint.shape[0], 133) + for box in output_pose: + self.assertEqual(box.shape[0], 4) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 2a74eb41..2cb217d9 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -59,6 +59,18 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): def test_demo_compatibility(self): self.compatibility_check() + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_image_object_detection_auto_pipeline(self): + model_id = 'damo/cv_yolox_image-object-detection-auto' + test_image = 'data/test/images/auto_demo.jpg' + + image_object_detection_auto = pipeline( + Tasks.image_object_detection, model=model_id) + + result = image_object_detection_auto(test_image)[0] + image_object_detection_auto.show_result(test_image, result, + 'auto_demo_ret.jpg') + if __name__ == '__main__': unittest.main() From 2989492bc08245ff02a71ac988b175d9e038d807 Mon Sep 17 00:00:00 2001 From: "yuxiang.tyx" Date: Wed, 12 Oct 2022 19:58:50 +0800 Subject: [PATCH 24/57] =?UTF-8?q?[to=20#42322933]=E6=9B=B4=E6=96=B0face=5F?= =?UTF-8?q?detection=5Fscrfd=E6=A8=A1=E5=9E=8B=E5=B9=B6=E6=94=AF=E6=8C=81f?= =?UTF-8?q?inetune,=20=E6=96=B0=E5=A2=9Ecard=5Fdetection=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 调整face_detection的文件层级(scrfd与其余新增face_detection方法平级); 2. 增加极大脸/旋转脸的检测方法,更新了新模型; 3. 支持读入数据集并finetune和eval; 4. 
新增card_detection模型,支持读入datasethub数据集并finetune Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10244540 --- data/test/images/card_detection.jpg | 3 + data/test/images/face_detection2.jpeg | 3 + modelscope/metainfo.py | 3 + .../models/cv/face_detection/__init__.py | 4 +- .../datasets/pipelines/transforms.py | 189 ----- .../cv/face_detection/scrfd/__init__.py | 2 + .../{ => scrfd}/mmdet_patch/__init__.py | 0 .../{ => scrfd}/mmdet_patch/core/__init__.py | 0 .../mmdet_patch/core/bbox/__init__.py | 0 .../mmdet_patch/core/bbox/transforms.py | 4 +- .../core/post_processing/__init__.py | 0 .../core/post_processing/bbox_nms.py | 9 +- .../mmdet_patch/datasets/__init__.py | 0 .../datasets/pipelines/__init__.py | 8 +- .../datasets/pipelines/auto_augment.py | 271 +++++++ .../datasets/pipelines/formating.py | 113 +++ .../mmdet_patch/datasets/pipelines/loading.py | 225 ++++++ .../datasets/pipelines/transforms.py | 737 ++++++++++++++++++ .../mmdet_patch/datasets/retinaface.py | 5 +- .../mmdet_patch/models/__init__.py | 0 .../mmdet_patch/models/backbones/__init__.py | 0 .../mmdet_patch/models/backbones/resnet.py | 0 .../models/dense_heads/__init__.py | 0 .../models/dense_heads/scrfd_head.py | 11 +- .../mmdet_patch/models/detectors/__init__.py | 0 .../mmdet_patch/models/detectors/scrfd.py | 108 ++- .../cv/face_detection/scrfd/scrfd_detect.py | 71 ++ modelscope/outputs.py | 19 + modelscope/pipelines/builder.py | 4 + .../pipelines/cv/card_detection_pipeline.py | 23 + .../pipelines/cv/face_detection_pipeline.py | 39 +- .../pipelines/cv/face_recognition_pipeline.py | 2 +- .../cv/card_detection_scrfd_trainer.py | 18 + .../cv/face_detection_scrfd_trainer.py | 154 ++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 48 ++ tests/pipelines/test_card_detection.py | 66 ++ tests/pipelines/test_face_detection.py | 12 +- .../test_card_detection_scrfd_trainer.py | 151 ++++ .../test_face_detection_scrfd_trainer.py | 150 ++++ 40 files changed, 2174 insertions(+), 279 deletions(-) create mode 100644 data/test/images/card_detection.jpg create mode 100644 data/test/images/face_detection2.jpeg delete mode 100755 modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py create mode 100644 modelscope/models/cv/face_detection/scrfd/__init__.py rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/bbox/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/bbox/transforms.py (94%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/post_processing/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/core/post_processing/bbox_nms.py (89%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/datasets/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/datasets/pipelines/__init__.py (53%) create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py create mode 100644 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py create mode 100755 modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py rename modelscope/models/cv/face_detection/{ 
=> scrfd}/mmdet_patch/datasets/retinaface.py (97%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/backbones/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/backbones/resnet.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/dense_heads/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/dense_heads/scrfd_head.py (99%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/detectors/__init__.py (100%) rename modelscope/models/cv/face_detection/{ => scrfd}/mmdet_patch/models/detectors/scrfd.py (50%) create mode 100644 modelscope/models/cv/face_detection/scrfd/scrfd_detect.py create mode 100644 modelscope/pipelines/cv/card_detection_pipeline.py create mode 100644 modelscope/trainers/cv/card_detection_scrfd_trainer.py create mode 100644 modelscope/trainers/cv/face_detection_scrfd_trainer.py create mode 100644 tests/pipelines/test_card_detection.py create mode 100644 tests/trainers/test_card_detection_scrfd_trainer.py create mode 100644 tests/trainers/test_face_detection_scrfd_trainer.py diff --git a/data/test/images/card_detection.jpg b/data/test/images/card_detection.jpg new file mode 100644 index 00000000..86728c2c --- /dev/null +++ b/data/test/images/card_detection.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c +size 254845 diff --git a/data/test/images/face_detection2.jpeg b/data/test/images/face_detection2.jpeg new file mode 100644 index 00000000..7f6025fa --- /dev/null +++ b/data/test/images/face_detection2.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae +size 48909 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 759f1688..0917bf3e 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -148,6 +148,7 @@ class Pipelines(object): salient_detection = 'u2net-salient-detection' image_classification = 'image-classification' face_detection = 'resnet-face-detection-scrfd10gkps' + card_detection = 'resnet-card-detection-scrfd34gkps' ulfd_face_detection = 'manual-face-detection-ulfd' facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' retina_face_detection = 'resnet50-face-detection-retinaface' @@ -270,6 +271,8 @@ class Trainers(object): image_portrait_enhancement = 'image-portrait-enhancement' video_summarization = 'video-summarization' movie_scene_segmentation = 'movie-scene-segmentation' + face_detection_scrfd = 'face-detection-scrfd' + card_detection_scrfd = 'card-detection-scrfd' image_inpainting = 'image-inpainting' # nlp trainers diff --git a/modelscope/models/cv/face_detection/__init__.py b/modelscope/models/cv/face_detection/__init__.py index a2a845d2..27d1bd4c 100644 --- a/modelscope/models/cv/face_detection/__init__.py +++ b/modelscope/models/cv/face_detection/__init__.py @@ -8,12 +8,14 @@ if TYPE_CHECKING: from .mtcnn import MtcnnFaceDetector from .retinaface import RetinaFaceDetection from .ulfd_slim import UlfdFaceDetector + from .scrfd import ScrfdDetect else: _import_structure = { 'ulfd_slim': ['UlfdFaceDetector'], 'retinaface': ['RetinaFaceDetection'], 'mtcnn': ['MtcnnFaceDetector'], - 'mogface': ['MogFaceDetector'] + 'mogface': ['MogFaceDetector'], + 'scrfd': ['ScrfdDetect'] } import sys 
diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py deleted file mode 100755 index 241f2c0e..00000000 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/transforms.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at -https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py -""" -import numpy as np -from mmdet.datasets.builder import PIPELINES -from numpy import random - - -@PIPELINES.register_module() -class RandomSquareCrop(object): - """Random crop the image & bboxes, the cropped patches have minimum IoU - requirement with original image & bboxes, the IoU threshold is randomly - selected from min_ious. - - Args: - min_ious (tuple): minimum IoU threshold for all intersections with - bounding boxes - min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, - where a >= min_crop_size). - - Note: - The keys for bboxes, labels and masks should be paired. That is, \ - `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ - `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. - """ - - def __init__(self, - crop_ratio_range=None, - crop_choice=None, - bbox_clip_border=True): - - self.crop_ratio_range = crop_ratio_range - self.crop_choice = crop_choice - self.bbox_clip_border = bbox_clip_border - - assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) - if self.crop_ratio_range is not None: - self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range - - self.bbox2label = { - 'gt_bboxes': 'gt_labels', - 'gt_bboxes_ignore': 'gt_labels_ignore' - } - self.bbox2mask = { - 'gt_bboxes': 'gt_masks', - 'gt_bboxes_ignore': 'gt_masks_ignore' - } - - def __call__(self, results): - """Call function to crop images and bounding boxes with minimum IoU - constraint. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Result dict with images and bounding boxes cropped, \ - 'img_shape' key is updated. 
- """ - - if 'img_fields' in results: - assert results['img_fields'] == ['img'], \ - 'Only single img_fields is allowed' - img = results['img'] - assert 'bbox_fields' in results - assert 'gt_bboxes' in results - boxes = results['gt_bboxes'] - h, w, c = img.shape - scale_retry = 0 - if self.crop_ratio_range is not None: - max_scale = self.crop_ratio_max - else: - max_scale = np.amax(self.crop_choice) - while True: - scale_retry += 1 - - if scale_retry == 1 or max_scale > 1.0: - if self.crop_ratio_range is not None: - scale = np.random.uniform(self.crop_ratio_min, - self.crop_ratio_max) - elif self.crop_choice is not None: - scale = np.random.choice(self.crop_choice) - else: - scale = scale * 1.2 - - for i in range(250): - short_side = min(w, h) - cw = int(scale * short_side) - ch = cw - - # TODO +1 - if w == cw: - left = 0 - elif w > cw: - left = random.randint(0, w - cw) - else: - left = random.randint(w - cw, 0) - if h == ch: - top = 0 - elif h > ch: - top = random.randint(0, h - ch) - else: - top = random.randint(h - ch, 0) - - patch = np.array( - (int(left), int(top), int(left + cw), int(top + ch)), - dtype=np.int) - - # center of boxes should inside the crop img - # only adjust boxes and instance masks when the gt is not empty - # adjust boxes - def is_center_of_bboxes_in_patch(boxes, patch): - # TODO >= - center = (boxes[:, :2] + boxes[:, 2:]) / 2 - mask = \ - ((center[:, 0] > patch[0]) - * (center[:, 1] > patch[1]) - * (center[:, 0] < patch[2]) - * (center[:, 1] < patch[3])) - return mask - - mask = is_center_of_bboxes_in_patch(boxes, patch) - if not mask.any(): - continue - for key in results.get('bbox_fields', []): - boxes = results[key].copy() - mask = is_center_of_bboxes_in_patch(boxes, patch) - boxes = boxes[mask] - if self.bbox_clip_border: - boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) - boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) - boxes -= np.tile(patch[:2], 2) - - results[key] = boxes - # labels - label_key = self.bbox2label.get(key) - if label_key in results: - results[label_key] = results[label_key][mask] - - # keypoints field - if key == 'gt_bboxes': - for kps_key in results.get('keypoints_fields', []): - keypointss = results[kps_key].copy() - keypointss = keypointss[mask, :, :] - if self.bbox_clip_border: - keypointss[:, :, : - 2] = keypointss[:, :, :2].clip( - max=patch[2:]) - keypointss[:, :, : - 2] = keypointss[:, :, :2].clip( - min=patch[:2]) - keypointss[:, :, 0] -= patch[0] - keypointss[:, :, 1] -= patch[1] - results[kps_key] = keypointss - - # mask fields - mask_key = self.bbox2mask.get(key) - if mask_key in results: - results[mask_key] = results[mask_key][mask.nonzero() - [0]].crop(patch) - - # adjust the img no matter whether the gt is empty before crop - rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 - patch_from = patch.copy() - patch_from[0] = max(0, patch_from[0]) - patch_from[1] = max(0, patch_from[1]) - patch_from[2] = min(img.shape[1], patch_from[2]) - patch_from[3] = min(img.shape[0], patch_from[3]) - patch_to = patch.copy() - patch_to[0] = max(0, patch_to[0] * -1) - patch_to[1] = max(0, patch_to[1] * -1) - patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) - patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) - rimg[patch_to[1]:patch_to[3], - patch_to[0]:patch_to[2], :] = img[ - patch_from[1]:patch_from[3], - patch_from[0]:patch_from[2], :] - img = rimg - results['img'] = img - results['img_shape'] = img.shape - - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += 
f'(min_ious={self.min_iou}, ' - repr_str += f'crop_size={self.crop_size})' - return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/__init__.py b/modelscope/models/cv/face_detection/scrfd/__init__.py new file mode 100644 index 00000000..92f81f7a --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from .scrfd_detect import ScrfdDetect diff --git a/modelscope/models/cv/face_detection/mmdet_patch/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/bbox/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py similarity index 94% rename from modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py index d65480eb..75e32d85 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/bbox/transforms.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/bbox/transforms.py @@ -6,7 +6,7 @@ import numpy as np import torch -def bbox2result(bboxes, labels, num_classes, kps=None): +def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5): """Convert detection results to a list of numpy arrays. 
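To make the new num_kps argument above concrete: when keypoints are attached, each per-class result row carries the box, its score, and num_kps (x, y) pairs, as the bbox_len computation in the hunk below shows. The snippet is an illustrative check, not part of the patch; the 4-keypoint case is presumably the card-corner configuration:

import numpy as np

def row_length(num_kps, with_kps=True):
    # x1, y1, x2, y2, score, plus num_kps * (x, y) when keypoints are present
    return 5 + num_kps * 2 if with_kps else 5

assert row_length(5) == 15   # 5 facial landmarks
assert row_length(4) == 13   # 4 keypoints, e.g. card corners
empty = np.zeros((0, row_length(4)), dtype=np.float32)  # empty per-class result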
Args: @@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None): Returns: list(ndarray): bbox results of each class """ - bbox_len = 5 if kps is None else 5 + 10 # if has kps, add 10 kps into bbox + bbox_len = 5 if kps is None else 5 + num_kps * 2 # if has kps, add num_kps*2 into bbox if bboxes.shape[0] == 0: return [ np.zeros((0, bbox_len), dtype=np.float32) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py similarity index 89% rename from modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py index 7a4f5b3a..697b7338 100644 --- a/modelscope/models/cv/face_detection/mmdet_patch/core/post_processing/bbox_nms.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/core/post_processing/bbox_nms.py @@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes, Args: multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2) multi_scores (Tensor): shape (n, #class), where the last column contains scores of the background class, but this will be ignored. score_thr (float): bbox threshold, bboxes with scores lower than it @@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes, num_classes = multi_scores.size(1) - 1 # exclude background category kps = None + if multi_kps is not None: + num_kps = int((multi_kps.shape[1] / num_classes) / 2) if multi_bboxes.shape[1] > 4: bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) if multi_kps is not None: - kps = multi_kps.view(multi_scores.size(0), -1, 10) + kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2) else: bboxes = multi_bboxes[:, None].expand( multi_scores.size(0), num_classes, 4) if multi_kps is not None: kps = multi_kps[:, None].expand( - multi_scores.size(0), num_classes, 10) + multi_scores.size(0), num_classes, num_kps * 2) scores = multi_scores[:, :-1] if score_factors is not None: @@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes, bboxes = bboxes.reshape(-1, 4) if kps is not None: - kps = kps.reshape(-1, 10) + kps = kps.reshape(-1, num_kps * 2) scores = scores.reshape(-1) labels = labels.reshape(-1) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py similarity index 53% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py index 85288910..a2cafd1a 100755 --- 
a/modelscope/models/cv/face_detection/mmdet_patch/datasets/pipelines/__init__.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/__init__.py @@ -2,6 +2,12 @@ The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines """ +from .auto_augment import RotateV2 +from .formating import DefaultFormatBundleV2 +from .loading import LoadAnnotationsV2 from .transforms import RandomSquareCrop -__all__ = ['RandomSquareCrop'] +__all__ = [ + 'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2', + 'DefaultFormatBundleV2' +] diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py new file mode 100644 index 00000000..ee60c2e0 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/auto_augment.py @@ -0,0 +1,271 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py +""" +import copy + +import cv2 +import mmcv +import numpy as np +from mmdet.datasets.builder import PIPELINES + +_MAX_LEVEL = 10 + + +def level_to_value(level, max_value): + """Map from level to values based on max_value.""" + return (level / _MAX_LEVEL) * max_value + + +def random_negative(value, random_negative_prob): + """Randomly negate value based on random_negative_prob.""" + return -value if np.random.rand() < random_negative_prob else value + + +def bbox2fields(): + """The key correspondence from bboxes to labels, masks and + segmentations.""" + bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + bbox2seg = { + 'gt_bboxes': 'gt_semantic_seg', + } + return bbox2label, bbox2mask, bbox2seg + + +@PIPELINES.register_module() +class RotateV2(object): + """Apply Rotate Transformation to image (and its corresponding bbox, mask, + segmentation). + + Args: + level (int | float): The level should be in range (0,_MAX_LEVEL]. + scale (int | float): Isotropic scale factor. Same in + ``mmcv.imrotate``. + center (int | float | tuple[float]): Center point (w, h) of the + rotation in the source image. If None, the center of the + image will be used. Same in ``mmcv.imrotate``. + img_fill_val (int | float | tuple): The fill value for image border. + If float, the same value will be used for all the three + channels of image. If tuple, the should be 3 elements (e.g. + equals the number of channels for image). + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Default 255. + prob (float): The probability for perform transformation and + should be in range 0 to 1. + max_rotate_angle (int | float): The maximum angles for rotate + transformation. + random_negative_prob (float): The probability that turns the + offset negative. + """ + + def __init__(self, + level, + scale=1, + center=None, + img_fill_val=128, + seg_ignore_label=255, + prob=0.5, + max_rotate_angle=30, + random_negative_prob=0.5): + assert isinstance(level, (int, float)), \ + f'The level must be type int or float. got {type(level)}.' 
+ assert 0 <= level <= _MAX_LEVEL, \ + f'The level should be in range (0,{_MAX_LEVEL}]. got {level}.' + assert isinstance(scale, (int, float)), \ + f'The scale must be type int or float. got type {type(scale)}.' + if isinstance(center, (int, float)): + center = (center, center) + elif isinstance(center, tuple): + assert len(center) == 2, 'center with type tuple must have '\ + f'2 elements. got {len(center)} elements.' + else: + assert center is None, 'center must be None or type int, '\ + f'float or tuple, got type {type(center)}.' + if isinstance(img_fill_val, (float, int)): + img_fill_val = tuple([float(img_fill_val)] * 3) + elif isinstance(img_fill_val, tuple): + assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\ + f'have 3 elements. got {len(img_fill_val)}.' + img_fill_val = tuple([float(val) for val in img_fill_val]) + else: + raise ValueError( + 'img_fill_val must be float or tuple with 3 elements.') + assert np.all([0 <= val <= 255 for val in img_fill_val]), \ + 'all elements of img_fill_val should between range [0,255]. '\ + f'got {img_fill_val}.' + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\ + f'got {prob}.' + assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\ + f'should be type int or float. got type {type(max_rotate_angle)}.' + self.level = level + self.scale = scale + # Rotation angle in degrees. Positive values mean + # clockwise rotation. + self.angle = level_to_value(level, max_rotate_angle) + self.center = center + self.img_fill_val = img_fill_val + self.seg_ignore_label = seg_ignore_label + self.prob = prob + self.max_rotate_angle = max_rotate_angle + self.random_negative_prob = random_negative_prob + + def _rotate_img(self, results, angle, center=None, scale=1.0): + """Rotate the image. + + Args: + results (dict): Result dict from loading pipeline. + angle (float): Rotation angle in degrees, positive values + mean clockwise rotation. Same in ``mmcv.imrotate``. + center (tuple[float], optional): Center point (w, h) of the + rotation. Same in ``mmcv.imrotate``. + scale (int | float): Isotropic scale factor. Same in + ``mmcv.imrotate``. 
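A worked example of the angle computation above, under the defaults shown (illustrative, not part of the patch):

_MAX_LEVEL = 10
level, max_rotate_angle = 5, 30
angle = (level / _MAX_LEVEL) * max_rotate_angle   # level_to_value(5, 30) -> 15.0 degrees
# random_negative(angle, random_negative_prob=0.5) then flips the sign half the time,
# so when the transform fires (prob=0.5) the image is rotated by +15 or -15 degrees.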
+ """ + for key in results.get('img_fields', ['img']): + img = results[key].copy() + img_rotated = mmcv.imrotate( + img, angle, center, scale, border_value=self.img_fill_val) + results[key] = img_rotated.astype(img.dtype) + results['img_shape'] = results[key].shape + + def _rotate_bboxes(self, results, rotate_matrix): + """Rotate the bboxes.""" + h, w, c = results['img_shape'] + for key in results.get('bbox_fields', []): + min_x, min_y, max_x, max_y = np.split( + results[key], results[key].shape[-1], axis=-1) + coordinates = np.stack([[min_x, min_y], [max_x, min_y], + [min_x, max_y], + [max_x, max_y]]) # [4, 2, nb_bbox, 1] + # pad 1 to convert from format [x, y] to homogeneous + # coordinates format [x, y, 1] + coordinates = np.concatenate( + (coordinates, + np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)), + axis=1) # [4, 3, nb_bbox, 1] + coordinates = coordinates.transpose( + (2, 0, 1, 3)) # [nb_bbox, 4, 3, 1] + rotated_coords = np.matmul(rotate_matrix, + coordinates) # [nb_bbox, 4, 2, 1] + rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2] + min_x, min_y = np.min( + rotated_coords[:, :, 0], axis=1), np.min( + rotated_coords[:, :, 1], axis=1) + max_x, max_y = np.max( + rotated_coords[:, :, 0], axis=1), np.max( + rotated_coords[:, :, 1], axis=1) + results[key] = np.stack([min_x, min_y, max_x, max_y], + axis=-1).astype(results[key].dtype) + + def _rotate_keypoints90(self, results, angle): + """Rotate the keypoints, only valid when angle in [-90,90,-180,180]""" + if angle not in [-90, 90, 180, -180 + ] or self.scale != 1 or self.center is not None: + return + for key in results.get('keypoints_fields', []): + k = results[key] + if angle == 90: + w, h, c = results['img'].shape + new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1) + elif angle == -90: + w, h, c = results['img'].shape + new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1) + else: + h, w, c = results['img'].shape + new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]], + axis=-1) + # a kps is invalid if thrid value is -1 + kps_invalid = new[..., -1][:, -1] == -1 + new[kps_invalid] = np.zeros(new.shape[1:]) - 1 + results[key] = new + + def _rotate_masks(self, + results, + angle, + center=None, + scale=1.0, + fill_val=0): + """Rotate the masks.""" + h, w, c = results['img_shape'] + for key in results.get('mask_fields', []): + masks = results[key] + results[key] = masks.rotate((h, w), angle, center, scale, fill_val) + + def _rotate_seg(self, + results, + angle, + center=None, + scale=1.0, + fill_val=255): + """Rotate the segmentation map.""" + for key in results.get('seg_fields', []): + seg = results[key].copy() + results[key] = mmcv.imrotate( + seg, angle, center, scale, + border_value=fill_val).astype(seg.dtype) + + def _filter_invalid(self, results, min_bbox_size=0): + """Filter bboxes and corresponding masks too small after rotate + augmentation.""" + bbox2label, bbox2mask, _ = bbox2fields() + for key in results.get('bbox_fields', []): + bbox_w = results[key][:, 2] - results[key][:, 0] + bbox_h = results[key][:, 3] - results[key][:, 1] + valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size) + valid_inds = np.nonzero(valid_inds)[0] + results[key] = results[key][valid_inds] + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + # mask fields, e.g. 
gt_masks and gt_masks_ignore + mask_key = bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][valid_inds] + + def __call__(self, results): + """Call function to rotate images, bounding boxes, masks and semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Rotated results. + """ + if np.random.rand() > self.prob: + return results + h, w = results['img'].shape[:2] + center = self.center + if center is None: + center = ((w - 1) * 0.5, (h - 1) * 0.5) + angle = random_negative(self.angle, self.random_negative_prob) + self._rotate_img(results, angle, center, self.scale) + rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale) + self._rotate_bboxes(results, rotate_matrix) + self._rotate_keypoints90(results, angle) + self._rotate_masks(results, angle, center, self.scale, fill_val=0) + self._rotate_seg( + results, angle, center, self.scale, fill_val=self.seg_ignore_label) + self._filter_invalid(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(level={self.level}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'center={self.center}, ' + repr_str += f'img_fill_val={self.img_fill_val}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label}, ' + repr_str += f'prob={self.prob}, ' + repr_str += f'max_rotate_angle={self.max_rotate_angle}, ' + repr_str += f'random_negative_prob={self.random_negative_prob})' + return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py new file mode 100644 index 00000000..bd2394a8 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/formating.py @@ -0,0 +1,113 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py +""" +import numpy as np +import torch +from mmcv.parallel import DataContainer as DC +from mmdet.datasets.builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PIPELINES.register_module() +class DefaultFormatBundleV2(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. 
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with \ + default bundle. + """ + + if 'img' in results: + img = results['img'] + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss', + 'gt_labels' + ]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + return results + + def _add_default_meta_keys(self, results): + """Add default meta keys. + + We set default meta keys including `pad_shape`, `scale_factor` and + `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and + `Pad` are implemented during the whole pipeline. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + results (dict): Updated result dict contains the data to convert. + """ + img = results['img'] + results.setdefault('pad_shape', img.shape) + results.setdefault('scale_factor', 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + 'img_norm_cfg', + dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False)) + return results + + def __repr__(self): + return self.__class__.__name__ diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py new file mode 100644 index 00000000..b4c2a385 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/loading.py @@ -0,0 +1,225 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py +""" +import os.path as osp + +import numpy as np +import pycocotools.mask as maskUtils +from mmdet.core import BitmapMasks, PolygonMasks +from mmdet.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class LoadAnnotationsV2(object): + """Load mutiple types of annotations. + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Default: True. + with_label (bool): Whether to parse and load the label annotation. + Default: True. + with_keypoints (bool): Whether to parse and load the keypoints annotation. + Default: False. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. 
Default: False. + poly2mask (bool): Whether to convert the instance masks from polygons + to bitmaps. Default: True. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + with_bbox=True, + with_label=True, + with_keypoints=False, + with_mask=False, + with_seg=False, + poly2mask=True, + file_client_args=dict(backend='disk')): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_keypoints = with_keypoints + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_bboxes(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'].copy() + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + return results + + def _load_keypoints(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. + """ + + ann_info = results['ann_info'] + results['gt_keypointss'] = ann_info['keypointss'].copy() + + results['keypoints_fields'] = ['gt_keypointss'] + return results + + def _load_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded label annotations. + """ + + results['gt_labels'] = results['ann_info']['labels'].copy() + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + """Private function to convert masks represented with polygon to + bitmaps. + + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def process_polygons(self, polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. + """ + + polygons = [np.array(p) for p in polygons] + valid_polygons = [] + for polygon in polygons: + if len(polygon) % 2 == 0 and len(polygon) >= 6: + valid_polygons.append(polygon) + return valid_polygons + + def _load_masks(self, results): + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded mask annotations. 
+ If ``self.poly2mask`` is set ``True``, `gt_mask` will contain + :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. + """ + + h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = results['ann_info']['masks'] + if self.poly2mask: + gt_masks = BitmapMasks( + [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + gt_masks = PolygonMasks( + [self.process_polygons(polygons) for polygons in gt_masks], h, + w) + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + def _load_semantic_seg(self, results): + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. + """ + import mmcv + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + filename = osp.join(results['seg_prefix'], + results['ann_info']['seg_map']) + img_bytes = self.file_client.get(filename) + results['gt_semantic_seg'] = mmcv.imfrombytes( + img_bytes, flag='unchanged').squeeze() + results['seg_fields'].append('gt_semantic_seg') + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_keypoints: + results = self._load_keypoints(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_keypoints={self.with_keypoints}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg})' + repr_str += f'poly2mask={self.poly2mask})' + repr_str += f'poly2mask={self.file_client_args})' + return repr_str diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py new file mode 100755 index 00000000..270c34da --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/pipelines/transforms.py @@ -0,0 +1,737 @@ +""" +The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py +""" +import mmcv +import numpy as np +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from mmdet.datasets.builder import PIPELINES +from numpy import random + + +@PIPELINES.register_module() +class ResizeV2(object): + """Resize images & bbox & mask &kps. + + This transform resizes the input image to some scale. Bboxes and masks are + then resized with the same scale factor. If the input dict contains the key + "scale", then the scale in the input dict is used, otherwise the specified + scale in the init method is used. 
If the input dict contains the key + "scale_factor" (if MultiScaleFlipAug does not give img_scale but + scale_factor), the actual scale will be computed by image shape and + scale_factor. + + `img_scale` can either be a tuple (single-scale) or a list of tuple + (multi-scale). There are 3 multiscale modes: + + - ``ratio_range is not None``: randomly sample a ratio from the ratio \ + range and multiply it with the image scale. + - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ + sample a scale from the multiscale range. + - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ + sample a scale from multiple scales. + + Args: + img_scale (tuple or list[tuple]): Images scales for resizing. + multiscale_mode (str): Either "range" or "value". + ratio_range (tuple[float]): (min_ratio, max_ratio) + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + override (bool, optional): Whether to override `scale` and + `scale_factor` so as to call resize twice. Default False. If True, + after the first resizing, the existed `scale` and `scale_factor` + will be ignored so the second resizing can be allowed. + This option is a work-around for multiple times of resize in DETR. + Defaults to False. + """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True, + bbox_clip_border=True, + backend='cv2', + override=False): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given a scale and a range of image ratio + assert len(self.img_scale) == 1 + else: + # mode 2: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.backend = backend + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + # TODO: refactor the override option in Resize + self.override = override + self.bbox_clip_border = bbox_clip_border + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ + where ``img_scale`` is the selected image scale and \ + ``scale_idx`` is the selected index in the given candidates. + """ + + assert mmcv.is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and uper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where \ + ``img_scale`` is sampled scale and None is just a placeholder \ + to be consistent with :func:`random_select`. 
+ """ + + assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. + ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. + + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where \ + ``scale`` is sampled ratio multiplied with ``img_scale`` and \ + None is just a placeholder to be consistent with \ + :func:`random_select`. + """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. + Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into \ + ``results``, which would be used by subsequent pipelines. 
+ """ + + if self.ratio_range is not None: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + for key in results.get('img_fields', ['img']): + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results[key].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + results[key] = img + + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img_shape'] = img.shape + # in case that there is no padding + results['pad_shape'] = img.shape + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_bboxes(self, results): + """Resize bounding boxes with ``results['scale_factor']``.""" + for key in results.get('bbox_fields', []): + bboxes = results[key] * results['scale_factor'] + if self.bbox_clip_border: + img_shape = results['img_shape'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + results[key] = bboxes + + def _resize_keypoints(self, results): + """Resize keypoints with ``results['scale_factor']``.""" + for key in results.get('keypoints_fields', []): + keypointss = results[key].copy() + factors = results['scale_factor'] + assert factors[0] == factors[2] + assert factors[1] == factors[3] + keypointss[:, :, 0] *= factors[0] + keypointss[:, :, 1] *= factors[1] + if self.bbox_clip_border: + img_shape = results['img_shape'] + keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0, + img_shape[1]) + keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0, + img_shape[0]) + results[key] = keypointss + + def _resize_masks(self, results): + """Resize masks with ``results['scale']``""" + for key in results.get('mask_fields', []): + if results[key] is None: + continue + if self.keep_ratio: + results[key] = results[key].rescale(results['scale']) + else: + results[key] = results[key].resize(results['img_shape'][:2]) + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + else: + gt_seg = mmcv.imresize( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + results['gt_semantic_seg'] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ + 'keep_ratio' keys are added into result dict. 
+ """ + + if 'scale' not in results: + if 'scale_factor' in results: + img_shape = results['img'].shape[:2] + scale_factor = results['scale_factor'] + assert isinstance(scale_factor, float) + results['scale'] = tuple( + [int(x * scale_factor) for x in img_shape][::-1]) + else: + self._random_scale(results) + else: + if not self.override: + assert 'scale_factor' not in results, ( + 'scale and scale_factor cannot be both set.') + else: + results.pop('scale') + if 'scale_factor' in results: + results.pop('scale_factor') + self._random_scale(results) + + self._resize_img(results) + self._resize_bboxes(results) + self._resize_keypoints(results) + self._resize_masks(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'multiscale_mode={self.multiscale_mode}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'keep_ratio={self.keep_ratio})' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlipV2(object): + """Flip the image & bbox & mask & kps. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + When random flip is enabled, ``flip_ratio``/``direction`` can either be a + float/string or tuple of float/string. There are 3 flip modes: + + - ``flip_ratio`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``flip_ratio`` . + E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + - ``flip_ratio`` is float, ``direction`` is list of string: the image wil + be ``direction[i]``ly flipped with probability of + ``flip_ratio/len(direction)``. + E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + - ``flip_ratio`` is list of float, ``direction`` is list of string: + given ``len(flip_ratio) == len(direction)``, the image wil + be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. + E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with probability + of 0.3, vertically with probability of 0.5 + + Args: + flip_ratio (float | list[float], optional): The flipping probability. + Default: None. + direction(str | list[str], optional): The flipping direction. Options + are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. + If input is a list, the length must equal ``flip_ratio``. Each + element in ``flip_ratio`` indicates the flip probability of + corresponding direction. 
+ """ + + def __init__(self, flip_ratio=None, direction='horizontal'): + if isinstance(flip_ratio, list): + assert mmcv.is_list_of(flip_ratio, float) + assert 0 <= sum(flip_ratio) <= 1 + elif isinstance(flip_ratio, float): + assert 0 <= flip_ratio <= 1 + elif flip_ratio is None: + pass + else: + raise ValueError('flip_ratios must be None, float, ' + 'or list of float') + self.flip_ratio = flip_ratio + + valid_directions = ['horizontal', 'vertical', 'diagonal'] + if isinstance(direction, str): + assert direction in valid_directions + elif isinstance(direction, list): + assert mmcv.is_list_of(direction, str) + assert set(direction).issubset(set(valid_directions)) + else: + raise ValueError('direction must be either str or list of str') + self.direction = direction + + if isinstance(flip_ratio, list): + assert len(self.flip_ratio) == len(self.direction) + self.count = 0 + + def bbox_flip(self, bboxes, img_shape, direction): + """Flip bboxes horizontally. + + Args: + bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k) + img_shape (tuple[int]): Image shape (height, width) + direction (str): Flip direction. Options are 'horizontal', + 'vertical'. + + Returns: + numpy.ndarray: Flipped bounding boxes. + """ + + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.copy() + if direction == 'horizontal': + w = img_shape[1] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + elif direction == 'vertical': + h = img_shape[0] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + elif direction == 'diagonal': + w = img_shape[1] + h = img_shape[0] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + else: + raise ValueError(f"Invalid flipping direction '{direction}'") + return flipped + + def keypoints_flip(self, keypointss, img_shape, direction): + """Flip keypoints horizontally.""" + + assert direction == 'horizontal' + assert keypointss.shape[-1] == 3 + num_kps = keypointss.shape[1] + assert num_kps in [4, 5], f'Only Support num_kps=4 or 5, got:{num_kps}' + assert keypointss.ndim == 3 + flipped = keypointss.copy() + if num_kps == 5: + flip_order = [1, 0, 2, 4, 3] + elif num_kps == 4: + flip_order = [3, 2, 1, 0] + for idx, a in enumerate(flip_order): + flipped[:, idx, :] = keypointss[:, a, :] + w = img_shape[1] + flipped[..., 0] = w - flipped[..., 0] + return flipped + + def __call__(self, results): + """Call function to flip bounding boxes, masks, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction' keys are added \ + into result dict. 
+ """ + if 'flip' not in results: + if isinstance(self.direction, list): + # None means non-flip + direction_list = self.direction + [None] + else: + # None means non-flip + direction_list = [self.direction, None] + + if isinstance(self.flip_ratio, list): + non_flip_ratio = 1 - sum(self.flip_ratio) + flip_ratio_list = self.flip_ratio + [non_flip_ratio] + else: + non_flip_ratio = 1 - self.flip_ratio + # exclude non-flip + single_ratio = self.flip_ratio / (len(direction_list) - 1) + flip_ratio_list = [single_ratio] * (len(direction_list) + - 1) + [non_flip_ratio] + + cur_dir = np.random.choice(direction_list, p=flip_ratio_list) + + results['flip'] = cur_dir is not None + if 'flip_direction' not in results: + results['flip_direction'] = cur_dir + if results['flip']: + # flip image + for key in results.get('img_fields', ['img']): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + # flip bboxes + for key in results.get('bbox_fields', []): + results[key] = self.bbox_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip kps + for key in results.get('keypoints_fields', []): + results[key] = self.keypoints_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip masks + for key in results.get('mask_fields', []): + results[key] = results[key].flip(results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' + + +@PIPELINES.register_module() +class RandomSquareCrop(object): + """Random crop the image & bboxes, the cropped patches have minimum IoU + requirement with original image & bboxes, the IoU threshold is randomly + selected from min_ious. + + Args: + min_ious (tuple): minimum IoU threshold for all intersections with + bounding boxes + min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, + where a >= min_crop_size). + + Note: + The keys for bboxes, labels and masks should be paired. That is, \ + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ + `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. + """ + + def __init__(self, + crop_ratio_range=None, + crop_choice=None, + bbox_clip_border=True, + big_face_ratio=0, + big_face_crop_choice=None): + + self.crop_ratio_range = crop_ratio_range + self.crop_choice = crop_choice + self.big_face_crop_choice = big_face_crop_choice + self.bbox_clip_border = bbox_clip_border + + assert (self.crop_ratio_range is None) ^ (self.crop_choice is None) + if self.crop_ratio_range is not None: + self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range + + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + assert big_face_ratio >= 0 and big_face_ratio <= 1.0 + self.big_face_ratio = big_face_ratio + + def __call__(self, results): + """Call function to crop images and bounding boxes with minimum IoU + constraint. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images and bounding boxes cropped, \ + 'img_shape' key is updated. 
+ """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert 'bbox_fields' in results + assert 'gt_bboxes' in results + # try augment big face images + find_bigface = False + if np.random.random() < self.big_face_ratio: + min_size = 100 # h and w + expand_ratio = 0.3 # expand ratio of croped face alongwith both w and h + bbox = results['gt_bboxes'].copy() + lmks = results['gt_keypointss'].copy() + label = results['gt_labels'].copy() + # filter small faces + size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * ( + (bbox[:, 3] - bbox[:, 1]) > min_size) + bbox = bbox[size_mask] + lmks = lmks[size_mask] + label = label[size_mask] + # randomly choose a face that has no overlap with others + if len(bbox) > 0: + overlaps = bbox_overlaps(bbox, bbox) + overlaps -= np.eye(overlaps.shape[0]) + iou_mask = np.sum(overlaps, axis=1) == 0 + bbox = bbox[iou_mask] + lmks = lmks[iou_mask] + label = label[iou_mask] + if len(bbox) > 0: + choice = np.random.randint(len(bbox)) + bbox = bbox[choice] + lmks = lmks[choice] + label = [label[choice]] + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + x1 = bbox[0] - w * expand_ratio + x2 = bbox[2] + w * expand_ratio + y1 = bbox[1] - h * expand_ratio + y2 = bbox[3] + h * expand_ratio + x1, x2 = np.clip([x1, x2], 0, img.shape[1]) + y1, y2 = np.clip([y1, y2], 0, img.shape[0]) + bbox -= np.tile([x1, y1], 2) + lmks -= (x1, y1, 0) + + find_bigface = True + img = img[int(y1):int(y2), int(x1):int(x2), :] + results['gt_bboxes'] = np.expand_dims(bbox, axis=0) + results['gt_keypointss'] = np.expand_dims(lmks, axis=0) + results['gt_labels'] = np.array(label) + results['img'] = img + + boxes = results['gt_bboxes'] + h, w, c = img.shape + + if self.crop_ratio_range is not None: + max_scale = self.crop_ratio_max + else: + max_scale = np.amax(self.crop_choice) + scale_retry = 0 + while True: + scale_retry += 1 + if scale_retry == 1 or max_scale > 1.0: + if self.crop_ratio_range is not None: + scale = np.random.uniform(self.crop_ratio_min, + self.crop_ratio_max) + elif self.crop_choice is not None: + scale = np.random.choice(self.crop_choice) + else: + scale = scale * 1.2 + + if find_bigface: + # select a scale from big_face_crop_choice if in big_face mode + scale = np.random.choice(self.big_face_crop_choice) + + for i in range(250): + long_side = max(w, h) + cw = int(scale * long_side) + ch = cw + + # TODO +1 + if w == cw: + left = 0 + elif w > cw: + left = random.randint(0, w - cw) + else: + left = random.randint(w - cw, 0) + if h == ch: + top = 0 + elif h > ch: + top = random.randint(0, h - ch) + else: + top = random.randint(h - ch, 0) + + patch = np.array( + (int(left), int(top), int(left + cw), int(top + ch)), + dtype=np.int32) + + # center of boxes should inside the crop img + # only adjust boxes and instance masks when the gt is not empty + # adjust boxes + def is_center_of_bboxes_in_patch(boxes, patch): + # TODO >= + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = \ + ((center[:, 0] > patch[0]) + * (center[:, 1] > patch[1]) + * (center[:, 0] < patch[2]) + * (center[:, 1] < patch[3])) + return mask + + mask = is_center_of_bboxes_in_patch(boxes, patch) + if not mask.any(): + continue + for key in results.get('bbox_fields', []): + boxes = results[key].copy() + mask = is_center_of_bboxes_in_patch(boxes, patch) + boxes = boxes[mask] + if self.bbox_clip_border: + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 
2) + + results[key] = boxes + # labels + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][mask] + + # keypoints field + if key == 'gt_bboxes': + for kps_key in results.get('keypoints_fields', []): + keypointss = results[kps_key].copy() + keypointss = keypointss[mask, :, :] + if self.bbox_clip_border: + keypointss[:, :, : + 2] = keypointss[:, :, :2].clip( + max=patch[2:]) + keypointss[:, :, : + 2] = keypointss[:, :, :2].clip( + min=patch[:2]) + keypointss[:, :, 0] -= patch[0] + keypointss[:, :, 1] -= patch[1] + results[kps_key] = keypointss + + # mask fields + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][mask.nonzero() + [0]].crop(patch) + + # adjust the img no matter whether the gt is empty before crop + rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128 + patch_from = patch.copy() + patch_from[0] = max(0, patch_from[0]) + patch_from[1] = max(0, patch_from[1]) + patch_from[2] = min(img.shape[1], patch_from[2]) + patch_from[3] = min(img.shape[0], patch_from[3]) + patch_to = patch.copy() + patch_to[0] = max(0, patch_to[0] * -1) + patch_to[1] = max(0, patch_to[1] * -1) + patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0]) + patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1]) + rimg[patch_to[1]:patch_to[3], + patch_to[0]:patch_to[2], :] = img[ + patch_from[1]:patch_from[3], + patch_from[0]:patch_from[2], :] + img = rimg + results['img'] = img + results['img_shape'] = img.shape + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(min_ious={self.min_iou}, ' + repr_str += f'crop_size={self.crop_size})' + return repr_str diff --git a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py similarity index 97% rename from modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py index bbacd9be..40c440b9 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/datasets/retinaface.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/datasets/retinaface.py @@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset): CLASSES = ('FG', ) def __init__(self, min_size=None, **kwargs): - self.NK = 5 + self.NK = kwargs.pop('num_kps', 5) self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)} self.min_size = min_size self.gt_path = kwargs.get('gt_path') @@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset): if len(values) > 4: if len(values) > 5: kps = np.array( - values[4:19], dtype=np.float32).reshape((self.NK, 3)) + values[4:4 + self.NK * 3], dtype=np.float32).reshape( + (self.NK, 3)) for li in range(kps.shape[0]): if (kps[li, :] == -1).all(): kps[li][2] = 0.0 # weight = 0, ignore diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/backbones/__init__.py rename to 
modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/backbones/resnet.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/resnet.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py similarity index 99% rename from modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py index acc45670..77ec99cf 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/dense_heads/scrfd_head.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/dense_heads/scrfd_head.py @@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead): scale_mode=1, dw_conv=False, use_kps=False, + num_kps=5, loss_kps=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1), **kwargs): @@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead): self.scale_mode = scale_mode self.use_dfl = True self.dw_conv = dw_conv - self.NK = 5 + self.NK = num_kps self.extra_flops = 0.0 if loss_dfl is None or not loss_dfl: self.use_dfl = False @@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead): batch_size, -1, self.cls_out_channels).sigmoid() bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 4) - kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10) - + kps_pred = kps_pred.permute(0, 2, 3, + 1).reshape(batch_size, -1, self.NK * 2) return cls_score, bbox_pred, kps_pred def forward_train(self, @@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead): if self.use_dfl: kps_pred = self.integral(kps_pred) * stride[0] else: - kps_pred = kps_pred.reshape((-1, 10)) * stride[0] + kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0] nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: @@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead): mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) if mlvl_kps is not None: scale_factor2 = torch.tensor( - [scale_factor[0], scale_factor[1]] * 5) + [scale_factor[0], scale_factor[1]] * self.NK) mlvl_kps /= scale_factor2.to(mlvl_kps.device) mlvl_scores = torch.cat(mlvl_scores) diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py similarity index 100% rename from modelscope/models/cv/face_detection/mmdet_patch/models/detectors/__init__.py rename to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/__init__.py diff --git a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py similarity index 50% rename from modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py rename 
to modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py index a5f5cac2..18b46be1 100755 --- a/modelscope/models/cv/face_detection/mmdet_patch/models/detectors/scrfd.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/scrfd.py @@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector): gt_bboxes_ignore) return losses - def simple_test(self, img, img_metas, rescale=False): + def simple_test(self, + img, + img_metas, + rescale=False, + repeat_head=1, + output_kps_var=0, + output_results=1): """Test function without test time augmentation. Args: @@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector): img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. Defaults to False. + repeat_head (int): repeat inference times in head + output_kps_var (int): whether output kps var to calculate quality + output_results (int): 0: nothing 1: bbox 2: both bbox and kps Returns: list[list[np.ndarray]]: BBox results of each image and classes. @@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector): corresponds to each class. """ x = self.extract_feat(img) - outs = self.bbox_head(x) - if torch.onnx.is_in_onnx_export(): - print('single_stage.py in-onnx-export') - print(outs.__class__) - cls_score, bbox_pred, kps_pred = outs - for c in cls_score: - print(c.shape) - for c in bbox_pred: - print(c.shape) - if self.bbox_head.use_kps: - for c in kps_pred: - print(c.shape) - return (cls_score, bbox_pred, kps_pred) - else: - return (cls_score, bbox_pred) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) + assert repeat_head >= 1 + kps_out0 = [] + kps_out1 = [] + kps_out2 = [] + for i in range(repeat_head): + outs = self.bbox_head(x) + kps_out0 += [outs[2][0].detach().cpu().numpy()] + kps_out1 += [outs[2][1].detach().cpu().numpy()] + kps_out2 += [outs[2][2].detach().cpu().numpy()] + if output_kps_var: + var0 = np.var(np.vstack(kps_out0), axis=0).mean() + var1 = np.var(np.vstack(kps_out1), axis=0).mean() + var2 = np.var(np.vstack(kps_out2), axis=0).mean() + var = np.mean([var0, var1, var2]) + else: + var = None - # return kps if use_kps - if len(bbox_list[0]) == 2: - bbox_results = [ - bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) - for det_bboxes, det_labels in bbox_list - ] - elif len(bbox_list[0]) == 3: - bbox_results = [ - bbox2result( - det_bboxes, - det_labels, - self.bbox_head.num_classes, - kps=det_kps) - for det_bboxes, det_labels, det_kps in bbox_list - ] - return bbox_results + if output_results > 0: + if torch.onnx.is_in_onnx_export(): + print('single_stage.py in-onnx-export') + print(outs.__class__) + cls_score, bbox_pred, kps_pred = outs + for c in cls_score: + print(c.shape) + for c in bbox_pred: + print(c.shape) + if self.bbox_head.use_kps: + for c in kps_pred: + print(c.shape) + return (cls_score, bbox_pred, kps_pred) + else: + return (cls_score, bbox_pred) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + + # return kps if use_kps + if len(bbox_list[0]) == 2: + bbox_results = [ + bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes) + for det_bboxes, det_labels in bbox_list + ] + elif len(bbox_list[0]) == 3: + if output_results == 2: + bbox_results = [ + bbox2result( + det_bboxes, + det_labels, + self.bbox_head.num_classes, + kps=det_kps, + num_kps=self.bbox_head.NK) + for det_bboxes, det_labels, det_kps in bbox_list + ] + elif output_results == 1: + bbox_results = [ + bbox2result(det_bboxes, det_labels, + 
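# Sketch of the keypoint-variance idea added to simple_test above: run the detection head
# several times (repeat_head > 1) and use the spread of the predicted keypoints as a rough
# stability / quality score. The random "head" below is a stand-in for illustration only,
# not the SCRFD head.
import numpy as np

def kps_variance_score(head_fn, feats, repeats=3):
    preds = [head_fn(feats) for _ in range(repeats)]      # each: (num_anchors, NK * 2)
    return np.var(np.vstack(preds), axis=0).mean()        # larger == less stable keypoints

rng = np.random.default_rng(0)
fake_head = lambda f: f + rng.normal(scale=0.1, size=f.shape)  # pretend stochastic head
score = kps_variance_score(fake_head, np.zeros((100, 10)))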
self.bbox_head.num_classes) + for det_bboxes, det_labels, _ in bbox_list + ] + else: + bbox_results = None + if var is not None: + return bbox_results, var + else: + return bbox_results def feature_test(self, img): x = self.extract_feat(img) diff --git a/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py b/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py new file mode 100644 index 00000000..59611604 --- /dev/null +++ b/modelscope/models/cv/face_detection/scrfd/scrfd_detect.py @@ -0,0 +1,71 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from copy import deepcopy +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['ScrfdDetect'] + + +@MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd) +class ScrfdDetect(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the face detection model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + from mmcv import Config + from mmcv.parallel import MMDataParallel + from mmcv.runner import load_checkpoint + from mmdet.models import build_detector + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD + cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py')) + ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3) + detector = build_detector(cfg.model) + logger.info(f'loading model from {ckpt_path}') + device = torch.device( + f'cuda:{0}' if torch.cuda.is_available() else 'cpu') + load_checkpoint(detector, ckpt_path, map_location=device) + detector = MMDataParallel(detector, device_ids=[0]) + detector.eval() + self.detector = detector + logger.info('load model done') + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + result = self.detector( + return_loss=False, + rescale=True, + img=[input['img'][0].unsqueeze(0)], + img_metas=[[dict(input['img_metas'][0].data)]], + output_results=2) + assert result is not None + result = result[0][0] + bboxes = result[:, :4].tolist() + kpss = result[:, 5:].tolist() + scores = result[:, 4].tolist() + return { + OutputKeys.SCORES: scores, + OutputKeys.BOXES: bboxes, + OutputKeys.KEYPOINTS: kpss + } + + def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/outputs.py b/modelscope/outputs.py index ab3ea54a..3001c03c 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -90,6 +90,25 @@ TASK_OUTPUTS = { Tasks.face_detection: [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # card detection result for single sample + # { + # "scores": [0.9, 0.1, 0.05, 0.05] + # "boxes": [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # 
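# The ScrfdDetect.forward above splits each raw detection row into scores, boxes and
# keypoints. A small illustration of that slicing convention (x1, y1, x2, y2, score,
# then NK (x, y) pairs); the single row below is made-up data with NK = 5.
import numpy as np

result = np.array([[10., 10., 110., 80., 0.98,
                    30., 30., 90., 30., 60., 50., 40., 70., 80., 70.]])
bboxes = result[:, :4].tolist()   # -> OutputKeys.BOXES
scores = result[:, 4].tolist()    # -> OutputKeys.SCORES
kpss = result[:, 5:].tolist()     # -> OutputKeys.KEYPOINTS (10 values per face for NK = 5)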
], + # "keypoints": [ + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # [x1, y1, x2, y2, x3, y3, x4, y4], + # ], + # } + Tasks.card_detection: + [OutputKeys.SCORES, OutputKeys.BOXES, OutputKeys.KEYPOINTS], + # facial expression recognition result for single sample # { # "scores": [0.9, 0.1, 0.02, 0.02, 0.02, 0.02, 0.02], diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index bc9073bc..174d10b1 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -116,6 +116,10 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.hand_2d_keypoints: (Pipelines.hand_2d_keypoints, 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody'), + Tasks.face_detection: (Pipelines.face_detection, + 'damo/cv_resnet_facedetection_scrfd10gkps'), + Tasks.card_detection: (Pipelines.card_detection, + 'damo/cv_resnet_carddetection_scrfd34gkps'), Tasks.face_detection: (Pipelines.face_detection, 'damo/cv_resnet101_face-detection_cvpr22papermogface'), diff --git a/modelscope/pipelines/cv/card_detection_pipeline.py b/modelscope/pipelines/cv/card_detection_pipeline.py new file mode 100644 index 00000000..00b18024 --- /dev/null +++ b/modelscope/pipelines/cv/card_detection_pipeline.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Pipelines +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.cv.face_detection_pipeline import \ + FaceDetectionPipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.card_detection, module_name=Pipelines.card_detection) +class CardDetectionPipeline(FaceDetectionPipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a card detection pipeline for prediction + Args: + model: model id on modelscope hub. + """ + thr = 0.45 # card/face detect use different threshold + super().__init__(model=model, score_thr=thr, **kwargs) diff --git a/modelscope/pipelines/cv/face_detection_pipeline.py b/modelscope/pipelines/cv/face_detection_pipeline.py index eff5b70f..608567a4 100644 --- a/modelscope/pipelines/cv/face_detection_pipeline.py +++ b/modelscope/pipelines/cv/face_detection_pipeline.py @@ -8,6 +8,7 @@ import PIL import torch from modelscope.metainfo import Pipelines +from modelscope.models.cv.face_detection import ScrfdDetect from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, Pipeline from modelscope.pipelines.builder import PIPELINES @@ -29,27 +30,8 @@ class FaceDetectionPipeline(Pipeline): model: model id on modelscope hub. 
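# Usage sketch for the pipeline registrations above: card detection reuses the face
# detection pipeline (and ScrfdDetect), differing only in the score threshold (0.3 by
# default for faces, 0.45 passed in by CardDetectionPipeline). The model ids and image
# paths are the ones used elsewhere in this patch.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_detection = pipeline(
    Tasks.face_detection,
    model='damo/cv_resnet_facedetection_scrfd10gkps',
    model_revision='v2')
card_detection = pipeline(
    Tasks.card_detection,
    model='damo/cv_resnet_carddetection_scrfd34gkps')
face_result = face_detection('data/test/images/face_detection2.jpeg')
card_result = card_detection('data/test/images/card_detection.jpg')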
""" super().__init__(model=model, **kwargs) - from mmcv import Config - from mmcv.parallel import MMDataParallel - from mmcv.runner import load_checkpoint - from mmdet.models import build_detector - from modelscope.models.cv.face_detection.mmdet_patch.datasets import RetinaFaceDataset - from modelscope.models.cv.face_detection.mmdet_patch.datasets.pipelines import RandomSquareCrop - from modelscope.models.cv.face_detection.mmdet_patch.models.backbones import ResNetV1e - from modelscope.models.cv.face_detection.mmdet_patch.models.dense_heads import SCRFDHead - from modelscope.models.cv.face_detection.mmdet_patch.models.detectors import SCRFD - cfg = Config.fromfile(osp.join(model, 'mmcv_scrfd_10g_bnkps.py')) - detector = build_detector( - cfg.model, train_cfg=None, test_cfg=cfg.test_cfg) - ckpt_path = osp.join(model, ModelFile.TORCH_MODEL_BIN_FILE) - logger.info(f'loading model from {ckpt_path}') - device = torch.device( - f'cuda:{0}' if torch.cuda.is_available() else 'cpu') - load_checkpoint(detector, ckpt_path, map_location=device) - detector = MMDataParallel(detector, device_ids=[0]) - detector.eval() + detector = ScrfdDetect(model_dir=model, **kwargs) self.detector = detector - logger.info('load model done') def preprocess(self, input: Input) -> Dict[str, Any]: img = LoadImage.convert_to_ndarray(input) @@ -85,22 +67,7 @@ class FaceDetectionPipeline(Pipeline): return result def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - - result = self.detector( - return_loss=False, - rescale=True, - img=[input['img'][0].unsqueeze(0)], - img_metas=[[dict(input['img_metas'][0].data)]]) - assert result is not None - result = result[0][0] - bboxes = result[:, :4].tolist() - kpss = result[:, 5:].tolist() - scores = result[:, 4].tolist() - return { - OutputKeys.SCORES: scores, - OutputKeys.BOXES: bboxes, - OutputKeys.KEYPOINTS: kpss - } + return self.detector(input) def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: return inputs diff --git a/modelscope/pipelines/cv/face_recognition_pipeline.py b/modelscope/pipelines/cv/face_recognition_pipeline.py index 873e4a1f..abae69d4 100644 --- a/modelscope/pipelines/cv/face_recognition_pipeline.py +++ b/modelscope/pipelines/cv/face_recognition_pipeline.py @@ -49,7 +49,7 @@ class FaceRecognitionPipeline(Pipeline): # face detect pipeline det_model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' self.face_detection = pipeline( - Tasks.face_detection, model=det_model_id) + Tasks.face_detection, model=det_model_id, model_revision='v2') def _choose_face(self, det_result, diff --git a/modelscope/trainers/cv/card_detection_scrfd_trainer.py b/modelscope/trainers/cv/card_detection_scrfd_trainer.py new file mode 100644 index 00000000..e1f81bcf --- /dev/null +++ b/modelscope/trainers/cv/card_detection_scrfd_trainer.py @@ -0,0 +1,18 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from modelscope.metainfo import Trainers +from modelscope.trainers.builder import TRAINERS +from modelscope.trainers.cv.face_detection_scrfd_trainer import \ + FaceDetectionScrfdTrainer + + +@TRAINERS.register_module(module_name=Trainers.card_detection_scrfd) +class CardDetectionScrfdTrainer(FaceDetectionScrfdTrainer): + + def __init__(self, cfg_file: str, *args, **kwargs): + """ High-level finetune api for SCRFD. + + Args: + cfg_file: Path to configuration file. 
+ """ + # card/face dataset use different img folder names + super().__init__(cfg_file, imgdir_name='', **kwargs) diff --git a/modelscope/trainers/cv/face_detection_scrfd_trainer.py b/modelscope/trainers/cv/face_detection_scrfd_trainer.py new file mode 100644 index 00000000..9cfae7dd --- /dev/null +++ b/modelscope/trainers/cv/face_detection_scrfd_trainer.py @@ -0,0 +1,154 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import copy +import os +import os.path as osp +import time +from typing import Callable, Dict, Optional + +from modelscope.metainfo import Trainers +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS + + +@TRAINERS.register_module(module_name=Trainers.face_detection_scrfd) +class FaceDetectionScrfdTrainer(BaseTrainer): + + def __init__(self, + cfg_file: str, + cfg_modify_fn: Optional[Callable] = None, + *args, + **kwargs): + """ High-level finetune api for SCRFD. + + Args: + cfg_file: Path to configuration file. + cfg_modify_fn: An input fn which is used to modify the cfg read out of the file. + """ + import mmcv + from mmcv.runner import get_dist_info, init_dist + from mmcv.utils import get_git_hash + from mmdet.utils import collect_env, get_root_logger + from mmdet.apis import set_random_seed + from mmdet.models import build_detector + from mmdet.datasets import build_dataset + from mmdet import __version__ + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import DefaultFormatBundleV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import LoadAnnotationsV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RotateV2 + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead + from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD + super().__init__(cfg_file) + cfg = self.cfg + if 'work_dir' in kwargs: + cfg.work_dir = kwargs['work_dir'] + else: + # use config filename as default work_dir if work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(cfg_file))[0]) + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + + if 'resume_from' in kwargs: # pretrain model for finetune + cfg.resume_from = kwargs['resume_from'] + cfg.device = 'cuda' + if 'gpu_ids' in kwargs: + cfg.gpu_ids = kwargs['gpu_ids'] + else: + cfg.gpu_ids = range(1) + labelfile_name = kwargs.pop('labelfile_name', 'labelv2.txt') + imgdir_name = kwargs.pop('imgdir_name', 'images/') + if 'train_root' in kwargs: + cfg.data.train.ann_file = kwargs['train_root'] + labelfile_name + cfg.data.train.img_prefix = kwargs['train_root'] + imgdir_name + if 'val_root' in kwargs: + cfg.data.val.ann_file = kwargs['val_root'] + labelfile_name + cfg.data.val.img_prefix = kwargs['val_root'] + imgdir_name + if 'total_epochs' in kwargs: + cfg.total_epochs = kwargs['total_epochs'] + if cfg_modify_fn is not None: + cfg = cfg_modify_fn(cfg) + if 'launcher' in kwargs: + distributed = True + init_dist(kwargs['launcher'], **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + else: + distributed = False + # no_validate=True 
will not evaluate checkpoint during training + cfg.no_validate = kwargs.get('no_validate', False) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if 'seed' in kwargs: + cfg.seed = kwargs['seed'] + _deterministic = kwargs.get('deterministic', False) + logger.info(f'Set random seed to {kwargs["seed"]}, ' + f'deterministic: {_deterministic}') + set_random_seed(kwargs['seed'], deterministic=_deterministic) + else: + cfg.seed = None + meta['seed'] = cfg.seed + meta['exp_name'] = osp.basename(cfg_file) + + model = build_detector(cfg.model) + model.init_weights() + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset.pipeline = cfg.data.train.pipeline + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=__version__ + get_git_hash()[:7], + CLASSES=datasets[0].CLASSES) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + + self.cfg = cfg + self.datasets = datasets + self.model = model + self.distributed = distributed + self.timestamp = timestamp + self.meta = meta + self.logger = logger + + def train(self, *args, **kwargs): + from mmdet.apis import train_detector + train_detector( + self.model, + self.datasets, + self.cfg, + distributed=self.distributed, + validate=(not self.cfg.no_validate), + timestamp=self.timestamp, + meta=self.meta) + + def evaluate(self, + checkpoint_path: str = None, + *args, + **kwargs) -> Dict[str, float]: + cfg = self.cfg.evaluation + logger.info(f'eval cfg {cfg}') + logger.info(f'checkpoint_path {checkpoint_path}') diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 4fa3d766..5f0532ce 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -19,6 +19,7 @@ class CVTasks(object): # human face body related animal_recognition = 'animal-recognition' face_detection = 'face-detection' + card_detection = 'card-detection' face_recognition = 'face-recognition' facial_expression_recognition = 'facial-expression-recognition' face_2d_keypoints = 'face-2d-keypoints' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 06a9bbaa..2d420892 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -154,6 +154,54 @@ def draw_face_detection_result(img_path, detection_result): return img +def draw_card_detection_result(img_path, detection_result): + + def warp_img(src_img, kps, ratio): + short_size = 500 + if ratio > 1: + obj_h = short_size + obj_w = int(obj_h * ratio) + else: + obj_w = short_size + obj_h = int(obj_w / ratio) + 
input_pts = np.float32([kps[0], kps[1], kps[2], kps[3]]) + output_pts = np.float32([[0, obj_h - 1], [0, 0], [obj_w - 1, 0], + [obj_w - 1, obj_h - 1]]) + M = cv2.getPerspectiveTransform(input_pts, output_pts) + obj_img = cv2.warpPerspective(src_img, M, (obj_w, obj_h)) + return obj_img + + bboxes = np.array(detection_result[OutputKeys.BOXES]) + kpss = np.array(detection_result[OutputKeys.KEYPOINTS]) + scores = np.array(detection_result[OutputKeys.SCORES]) + img_list = [] + ver_col = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (0, 255, 255)] + img = cv2.imread(img_path) + img_list += [img] + assert img is not None, f"Can't read img: {img_path}" + for i in range(len(scores)): + bbox = bboxes[i].astype(np.int32) + kps = kpss[i].reshape(-1, 2).astype(np.int32) + _w = (kps[0][0] - kps[3][0])**2 + (kps[0][1] - kps[3][1])**2 + _h = (kps[0][0] - kps[1][0])**2 + (kps[0][1] - kps[1][1])**2 + ratio = 1.59 if _w >= _h else 1 / 1.59 + card_img = warp_img(img, kps, ratio) + img_list += [card_img] + score = scores[i] + x1, y1, x2, y2 = bbox + cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 4) + for k, kp in enumerate(kps): + cv2.circle(img, tuple(kp), 1, color=ver_col[k], thickness=10) + cv2.putText( + img, + f'{score:.2f}', (x1, y2), + 1, + 1.0, (0, 255, 0), + thickness=1, + lineType=8) + return img_list + + def created_boxed_image(image_in, box): image = load_image(image_in) img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) diff --git a/tests/pipelines/test_card_detection.py b/tests/pipelines/test_card_detection.py new file mode 100644 index 00000000..d913f494 --- /dev/null +++ b/tests/pipelines/test_card_detection.py @@ -0,0 +1,66 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +import unittest + +import cv2 + +from modelscope.msdatasets import MsDataset +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import draw_card_detection_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class CardDetectionTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.card_detection + self.model_id = 'damo/cv_resnet_carddetection_scrfd34gkps' + + def show_result(self, img_path, detection_result): + img_list = draw_card_detection_result(img_path, detection_result) + for i, img in enumerate(img_list): + if i == 0: + cv2.imwrite('result.jpg', img_list[0]) + print( + f'Found {len(img_list)-1} cards, output written to {osp.abspath("result.jpg")}' + ) + else: + cv2.imwrite(f'card_{i}.jpg', img_list[i]) + save_path = osp.abspath(f'card_{i}.jpg') + print(f'detect card_{i}: {save_path}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_dataset(self): + input_location = ['data/test/images/card_detection.jpg'] + + dataset = MsDataset.load(input_location, target='image') + card_detection = pipeline(Tasks.card_detection, model=self.model_id) + # note that for dataset output, the inference-output is a Generator that can be iterated. 
+ result = card_detection(dataset) + result = next(result) + self.show_result(input_location[0], result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + card_detection = pipeline(Tasks.card_detection, model=self.model_id) + img_path = 'data/test/images/card_detection.jpg' + + result = card_detection(img_path) + self.show_result(img_path, result) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_modelhub_default_model(self): + card_detection = pipeline(Tasks.card_detection) + img_path = 'data/test/images/card_detection.jpg' + result = card_detection(img_path) + self.show_result(img_path, result) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/pipelines/test_face_detection.py b/tests/pipelines/test_face_detection.py index f89e9a94..31ae403e 100644 --- a/tests/pipelines/test_face_detection.py +++ b/tests/pipelines/test_face_detection.py @@ -25,10 +25,11 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_run_with_dataset(self): - input_location = ['data/test/images/face_detection.png'] + input_location = ['data/test/images/face_detection2.jpeg'] dataset = MsDataset.load(input_location, target='image') - face_detection = pipeline(Tasks.face_detection, model=self.model_id) + face_detection = pipeline( + Tasks.face_detection, model=self.model_id, model_revision='v2') # note that for dataset output, the inference-output is a Generator that can be iterated. result = face_detection(dataset) result = next(result) @@ -36,8 +37,9 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_modelhub(self): - face_detection = pipeline(Tasks.face_detection, model=self.model_id) - img_path = 'data/test/images/face_detection.png' + face_detection = pipeline( + Tasks.face_detection, model=self.model_id, model_revision='v2') + img_path = 'data/test/images/face_detection2.jpeg' result = face_detection(img_path) self.show_result(img_path, result) @@ -45,7 +47,7 @@ class FaceDetectionTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_modelhub_default_model(self): face_detection = pipeline(Tasks.face_detection) - img_path = 'data/test/images/face_detection.png' + img_path = 'data/test/images/face_detection2.jpeg' result = face_detection(img_path) self.show_result(img_path, result) diff --git a/tests/trainers/test_card_detection_scrfd_trainer.py b/tests/trainers/test_card_detection_scrfd_trainer.py new file mode 100644 index 00000000..af87000b --- /dev/null +++ b/tests/trainers/test_card_detection_scrfd_trainer.py @@ -0,0 +1,151 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import DistributedTestCase, test_level + + +def _setup(): + model_id = 'damo/cv_resnet_carddetection_scrfd34gkps' + # mini dataset only for unit test, remove '_mini' for full dataset. + ms_ds_syncards = MsDataset.load( + 'SyntheticCards_mini', namespace='shaoxuan') + + data_path = ms_ds_syncards.config_kwargs['split_config'] + train_dir = data_path['train'] + val_dir = data_path['validation'] + train_root = train_dir + '/' + os.listdir(train_dir)[0] + '/' + val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/' + max_epochs = 1 # run epochs in unit test + + cache_path = snapshot_download(model_id) + + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + return train_root, val_root, max_epochs, cache_path, tmp_dir + + +def train_func(**kwargs): + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + + +class TestCardDetectionScrfdTrainerSingleGPU(unittest.TestCase): + + def setUp(self): + print(('SingleGPU Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def _cfg_modify_fn(self, cfg): + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 # batch size + return cfg + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.card_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class TestCardDetectionScrfdTrainerMultiGpus(DistributedTestCase): + + def setUp(self): + print(('MultiGPUs 
Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + cfg_file_path = os.path.join(self.cache_path, 'mmcv_scrfd.py') + cfg = Config.from_file(cfg_file_path) + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 + cfg.dump(cfg_file_path) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_multi_gpus_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + launcher='pytorch') + self.start(train_func, num_gpus=2, **kwargs) + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/trainers/test_face_detection_scrfd_trainer.py b/tests/trainers/test_face_detection_scrfd_trainer.py new file mode 100644 index 00000000..eb9440ef --- /dev/null +++ b/tests/trainers/test_face_detection_scrfd_trainer.py @@ -0,0 +1,150 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile +from modelscope.utils.test_utils import DistributedTestCase, test_level + + +def _setup(): + model_id = 'damo/cv_resnet_facedetection_scrfd10gkps' + # mini dataset only for unit test, remove '_mini' for full dataset. 
+ ms_ds_widerface = MsDataset.load('WIDER_FACE_mini', namespace='shaoxuan') + + data_path = ms_ds_widerface.config_kwargs['split_config'] + train_dir = data_path['train'] + val_dir = data_path['validation'] + train_root = train_dir + '/' + os.listdir(train_dir)[0] + '/' + val_root = val_dir + '/' + os.listdir(val_dir)[0] + '/' + max_epochs = 1 # run epochs in unit test + + cache_path = snapshot_download(model_id, revision='v2') + + tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(tmp_dir): + os.makedirs(tmp_dir) + return train_root, val_root, max_epochs, cache_path, tmp_dir + + +def train_func(**kwargs): + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + + +class TestFaceDetectionScrfdTrainerSingleGPU(unittest.TestCase): + + def setUp(self): + print(('SingleGPU Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + def _cfg_modify_fn(self, cfg): + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 # batch size + return cfg + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_from_scratch(self): + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_trainer_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + cfg_modify_fn=self._cfg_modify_fn) + + trainer = build_trainer( + name=Trainers.face_detection_scrfd, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +@unittest.skipIf(not torch.cuda.is_available() + or torch.cuda.device_count() <= 1, 'distributed unittest') +class TestFaceDetectionScrfdTrainerMultiGpus(DistributedTestCase): + + def setUp(self): + print(('MultiGPUs Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.train_root, self.val_root, self.max_epochs, self.cache_path, self.tmp_dir = _setup( + ) + cfg_file_path = os.path.join(self.cache_path, 'mmcv_scrfd.py') + cfg = Config.from_file(cfg_file_path) + cfg.checkpoint_config.interval = 1 + cfg.log_config.interval = 10 + cfg.evaluation.interval = 1 + cfg.data.workers_per_gpu = 3 + cfg.data.samples_per_gpu = 4 + cfg.dump(cfg_file_path) + + def tearDown(self): + shutil.rmtree(self.tmp_dir) + super().tearDown() + + @unittest.skipUnless(test_level() >= 1, 'skip test in current 
test level') + def test_multi_gpus_finetune(self): + pretrain_epoch = 640 + self.max_epochs += pretrain_epoch + kwargs = dict( + cfg_file=os.path.join(self.cache_path, 'mmcv_scrfd.py'), + work_dir=self.tmp_dir, + train_root=self.train_root, + val_root=self.val_root, + total_epochs=self.max_epochs, + resume_from=os.path.join(self.cache_path, + ModelFile.TORCH_MODEL_BIN_FILE), + launcher='pytorch') + self.start(train_func, num_gpus=2, **kwargs) + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + for i in range(pretrain_epoch, self.max_epochs): + self.assertIn(f'epoch_{i+1}.pth', results_files) + + +if __name__ == '__main__': + unittest.main() From 3863efc14d6da1786a93e7652d949d8d55ae8624 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 13 Oct 2022 10:15:33 +0800 Subject: [PATCH 25/57] [to #42322933] add far field KWS trainer Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10275823 --- data/test/audios/noise_2ch.wav | 3 + .../test/audios/wake_word_with_label_xyxy.wav | 3 + modelscope/metainfo.py | 1 + modelscope/models/audio/kws/farfield/model.py | 63 ++-- .../task_datasets/audio/__init__.py | 21 ++ .../audio/kws_farfield_dataset.py | 280 ++++++++++++++++++ .../trainers/audio/kws_farfield_trainer.py | 279 +++++++++++++++++ modelscope/utils/audio/audio_utils.py | 18 ++ requirements/audio.txt | 6 +- .../audio/test_kws_farfield_trainer.py | 85 ++++++ 10 files changed, 721 insertions(+), 38 deletions(-) create mode 100644 data/test/audios/noise_2ch.wav create mode 100644 data/test/audios/wake_word_with_label_xyxy.wav create mode 100644 modelscope/msdatasets/task_datasets/audio/__init__.py create mode 100644 modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py create mode 100644 modelscope/trainers/audio/kws_farfield_trainer.py create mode 100644 tests/trainers/audio/test_kws_farfield_trainer.py diff --git a/data/test/audios/noise_2ch.wav b/data/test/audios/noise_2ch.wav new file mode 100644 index 00000000..c754e39a --- /dev/null +++ b/data/test/audios/noise_2ch.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d +size 2327764 diff --git a/data/test/audios/wake_word_with_label_xyxy.wav b/data/test/audios/wake_word_with_label_xyxy.wav new file mode 100644 index 00000000..b7999777 --- /dev/null +++ b/data/test/audios/wake_word_with_label_xyxy.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150 +size 68524 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 0917bf3e..46c3b138 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -285,6 +285,7 @@ class Trainers(object): # audio trainers speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' class Preprocessors(object): diff --git a/modelscope/models/audio/kws/farfield/model.py b/modelscope/models/audio/kws/farfield/model.py index fea82194..d63d1e2a 100644 --- a/modelscope/models/audio/kws/farfield/model.py +++ b/modelscope/models/audio/kws/farfield/model.py @@ -1,15 +1,14 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
import os -from typing import Dict - -import torch +from typing import Dict, Optional from modelscope.metainfo import Models from modelscope.models import TorchModel from modelscope.models.base import Tensor from modelscope.models.builder import MODELS -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.constant import Tasks from .fsmn_sele_v2 import FSMNSeleNetV2 @@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel): MODEL_TXT = 'model.txt' SC_CONFIG = 'sound_connect.conf' - SC_CONF_ITEM_KWS_MODEL = '${kws_model}' - def __init__(self, model_dir: str, *args, **kwargs): + def __init__(self, + model_dir: str, + training: Optional[bool] = False, + *args, + **kwargs): """initialize the dfsmn model from the `model_dir` path. Args: model_dir (str): the model path. """ super().__init__(model_dir, *args, **kwargs) - sc_config_file = os.path.join(model_dir, self.SC_CONFIG) - model_txt_file = os.path.join(model_dir, self.MODEL_TXT) - model_bin_file = os.path.join(model_dir, - ModelFile.TORCH_MODEL_BIN_FILE) - self._model = None - if os.path.exists(model_bin_file): - kwargs.pop('device') - self._model = FSMNSeleNetV2(*args, **kwargs) - checkpoint = torch.load(model_bin_file) - self._model.load_state_dict(checkpoint, strict=False) - - self._sc = None - if os.path.exists(model_txt_file): - with open(sc_config_file) as f: - lines = f.readlines() - with open(sc_config_file, 'w') as f: - for line in lines: - if self.SC_CONF_ITEM_KWS_MODEL in line: - line = line.replace(self.SC_CONF_ITEM_KWS_MODEL, - model_txt_file) - f.write(line) - import py_sound_connect - self._sc = py_sound_connect.SoundConnect(sc_config_file) - self.size_in = self._sc.bytesPerBlockIn() - self.size_out = self._sc.bytesPerBlockOut() - - if self._model is None and self._sc is None: - raise Exception( - f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.' - ) + if training: + self.model = FSMNSeleNetV2(*args, **kwargs) + else: + sc_config_file = os.path.join(model_dir, self.SC_CONFIG) + model_txt_file = os.path.join(model_dir, self.MODEL_TXT) + self._sc = None + if os.path.exists(model_txt_file): + conf_dict = dict(mode=56542, kws_model=model_txt_file) + update_conf(sc_config_file, sc_config_file, conf_dict) + import py_sound_connect + self._sc = py_sound_connect.SoundConnect(sc_config_file) + self.size_in = self._sc.bytesPerBlockIn() + self.size_out = self._sc.bytesPerBlockOut() + else: + raise Exception( + f'Invalid model directory! Failed to load model file: {model_txt_file}.' + ) def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]: - ... + return self.model.forward(input) def forward_decode(self, data: bytes): result = {'pcm': self._sc.process(data, self.size_out)} diff --git a/modelscope/msdatasets/task_datasets/audio/__init__.py b/modelscope/msdatasets/task_datasets/audio/__init__.py new file mode 100644 index 00000000..c62a8d9c --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
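# A hedged sketch of what update_conf(...) above is used for: filling ${placeholder}
# entries in a sound_connect config template with concrete values (it replaces the old
# hand-rolled '${kws_model}' string substitution). The real helper lives in
# modelscope.utils.audio.audio_utils; this standalone version only illustrates the idea
# and is an assumption, not the actual implementation.
def update_conf_sketch(template_path, out_path, conf_dict):
    with open(template_path) as f:
        content = f.read()
    for key, value in conf_dict.items():
        content = content.replace('${' + key + '}', str(value))
    with open(out_path, 'w') as f:
        f.write(content)

# e.g. update_conf_sketch(sc_config_file, sc_config_file,
#                         dict(mode=56542, kws_model=model_txt_file))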
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .kws_farfield_dataset import KWSDataset, KWSDataLoader + +else: + _import_structure = { + 'kws_farfield_dataset': ['KWSDataset', 'KWSDataLoader'], + } + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py b/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py new file mode 100644 index 00000000..8c518ec9 --- /dev/null +++ b/modelscope/msdatasets/task_datasets/audio/kws_farfield_dataset.py @@ -0,0 +1,280 @@ +""" +Used to prepare simulated data. +""" +import math +import os.path +import queue +import threading +import time + +import numpy as np +import torch + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +BLOCK_DEC = 2 +BLOCK_CAT = 3 +FBANK_SIZE = 40 +LABEL_SIZE = 1 +LABEL_GAIN = 100.0 + + +class KWSDataset: + """ + dataset for keyword spotting and vad + conf_basetrain: basetrain configure file path + conf_finetune: finetune configure file path, null allowed + numworkers: no. of workers + basetrainratio: basetrain workers ratio + numclasses: no. of nn output classes, 2 classes to generate vad label + blockdec: block decimation + blockcat: block concatenation + """ + + def __init__(self, + conf_basetrain, + conf_finetune, + numworkers, + basetrainratio, + numclasses, + blockdec=BLOCK_CAT, + blockcat=BLOCK_CAT): + super().__init__() + self.numclasses = numclasses + self.blockdec = blockdec + self.blockcat = blockcat + self.sims_base = [] + self.sims_senior = [] + self.setup_sims(conf_basetrain, conf_finetune, numworkers, + basetrainratio) + + def release(self): + for sim in self.sims_base: + del sim + for sim in self.sims_senior: + del sim + del self.base_conf + del self.senior_conf + logger.info('KWSDataset: Released.') + + def setup_sims(self, conf_basetrain, conf_finetune, numworkers, + basetrainratio): + if not os.path.exists(conf_basetrain): + raise ValueError(f'{conf_basetrain} does not exist!') + if not os.path.exists(conf_finetune): + raise ValueError(f'{conf_finetune} does not exist!') + import py_sound_connect + logger.info('KWSDataset init SoundConnect...') + num_base = math.ceil(numworkers * basetrainratio) + num_senior = numworkers - num_base + # hold by fields to avoid python releasing conf object + self.base_conf = py_sound_connect.ConfigFile(conf_basetrain) + self.senior_conf = py_sound_connect.ConfigFile(conf_finetune) + for i in range(num_base): + fs = py_sound_connect.FeatSimuKWS(self.base_conf.params) + self.sims_base.append(fs) + for i in range(num_senior): + self.sims_senior.append( + py_sound_connect.FeatSimuKWS(self.senior_conf.params)) + logger.info('KWSDataset init SoundConnect finished.') + + def getBatch(self, id): + """ + Generate a data batch + + Args: + id: worker id + + Return: time x channel x feature, label + """ + fs = self.get_sim(id) + fs.processBatch() + # get multi-channel feature vector size + featsize = fs.featSize() + # get label vector size + labelsize = fs.labelSize() + # get minibatch size (time dimension) + # batchsize = fs.featBatchSize() + # no. 
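# Quick check of the worker split in setup_sims above and the get_sim(id) lookup that
# follows: the first ceil(numworkers * basetrainratio) worker ids are served by basetrain
# simulators, the remaining ids by finetune ("senior") simulators. Numbers are illustrative.
import math

numworkers, basetrainratio = 5, 0.5
num_base = math.ceil(numworkers * basetrainratio)     # 3 basetrain workers (ids 0..2)
num_senior = numworkers - num_base                    # 2 finetune workers (ids 3..4)
which = ['base' if i < num_base else 'senior' for i in range(numworkers)]
# which == ['base', 'base', 'base', 'senior', 'senior']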
of fe output channels + numchs = featsize // FBANK_SIZE + # get raw data + fs_feat = fs.feat() + data = np.frombuffer(fs_feat, dtype='float32') + data = data.reshape((-1, featsize + labelsize)) + + # convert float label to int + label = data[:, FBANK_SIZE * numchs:] + + if self.numclasses == 2: + # generate vad label + label[label > 0.0] = 1.0 + else: + # generate kws label + label = np.round(label * LABEL_GAIN) + label[label > self.numclasses - 1] = 0.0 + + # decimated size + size1 = int(np.ceil( + label.shape[0] / self.blockdec)) - self.blockcat + 1 + + # label decimation + label1 = np.zeros((size1, LABEL_SIZE), dtype='float32') + for tau in range(size1): + label1[tau, :] = label[(tau + self.blockcat // 2) + * self.blockdec, :] + + # feature decimation and concatenation + # time x channel x feature + featall = np.zeros((size1, numchs, FBANK_SIZE * self.blockcat), + dtype='float32') + for n in range(numchs): + feat = data[:, FBANK_SIZE * n:FBANK_SIZE * (n + 1)] + + for tau in range(size1): + for i in range(self.blockcat): + featall[tau, n, FBANK_SIZE * i:FBANK_SIZE * (i + 1)] = \ + feat[(tau + i) * self.blockdec, :] + + return torch.from_numpy(featall), torch.from_numpy(label1).long() + + def get_sim(self, id): + num_base = len(self.sims_base) + if id < num_base: + fs = self.sims_base[id] + else: + fs = self.sims_senior[id - num_base] + return fs + + +class Worker(threading.Thread): + """ + id: worker id + dataset: the dataset + pool: queue as the global data buffer + """ + + def __init__(self, id, dataset, pool): + threading.Thread.__init__(self) + + self.id = id + self.dataset = dataset + self.pool = pool + self.isrun = True + self.nn = 0 + + def run(self): + while self.isrun: + self.nn += 1 + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:1') + # get simulated minibatch + if self.isrun: + data = self.dataset.getBatch(self.id) + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:2') + + # put data into buffer + if self.isrun: + self.pool.put(data) + logger.debug(f'Worker {self.id:02d} running {self.nn:05d}:3') + + logger.info('KWSDataLoader: Worker {:02d} stopped.'.format(self.id)) + + def stopWorker(self): + """ + stop the worker thread + """ + self.isrun = False + + +class KWSDataLoader: + """ + dataset: the dataset reference + batchsize: data batch size + numworkers: no. 
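# Toy, single-channel walk-through of the two steps in getBatch above: (1) turn the float
# labels from the simulator into class ids, (2) decimate frames by `blockdec` and stack
# `blockcat` consecutive kept frames into one training frame. FBANK_SIZE is shrunk to 4
# and all values are made up so the shapes stay readable.
import numpy as np

FBANK, LABEL_GAIN, numclasses = 4, 100.0, 3
blockdec, blockcat = 2, 3

feat = np.arange(20 * FBANK, dtype='float32').reshape(20, FBANK)   # 20 frames, 1 channel
label = np.zeros((20, 1), dtype='float32')
label[8:12] = 0.01                                                  # keyword class 1

# (1) float label -> keyword class id (binarise instead when numclasses == 2)
label = np.round(label * LABEL_GAIN)
label[label > numclasses - 1] = 0.0

# (2) block decimation + concatenation, mirroring getBatch for a single channel
size1 = int(np.ceil(label.shape[0] / blockdec)) - blockcat + 1
label1 = np.zeros((size1, 1), dtype='float32')
featall = np.zeros((size1, 1, FBANK * blockcat), dtype='float32')
for tau in range(size1):
    label1[tau, :] = label[(tau + blockcat // 2) * blockdec, :]
    for i in range(blockcat):
        featall[tau, 0, FBANK * i:FBANK * (i + 1)] = feat[(tau + i) * blockdec, :]
# featall.shape == (8, 1, 12), label1.shape == (8, 1)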
of workers + prefetch: prefetch factor + """ + + def __init__(self, dataset, batchsize, numworkers, prefetch=2): + self.dataset = dataset + self.batchsize = batchsize + self.datamap = {} + self.isrun = True + + # data queue + self.pool = queue.Queue(batchsize * prefetch) + + # initialize workers + self.workerlist = [] + for id in range(numworkers): + w = Worker(id, dataset, self.pool) + self.workerlist.append(w) + + def __iter__(self): + return self + + def __next__(self): + while self.isrun: + # get data from common data pool + data = self.pool.get() + self.pool.task_done() + + # group minibatches with the same shape + key = str(data[0].shape) + + batchl = self.datamap.get(key) + if batchl is None: + batchl = [] + self.datamap.update({key: batchl}) + + batchl.append(data) + + # a full data batch collected + if len(batchl) >= self.batchsize: + featbatch = [] + labelbatch = [] + + for feat, label in batchl: + featbatch.append(feat) + labelbatch.append(label) + + batchl.clear() + + feattensor = torch.stack(featbatch, dim=0) + labeltensor = torch.stack(labelbatch, dim=0) + + if feattensor.shape[-2] == 1: + logger.debug('KWSDataLoader: Basetrain batch.') + else: + logger.debug('KWSDataLoader: Finetune batch.') + + return feattensor, labeltensor + + return None, None + + def start(self): + """ + start multi-thread data loader + """ + for w in self.workerlist: + w.start() + + def stop(self): + """ + stop data loader + """ + logger.info('KWSDataLoader: Stopping...') + self.isrun = False + + for w in self.workerlist: + w.stopWorker() + + while not self.pool.empty(): + self.pool.get(block=True, timeout=0.001) + + # wait workers terminated + for w in self.workerlist: + while not self.pool.empty(): + self.pool.get(block=True, timeout=0.001) + w.join() + logger.info('KWSDataLoader: All worker stopped.') diff --git a/modelscope/trainers/audio/kws_farfield_trainer.py b/modelscope/trainers/audio/kws_farfield_trainer.py new file mode 100644 index 00000000..a720ced5 --- /dev/null +++ b/modelscope/trainers/audio/kws_farfield_trainer.py @@ -0,0 +1,279 @@ +import datetime +import math +import os +from typing import Callable, Dict, Optional + +import numpy as np +import torch +from torch import nn as nn +from torch import optim as optim + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.metainfo import Trainers +from modelscope.models import Model, TorchModel +from modelscope.msdatasets.task_datasets.audio import KWSDataLoader, KWSDataset +from modelscope.trainers.base import BaseTrainer +from modelscope.trainers.builder import TRAINERS +from modelscope.utils.audio.audio_utils import update_conf +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.data_utils import to_device +from modelscope.utils.device import create_device +from modelscope.utils.logger import get_logger +from modelscope.utils.torch_utils import (get_dist_info, get_local_rank, + init_dist, is_master) + +logger = get_logger() + +BASETRAIN_CONF_EASY = 'basetrain_easy' +BASETRAIN_CONF_NORMAL = 'basetrain_normal' +BASETRAIN_CONF_HARD = 'basetrain_hard' +FINETUNE_CONF_EASY = 'finetune_easy' +FINETUNE_CONF_NORMAL = 'finetune_normal' +FINETUNE_CONF_HARD = 'finetune_hard' + +EASY_RATIO = 0.1 +NORMAL_RATIO = 0.6 +HARD_RATIO = 0.3 +BASETRAIN_RATIO = 0.5 + + +@TRAINERS.register_module(module_name=Trainers.speech_dfsmn_kws_char_farfield) +class KWSFarfieldTrainer(BaseTrainer): + DEFAULT_WORK_DIR = './work_dir' + conf_keys = (BASETRAIN_CONF_EASY, FINETUNE_CONF_EASY, + 
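# Minimal stand-alone version of the batching trick in KWSDataLoader.__next__ above: items
# coming out of the worker pool are grouped by tensor shape (basetrain batches are
# single-channel, finetune batches multi-channel), and a batch is emitted only once
# `batchsize` items with the same shape have been collected.
import torch

def group_and_stack(stream, batchsize):
    datamap = {}
    for feat, label in stream:
        batchl = datamap.setdefault(str(feat.shape), [])
        batchl.append((feat, label))
        if len(batchl) >= batchsize:
            feats, labels = zip(*batchl)
            batchl.clear()
            yield torch.stack(feats, dim=0), torch.stack(labels, dim=0)

# example: mixed 1-channel and 3-channel items, batchsize 2 -> one batch per shape
items = [(torch.zeros(8, 1, 120), torch.zeros(8, dtype=torch.long)) for _ in range(2)] + \
        [(torch.zeros(8, 3, 120), torch.zeros(8, dtype=torch.long)) for _ in range(2)]
batches = list(group_and_stack(iter(items), batchsize=2))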
BASETRAIN_CONF_NORMAL, FINETUNE_CONF_NORMAL, + BASETRAIN_CONF_HARD, FINETUNE_CONF_HARD) + + def __init__(self, + model: str, + work_dir: str, + cfg_file: Optional[str] = None, + arg_parse_fn: Optional[Callable] = None, + model_revision: Optional[str] = DEFAULT_MODEL_REVISION, + custom_conf: Optional[dict] = None, + **kwargs): + + if isinstance(model, str): + if os.path.exists(model): + self.model_dir = model if os.path.isdir( + model) else os.path.dirname(model) + else: + self.model_dir = snapshot_download( + model, revision=model_revision) + if cfg_file is None: + cfg_file = os.path.join(self.model_dir, + ModelFile.CONFIGURATION) + else: + assert cfg_file is not None, 'Config file should not be None if model is not from pretrained!' + self.model_dir = os.path.dirname(cfg_file) + + super().__init__(cfg_file, arg_parse_fn) + + self.model = self.build_model() + self.work_dir = work_dir + # the number of model output dimension + # should update config outside the trainer, if user need more wake word + self._num_classes = self.cfg.model.num_syn + + if kwargs.get('launcher', None) is not None: + init_dist(kwargs['launcher']) + + _, world_size = get_dist_info() + self._dist = world_size > 1 + + device_name = kwargs.get('device', 'gpu') + if self._dist: + local_rank = get_local_rank() + device_name = f'cuda:{local_rank}' + + self.device = create_device(device_name) + # model placement + if self.device.type == 'cuda': + self.model.to(self.device) + + if 'max_epochs' not in kwargs: + assert hasattr( + self.cfg.train, 'max_epochs' + ), 'max_epochs is missing from the configuration file' + self._max_epochs = self.cfg.train.max_epochs + else: + self._max_epochs = kwargs['max_epochs'] + self._train_iters = kwargs.get('train_iters_per_epoch', None) + self._val_iters = kwargs.get('val_iters_per_epoch', None) + if self._train_iters is None: + self._train_iters = self.cfg.train.train_iters_per_epoch + if self._val_iters is None: + self._val_iters = self.cfg.evaluation.val_iters_per_epoch + dataloader_config = self.cfg.train.dataloader + self._threads = kwargs.get('workers', None) + if self._threads is None: + self._threads = dataloader_config.workers_per_gpu + self._single_rate = BASETRAIN_RATIO + if 'single_rate' in kwargs: + self._single_rate = kwargs['single_rate'] + self._batch_size = dataloader_config.batch_size_per_gpu + if 'model_bin' in kwargs: + model_bin_file = os.path.join(self.model_dir, kwargs['model_bin']) + checkpoint = torch.load(model_bin_file) + self.model.load_state_dict(checkpoint) + # build corresponding optimizer and loss function + lr = self.cfg.train.optimizer.lr + self.optimizer = optim.Adam(self.model.parameters(), lr) + self.loss_fn = nn.CrossEntropyLoss() + self.data_val = None + self.json_log_path = os.path.join(self.work_dir, + '{}.log.json'.format(self.timestamp)) + self.conf_files = [] + for conf_key in self.conf_keys: + template_file = os.path.join(self.model_dir, conf_key) + conf_file = os.path.join(self.model_dir, f'{conf_key}.conf') + update_conf(template_file, conf_file, custom_conf[conf_key]) + self.conf_files.append(conf_file) + self._current_epoch = 0 + self.stages = (math.floor(self._max_epochs * EASY_RATIO), + math.floor(self._max_epochs * NORMAL_RATIO), + math.floor(self._max_epochs * HARD_RATIO)) + + def build_model(self) -> nn.Module: + """ Instantiate a pytorch model and return. + + By default, we will create a model using config from configuration file. You can + override this method in a subclass. 
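# The curriculum above splits max_epochs across easy/normal/hard stages by fixed ratios.
# A quick check of the split for, say, max_epochs = 10:
import math

EASY_RATIO, NORMAL_RATIO, HARD_RATIO = 0.1, 0.6, 0.3
max_epochs = 10
stages = (math.floor(max_epochs * EASY_RATIO),
          math.floor(max_epochs * NORMAL_RATIO),
          math.floor(max_epochs * HARD_RATIO))
# stages == (1, 6, 3); run_stage() below skips (with a warning) any stage that gets
# 0 epochs, which can happen for small max_epochs because of the floor.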
+ + """ + model = Model.from_pretrained( + self.model_dir, cfg_dict=self.cfg, training=True) + if isinstance(model, TorchModel) and hasattr(model, 'model'): + return model.model + elif isinstance(model, nn.Module): + return model + + def train(self, *args, **kwargs): + if not self.data_val: + self.gen_val() + logger.info('Start training...') + totaltime = datetime.datetime.now() + + for stage, num_epoch in enumerate(self.stages): + self.run_stage(stage, num_epoch) + + # total time spent + totaltime = datetime.datetime.now() - totaltime + logger.info('Total time spent: {:.2f} hours\n'.format( + totaltime.total_seconds() / 3600.0)) + + def run_stage(self, stage, num_epoch): + """ + Run training stages with correspond data + + Args: + stage: id of stage + num_epoch: the number of epoch to run in this stage + """ + if num_epoch <= 0: + logger.warning(f'Invalid epoch number, stage {stage} exit!') + return + logger.info(f'Starting stage {stage}...') + dataset, dataloader = self.create_dataloader( + self.conf_files[stage * 2], self.conf_files[stage * 2 + 1]) + it = iter(dataloader) + for _ in range(num_epoch): + self._current_epoch += 1 + epochtime = datetime.datetime.now() + logger.info('Start epoch %d...', self._current_epoch) + loss_train_epoch = 0.0 + validbatchs = 0 + for bi in range(self._train_iters): + # prepare data + feat, label = next(it) + label = torch.reshape(label, (-1, )) + feat = to_device(feat, self.device) + label = to_device(label, self.device) + # apply model + self.optimizer.zero_grad() + predict = self.model(feat) + # calculate loss + loss = self.loss_fn( + torch.reshape(predict, (-1, self._num_classes)), label) + if not np.isnan(loss.item()): + loss.backward() + self.optimizer.step() + loss_train_epoch += loss.item() + validbatchs += 1 + train_result = 'Epoch: {:04d}/{:04d}, batch: {:04d}/{:04d}, loss: {:.4f}'.format( + self._current_epoch, self._max_epochs, bi + 1, + self._train_iters, loss.item()) + logger.info(train_result) + self._dump_log(train_result) + + # average training loss in one epoch + loss_train_epoch /= validbatchs + loss_val_epoch = self.evaluate('') + val_result = 'Evaluate epoch: {:04d}, loss_train: {:.4f}, loss_val: {:.4f}'.format( + self._current_epoch, loss_train_epoch, loss_val_epoch) + logger.info(val_result) + self._dump_log(val_result) + # check point + ckpt_name = 'checkpoint_{:04d}_loss_train_{:.4f}_loss_val_{:.4f}.pth'.format( + self._current_epoch, loss_train_epoch, loss_val_epoch) + torch.save(self.model, os.path.join(self.work_dir, ckpt_name)) + # time spent per epoch + epochtime = datetime.datetime.now() - epochtime + logger.info('Epoch {:04d} time spent: {:.2f} hours'.format( + self._current_epoch, + epochtime.total_seconds() / 3600.0)) + dataloader.stop() + dataset.release() + logger.info(f'Stage {stage} is finished.') + + def gen_val(self): + """ + generate validation set + """ + logger.info('Start generating validation set...') + dataset, dataloader = self.create_dataloader(self.conf_files[2], + self.conf_files[3]) + it = iter(dataloader) + + self.data_val = [] + for bi in range(self._val_iters): + logger.info('Iterating validation data %d', bi) + feat, label = next(it) + label = torch.reshape(label, (-1, )) + self.data_val.append([feat, label]) + + dataloader.stop() + dataset.release() + logger.info('Finish generating validation set!') + + def create_dataloader(self, base_path, finetune_path): + dataset = KWSDataset(base_path, finetune_path, self._threads, + self._single_rate, self._num_classes) + dataloader = KWSDataLoader( + dataset, 
batchsize=self._batch_size, numworkers=self._threads)
+        dataloader.start()
+        return dataset, dataloader
+
+    def evaluate(self, checkpoint_path: str, *args,
+                 **kwargs) -> Dict[str, float]:
+        logger.info('Start validation...')
+        loss_val_epoch = 0.0
+
+        with torch.no_grad():
+            for feat, label in self.data_val:
+                feat = to_device(feat, self.device)
+                label = to_device(label, self.device)
+                # apply model
+                predict = self.model(feat)
+                # calculate loss
+                loss = self.loss_fn(
+                    torch.reshape(predict, (-1, self._num_classes)), label)
+                loss_val_epoch += loss.item()
+        logger.info('Finish validation.')
+        return loss_val_epoch / self._val_iters
+
+    def _dump_log(self, msg):
+        if is_master():
+            with open(self.json_log_path, 'a+') as f:
+                f.write(msg)
+                f.write('\n')
diff --git a/modelscope/utils/audio/audio_utils.py b/modelscope/utils/audio/audio_utils.py
index 4c2c45cc..647d9521 100644
--- a/modelscope/utils/audio/audio_utils.py
+++ b/modelscope/utils/audio/audio_utils.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import re
 import struct
 from typing import Union
 from urllib.parse import urlparse
@@ -37,6 +38,26 @@ def audio_norm(x):
     return x
 
 
+def update_conf(origin_config_file, new_config_file, conf_item: dict):
+    """Fill ${key} placeholders in a config template with values from conf_item."""
+
+    def repl(matched):
+        key = matched.group(1)
+        if key in conf_item:
+            return conf_item[key]
+        else:
+            # leave unknown placeholders unchanged; returning None here
+            # would make re.sub raise a TypeError
+            return matched.group(0)
+
+    with open(origin_config_file) as f:
+        lines = f.readlines()
+    with open(new_config_file, 'w') as f:
+        for line in lines:
+            line = re.sub(r'\$\{(.*)\}', repl, line)
+            f.write(line)
+
+
 def extract_pcm_from_wav(wav: bytes) -> bytes:
     data = wav
     if len(data) > 44:
diff --git a/requirements/audio.txt b/requirements/audio.txt
index d22ad8f1..742cf166 100644
--- a/requirements/audio.txt
+++ b/requirements/audio.txt
@@ -14,7 +14,11 @@ nltk
 numpy<=1.18
 # protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.
protobuf>3,<3.21.0 -py_sound_connect +ptflops +py_sound_connect>=0.1 +pytorch_wavelets +PyWavelets>=1.0.0 +scikit-learn SoundFile>0.10 sox torchaudio diff --git a/tests/trainers/audio/test_kws_farfield_trainer.py b/tests/trainers/audio/test_kws_farfield_trainer.py new file mode 100644 index 00000000..2631a542 --- /dev/null +++ b/tests/trainers/audio/test_kws_farfield_trainer.py @@ -0,0 +1,85 @@ +import os +import shutil +import tempfile +import unittest + +from modelscope.metainfo import Trainers +from modelscope.trainers import build_trainer +from modelscope.utils.test_utils import test_level + +POS_FILE = 'data/test/audios/wake_word_with_label_xyxy.wav' +NEG_FILE = 'data/test/audios/speech_with_noise.wav' +NOISE_FILE = 'data/test/audios/speech_with_noise.wav' +INTERF_FILE = 'data/test/audios/speech_with_noise.wav' +REF_FILE = 'data/test/audios/farend_speech.wav' +NOISE_2CH_FILE = 'data/test/audios/noise_2ch.wav' + + +class TestKwsFarfieldTrainer(unittest.TestCase): + REVISION = 'beta' + + def setUp(self): + self.tmp_dir = tempfile.TemporaryDirectory().name + print(f'tmp dir: {self.tmp_dir}') + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + self.model_id = 'damo/speech_dfsmn_kws_char_farfield_16k_nihaomiya' + + train_pos_list = self.create_list('pos.list', POS_FILE) + train_neg_list = self.create_list('neg.list', NEG_FILE) + train_noise1_list = self.create_list('noise.list', NOISE_FILE) + train_noise2_list = self.create_list('noise_2ch.list', NOISE_2CH_FILE) + train_interf_list = self.create_list('interf.list', INTERF_FILE) + train_ref_list = self.create_list('ref.list', REF_FILE) + + base_dict = dict( + train_pos_list=train_pos_list, + train_neg_list=train_neg_list, + train_noise1_list=train_noise1_list) + fintune_dict = dict( + train_pos_list=train_pos_list, + train_neg_list=train_neg_list, + train_noise1_list=train_noise1_list, + train_noise2_type='1', + train_noise1_ratio='0.2', + train_noise2_list=train_noise2_list, + train_interf_list=train_interf_list, + train_ref_list=train_ref_list) + self.custom_conf = dict( + basetrain_easy=base_dict, + basetrain_normal=base_dict, + basetrain_hard=base_dict, + finetune_easy=fintune_dict, + finetune_normal=fintune_dict, + finetune_hard=fintune_dict) + + def create_list(self, list_name, audio_file): + pos_list_file = os.path.join(self.tmp_dir, list_name) + with open(pos_list_file, 'w') as f: + for i in range(10): + f.write(f'{os.path.join(os.getcwd(), audio_file)}\n') + train_pos_list = f'{pos_list_file}, 1.0' + return train_pos_list + + def tearDown(self) -> None: + shutil.rmtree(self.tmp_dir, ignore_errors=True) + super().tearDown() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_normal(self): + kwargs = dict( + model=self.model_id, + work_dir=self.tmp_dir, + model_revision=self.REVISION, + workers=2, + max_epochs=2, + train_iters_per_epoch=2, + val_iters_per_epoch=1, + custom_conf=self.custom_conf) + + trainer = build_trainer( + Trainers.speech_dfsmn_kws_char_farfield, default_args=kwargs) + trainer.train() + results_files = os.listdir(self.tmp_dir) + self.assertIn(f'{trainer.timestamp}.log.json', results_files, + f'work_dir:{self.tmp_dir}') From 144ffee2cfaa89389930cba4c991ce03493502d2 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Thu, 13 Oct 2022 10:16:07 +0800 Subject: [PATCH 26/57] [to #42322933] Add explict model id in tts UT Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371244 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371244 --- 
tests/pipelines/test_text_to_speech.py | 89 +++++++++++--------------- 1 file changed, 36 insertions(+), 53 deletions(-) diff --git a/tests/pipelines/test_text_to_speech.py b/tests/pipelines/test_text_to_speech.py index 0caf1c84..9a1cd7b1 100644 --- a/tests/pipelines/test_text_to_speech.py +++ b/tests/pipelines/test_text_to_speech.py @@ -27,67 +27,50 @@ class TextToSpeechSambertHifigan16kPipelineTest(unittest.TestCase, def setUp(self) -> None: self.task = Tasks.text_to_speech - zhcn_text = '今天北京天气怎么样' - en_text = 'How is the weather in Beijing?' - zhcn_voice = ['zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo'] - enus_voice = ['andy', 'annie'] - engb_voice = ['luca', 'luna'] - self.tts_test_cases = [] - for voice in zhcn_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'zh-cn') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': zhcn_text - }) - for voice in enus_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'en-us') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': en_text - }) - for voice in engb_voice: - model_id = 'damo/speech_sambert-hifigan_tts_%s_%s_16k' % (voice, - 'en-gb') - self.tts_test_cases.append({ - 'voice': voice, - 'model_id': model_id, - 'text': en_text - }) - zhcn_model_id = 'damo/speech_sambert-hifigan_tts_zh-cn_16k' - enus_model_id = 'damo/speech_sambert-hifigan_tts_en-us_16k' - engb_model_id = 'damo/speech_sambert-hifigan_tts_en-gb_16k' - self.tts_test_cases.append({ - 'voice': 'zhcn', - 'model_id': zhcn_model_id, - 'text': zhcn_text - }) - self.tts_test_cases.append({ - 'voice': 'enus', - 'model_id': enus_model_id, - 'text': en_text - }) - self.tts_test_cases.append({ - 'voice': 'engb', - 'model_id': engb_model_id, - 'text': en_text - }) + self.zhcn_text = '今天北京天气怎么样' + self.en_text = 'How is the weather in Beijing?' 
+ self.zhcn_voices = [ + 'zhitian_emo', 'zhizhe_emo', 'zhiyan_emo', 'zhibei_emo', 'zhcn' + ] + self.zhcn_models = [ + 'damo/speech_sambert-hifigan_tts_zhitian_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhizhe_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhiyan_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zhibei_emo_zh-cn_16k', + 'damo/speech_sambert-hifigan_tts_zh-cn_16k' + ] + self.en_voices = ['luca', 'luna', 'andy', 'annie', 'engb', 'enus'] + self.en_models = [ + 'damo/speech_sambert-hifigan_tts_luca_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_luna_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_andy_en-us_16k', + 'damo/speech_sambert-hifigan_tts_annie_en-us_16k', + 'damo/speech_sambert-hifigan_tts_en-gb_16k', + 'damo/speech_sambert-hifigan_tts_en-us_16k' + ] @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_pipeline(self): - for case in self.tts_test_cases: - logger.info('test %s' % case['voice']) + for i in range(len(self.zhcn_voices)): + logger.info('test %s' % self.zhcn_voices[i]) model = Model.from_pretrained( - model_name_or_path=case['model_id'], revision='pytorch_am') + model_name_or_path=self.zhcn_models[i], revision='pytorch_am') sambert_hifigan_tts = pipeline(task=self.task, model=model) self.assertTrue(sambert_hifigan_tts is not None) - output = sambert_hifigan_tts(input=case['text']) + output = sambert_hifigan_tts(input=self.zhcn_text) self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) pcm = output[OutputKeys.OUTPUT_PCM] - write('output_%s.wav' % case['voice'], 16000, pcm) + write('output_%s.wav' % self.zhcn_voices[i], 16000, pcm) + for i in range(len(self.en_voices)): + logger.info('test %s' % self.en_voices[i]) + model = Model.from_pretrained( + model_name_or_path=self.en_models[i], revision='pytorch_am') + sambert_hifigan_tts = pipeline(task=self.task, model=model) + self.assertTrue(sambert_hifigan_tts is not None) + output = sambert_hifigan_tts(input=self.en_text) + self.assertIsNotNone(output[OutputKeys.OUTPUT_PCM]) + pcm = output[OutputKeys.OUTPUT_PCM] + write('output_%s.wav' % self.en_voices[i], 16000, pcm) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From f63d7f18f14dc919297ec104bef56cf2e1990bfc Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Thu, 13 Oct 2022 10:39:56 +0800 Subject: [PATCH 27/57] [to #42322933]remove sleep in train_loop Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/9910419 --- modelscope/trainers/trainer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modelscope/trainers/trainer.py b/modelscope/trainers/trainer.py index 4c21d63f..9eaff762 100644 --- a/modelscope/trainers/trainer.py +++ b/modelscope/trainers/trainer.py @@ -828,7 +828,6 @@ class EpochBasedTrainer(BaseTrainer): self.model.train() for _ in range(self._epoch, self._max_epochs): self.invoke_hook(TrainerStages.before_train_epoch) - time.sleep(2) # Prevent possible deadlock during epoch transition for i, data_batch in enumerate(data_loader): if i < self.inner_iter: # inner_iter may be read out from the checkpoint file, so skip the trained iters in the epoch. 
@@ -852,7 +851,6 @@ class EpochBasedTrainer(BaseTrainer): self._inner_iter = 0 self._epoch += 1 - time.sleep(1) # wait for some hooks like loggers to finish self.invoke_hook(TrainerStages.after_run) def evaluation_loop(self, data_loader, metric_classes): From 0eb823b76490bb3249bf1420143873293a132fb7 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 10:52:40 +0800 Subject: [PATCH 28/57] [to #42322933] support t5_with_translation Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10383770 * T5 support translate --- modelscope/metainfo.py | 4 ++ .../nlp/text2text_generation_pipeline.py | 39 ++++++++++++++++--- modelscope/preprocessors/nlp/nlp_base.py | 3 +- modelscope/utils/config.py | 10 +++++ tests/pipelines/test_text2text_generation.py | 26 +++++++------ 5 files changed, 63 insertions(+), 19 deletions(-) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 46c3b138..59c779e9 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -228,6 +228,9 @@ class Pipelines(object): relation_extraction = 'relation-extraction' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + translation_en_to_de = 'translation_en_to_de' # keep it underscore + translation_en_to_ro = 'translation_en_to_ro' # keep it underscore + translation_en_to_fr = 'translation_en_to_fr' # keep it underscore # audio tasks sambert_hifigan_tts = 'sambert-hifigan-tts' @@ -314,6 +317,7 @@ class Preprocessors(object): bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' text2text_gen_preprocessor = 'text2text-gen-preprocessor' + text2text_translate_preprocessor = 'text2text-translate-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' nli_tokenizer = 'nli-tokenizer' diff --git a/modelscope/pipelines/nlp/text2text_generation_pipeline.py b/modelscope/pipelines/nlp/text2text_generation_pipeline.py index 21aacf54..a739df69 100644 --- a/modelscope/pipelines/nlp/text2text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text2text_generation_pipeline.py @@ -1,21 +1,35 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.models.base import Model
 from modelscope.outputs import OutputKeys
-from modelscope.pipelines.base import Pipeline, Tensor
+from modelscope.pipelines.base import Input, Pipeline, Tensor
 from modelscope.pipelines.builder import PIPELINES
 from modelscope.preprocessors import Text2TextGenerationPreprocessor
+from modelscope.utils.config import use_task_specific_params
 from modelscope.utils.constant import Tasks
 
 __all__ = ['Text2TextGenerationPipeline']
 
+TRANSLATE_PIPELINES = [
+    Pipelines.translation_en_to_de,
+    Pipelines.translation_en_to_ro,
+    Pipelines.translation_en_to_fr,
+]
+
 
 @PIPELINES.register_module(
     Tasks.text2text_generation, module_name=Pipelines.text2text_generation)
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.translation_en_to_de)
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.translation_en_to_ro)
+@PIPELINES.register_module(
+    Tasks.text2text_generation, module_name=Pipelines.translation_en_to_fr)
 class Text2TextGenerationPipeline(Pipeline):
 
     def __init__(
@@ -39,13 +53,13 @@ class Text2TextGenerationPipeline(Pipeline):
 
         Example:
             >>> from modelscope.pipelines import pipeline
-            >>> pipeline_ins = pipeline(task='text-generation',
-            >>>     model='damo/nlp_palm2.0_text-generation_chinese-base')
-            >>> sentence1 = '本文总结了十个可穿戴产品的设计原则,而这些原则,同样也是笔者认为是这个行业最吸引人的地方:'
-            >>>     '1.为人们解决重复性问题;2.从人开始,而不是从机器开始;3.要引起注意,但不要刻意;4.提升用户能力,而不是取代'
+            >>> pipeline_ins = pipeline(task='text2text-generation',
+            >>>     model='damo/nlp_t5_text2text-generation_chinese-base')
+            >>> sentence1 = '中国的首都位于。'
            >>> print(pipeline_ins(sentence1))
            >>> # Or use the dict input:
            >>> print(pipeline_ins({'sentence': sentence1}))
+            >>> # 北京
 
        To view other examples please check the tests/pipelines/test_text_generation.py.
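+
+        A similar, illustrative call for the newly registered translation
+        pipelines (the model id below is the test model referenced in
+        tests/pipelines/test_text2text_generation.py, not necessarily a
+        production model):
+            >>> translator = pipeline(task='text2text-generation',
+            >>>     model='damo/t5-translate-base-test')
+            >>> print(translator('My name is Wolfgang and I live in Berlin'))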
""" @@ -56,9 +70,22 @@ class Text2TextGenerationPipeline(Pipeline): model.model_dir, sequence_length=kwargs.pop('sequence_length', 128)) self.tokenizer = preprocessor.tokenizer + self.pipeline = model.pipeline.type model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + """ Provide specific preprocess for text2text generation pipeline in order to handl multi tasks + """ + if not isinstance(inputs, str): + raise ValueError(f'Not supported input type: {type(inputs)}') + + if self.pipeline in TRANSLATE_PIPELINES: + use_task_specific_params(self.model, self.pipeline) + inputs = self.model.config.prefix + inputs + + return super().preprocess(inputs, **preprocess_params) + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index a9be0cb0..bec7e4e1 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -12,7 +12,8 @@ from modelscope.metainfo import Models, Preprocessors from modelscope.outputs import OutputKeys from modelscope.preprocessors.base import Preprocessor from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.config import Config, ConfigFields +from modelscope.utils.config import (Config, ConfigFields, + use_task_specific_params) from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile from modelscope.utils.hub import get_model_type, parse_label_mapping from modelscope.utils.logger import get_logger diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index 0b966bef..c4fa3c1b 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -633,6 +633,16 @@ def check_config(cfg: Union[str, ConfigDict]): check_attr(ConfigFields.evaluation) +def use_task_specific_params(model, task): + """Update config with summarization specific params.""" + task_specific_params = model.config.task_specific_params + + if task_specific_params is not None: + pars = task_specific_params.get(task, {}) + logger.info(f'using task specific params for {task}: {pars}') + model.config.update(pars) + + class JSONIteratorEncoder(json.JSONEncoder): """Implement this method in order that supporting arbitrary iterators, it returns a serializable object for ``obj``, or calls the base implementation diff --git a/tests/pipelines/test_text2text_generation.py b/tests/pipelines/test_text2text_generation.py index 2506547e..d90263c4 100644 --- a/tests/pipelines/test_text2text_generation.py +++ b/tests/pipelines/test_text2text_generation.py @@ -15,42 +15,44 @@ from modelscope.utils.test_utils import test_level class Text2TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def setUp(self) -> None: - self.model_id = 'damo/t5-cn-base-test' - self.input = '中国的首都位于。' + self.model_id_generate = 'damo/t5-cn-base-test' + self.input_generate = '中国的首都位于。' + self.model_id_translate = 'damo/t5-translate-base-test' + self.input_translate = 'My name is Wolfgang and I live in Berlin' - @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_T5(self): - cache_path = snapshot_download(self.model_id) - model = T5ForConditionalGeneration(cache_path) + cache_path = snapshot_download(self.model_id_generate) + model = T5ForConditionalGeneration.from_pretrained(cache_path) preprocessor 
= Text2TextGenerationPreprocessor(cache_path) pipeline1 = Text2TextGenerationPipeline(model, preprocessor) pipeline2 = pipeline( Tasks.text2text_generation, model=model, preprocessor=preprocessor) print( - f'pipeline1: {pipeline1(self.input)}\npipeline2: {pipeline2(self.input)}' + f'pipeline1: {pipeline1(self.input_generate)}\npipeline2: {pipeline2(self.input_generate)}' ) - @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_pipeline_with_model_instance(self): - model = Model.from_pretrained(self.model_id) + model = Model.from_pretrained(self.model_id_translate) preprocessor = Text2TextGenerationPreprocessor(model.model_dir) pipeline_ins = pipeline( task=Tasks.text2text_generation, model=model, preprocessor=preprocessor) - print(pipeline_ins(self.input)) + print(pipeline_ins(self.input_translate)) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_pipeline_with_model_id(self): pipeline_ins = pipeline( - task=Tasks.text2text_generation, model=self.model_id) - print(pipeline_ins(self.input)) + task=Tasks.text2text_generation, model=self.model_id_translate) + print(pipeline_ins(self.input_translate)) @unittest.skip( 'only for test cases, there is no default official model yet') def test_run_pipeline_without_model_id(self): pipeline_ins = pipeline(task=Tasks.text2text_generation) - print(pipeline_ins(self.input)) + print(pipeline_ins(self.input_generate)) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 2d50c812df09c3423fe8ccbc12b15eaae5706c79 Mon Sep 17 00:00:00 2001 From: "hanyuan.chy" Date: Thu, 13 Oct 2022 13:48:11 +0800 Subject: [PATCH 29/57] [to #42322933] support finetune on cv/hand_2d_keypoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加2d手部关键点检测finetune功能 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10371710 --- modelscope/metainfo.py | 2 + .../models/cv/hand_2d_keypoints/__init__.py | 20 ++++++ .../cv/hand_2d_keypoints/hand_2d_keypoints.py | 16 +++++ .../cv/hand_2d_keypoints/__init__.py | 22 ++++++ .../hand_2d_keypoints_dataset.py | 38 ++++++++++ .../test_easycv_trainer_hand_2d_keypoints.py | 72 +++++++++++++++++++ 6 files changed, 170 insertions(+) create mode 100644 modelscope/models/cv/hand_2d_keypoints/__init__.py create mode 100644 modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py create mode 100644 modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py create mode 100644 modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py create mode 100644 tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 59c779e9..2e3fed98 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -50,6 +50,7 @@ class Models(object): # EasyCV models yolox = 'YOLOX' segformer = 'Segformer' + hand_2d_keypoints = 'HRNet-Hand2D-Keypoints' image_object_detection_auto = 'image-object-detection-auto' # nlp models @@ -439,6 +440,7 @@ class Datasets(object): """ ClsDataset = 'ClsDataset' Face2dKeypointsDataset = 'Face2dKeypointsDataset' + HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset' HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset' SegDataset = 'SegDataset' DetDataset = 'DetDataset' diff --git a/modelscope/models/cv/hand_2d_keypoints/__init__.py b/modelscope/models/cv/hand_2d_keypoints/__init__.py 
new file mode 100644 index 00000000..2b06f19a --- /dev/null +++ b/modelscope/models/cv/hand_2d_keypoints/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_2d_keypoints import Hand2dKeyPoints + +else: + _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py b/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py new file mode 100644 index 00000000..15a97c30 --- /dev/null +++ b/modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py @@ -0,0 +1,16 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.models.pose import TopDown + +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.models.cv.easycv_base import EasyCVBaseModel +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints) +class Hand2dKeyPoints(EasyCVBaseModel, TopDown): + + def __init__(self, model_dir=None, *args, **kwargs): + EasyCVBaseModel.__init__(self, model_dir, args, kwargs) + TopDown.__init__(self, *args, **kwargs) diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py b/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py new file mode 100644 index 00000000..5c1c72c1 --- /dev/null +++ b/modelscope/msdatasets/cv/hand_2d_keypoints/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .hand_2d_keypoints_dataset import Hand2DKeypointDataset + +else: + _import_structure = { + 'hand_2d_keypoints_dataset': ['Hand2DKeypointDataset'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py b/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py new file mode 100644 index 00000000..89ee0bb8 --- /dev/null +++ b/modelscope/msdatasets/cv/hand_2d_keypoints/hand_2d_keypoints_dataset.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from easycv.datasets.pose import \ + HandCocoWholeBodyDataset as _HandCocoWholeBodyDataset + +from modelscope.metainfo import Datasets +from modelscope.msdatasets.cv.easycv_base import EasyCVBaseDataset +from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS +from modelscope.utils.constant import Tasks + + +@TASK_DATASETS.register_module( + group_key=Tasks.hand_2d_keypoints, + module_name=Datasets.HandCocoWholeBodyDataset) +class HandCocoWholeBodyDataset(EasyCVBaseDataset, _HandCocoWholeBodyDataset): + """EasyCV dataset for human hand 2d keypoints. + + Args: + split_config (dict): Dataset root path from MSDataset, e.g. + {"train":"local cache path"} or {"evaluation":"local cache path"}. + preprocessor (Preprocessor): An optional preprocessor instance, please make sure the preprocessor fits for + the model if supplied. Not support yet. + mode: Training or Evaluation. 
+ """ + + def __init__(self, + split_config=None, + preprocessor=None, + mode=None, + *args, + **kwargs) -> None: + EasyCVBaseDataset.__init__( + self, + split_config=split_config, + preprocessor=preprocessor, + mode=mode, + args=args, + kwargs=kwargs) + _HandCocoWholeBodyDataset.__init__(self, *args, **kwargs) diff --git a/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py b/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py new file mode 100644 index 00000000..270ecbc4 --- /dev/null +++ b/tests/trainers/easycv/test_easycv_trainer_hand_2d_keypoints.py @@ -0,0 +1,72 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import os +import shutil +import tempfile +import unittest + +import torch + +from modelscope.metainfo import Trainers +from modelscope.msdatasets import MsDataset +from modelscope.trainers import build_trainer +from modelscope.utils.constant import DownloadMode, LogKeys, Tasks +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + + +@unittest.skipIf(not torch.cuda.is_available(), 'cuda unittest') +class EasyCVTrainerTestHand2dKeypoints(unittest.TestCase): + model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' + + def setUp(self): + self.logger = get_logger() + self.logger.info(('Testing %s.%s' % + (type(self).__name__, self._testMethodName))) + self.tmp_dir = tempfile.TemporaryDirectory().name + if not os.path.exists(self.tmp_dir): + os.makedirs(self.tmp_dir) + + def tearDown(self): + super().tearDown() + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + def _train(self): + cfg_options = {'train.max_epochs': 20} + + trainer_name = Trainers.easycv + + train_dataset = MsDataset.load( + dataset_name='cv_hand_2d_keypoints_coco_wholebody', + namespace='chenhyer', + split='subtrain', + download_mode=DownloadMode.FORCE_REDOWNLOAD) + eval_dataset = MsDataset.load( + dataset_name='cv_hand_2d_keypoints_coco_wholebody', + namespace='chenhyer', + split='subtrain', + download_mode=DownloadMode.FORCE_REDOWNLOAD) + + kwargs = dict( + model=self.model_id, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + work_dir=self.tmp_dir, + cfg_options=cfg_options) + + trainer = build_trainer(trainer_name, kwargs) + trainer.train() + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_trainer_single_gpu(self): + self._train() + + results_files = os.listdir(self.tmp_dir) + json_files = glob.glob(os.path.join(self.tmp_dir, '*.log.json')) + self.assertEqual(len(json_files), 1) + self.assertIn(f'{LogKeys.EPOCH}_10.pth', results_files) + self.assertIn(f'{LogKeys.EPOCH}_20.pth', results_files) + + +if __name__ == '__main__': + unittest.main() From 14e52b308aa6e67564b230ae49b0615615d752ec Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 14:41:26 +0800 Subject: [PATCH 30/57] fix token classification bugs Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10385225 * fix token classification bugs --- modelscope/models/nlp/bert/__init__.py | 12 ++------ modelscope/models/nlp/bert/modeling_bert.py | 25 ++++++++-------- modelscope/models/nlp/token_classification.py | 29 +++++++++++++++++-- .../nlp/token_classification_pipeline.py | 7 ++++- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/modelscope/models/nlp/bert/__init__.py b/modelscope/models/nlp/bert/__init__.py index 705d9519..cca79c2f 100644 --- a/modelscope/models/nlp/bert/__init__.py +++ b/modelscope/models/nlp/bert/__init__.py @@ -5,7 +5,6 @@ from 
modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .modeling_bert import ( - BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertForMaskedLM, BertForMultipleChoice, BertForNextSentencePrediction, @@ -20,21 +19,14 @@ if TYPE_CHECKING: load_tf_weights_in_bert, ) - from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig - from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer - from .tokenization_bert_fast import BertTokenizerFast + from .configuration_bert import BertConfig, BertOnnxConfig else: _import_structure = { - 'configuration_bert': - ['BERT_PRETRAINED_CONFIG_ARCHIVE_MAP', 'BertConfig', 'BertOnnxConfig'], - 'tokenization_bert': - ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer'], + 'configuration_bert': ['BertConfig', 'BertOnnxConfig'], } - _import_structure['tokenization_bert_fast'] = ['BertTokenizerFast'] _import_structure['modeling_bert'] = [ - 'BERT_PRETRAINED_MODEL_ARCHIVE_LIST', 'BertForMaskedLM', 'BertForMultipleChoice', 'BertForNextSentencePrediction', diff --git a/modelscope/models/nlp/bert/modeling_bert.py b/modelscope/models/nlp/bert/modeling_bert.py index f8fd5994..e91a6433 100755 --- a/modelscope/models/nlp/bert/modeling_bert.py +++ b/modelscope/models/nlp/bert/modeling_bert.py @@ -1872,19 +1872,18 @@ class BertForTokenClassification(BertPreTrainedModel): @add_start_docstrings_to_model_forward( BERT_INPUTS_DOCSTRING.format('batch_size, sequence_length')) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): + def forward(self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs): r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/modelscope/models/nlp/token_classification.py b/modelscope/models/nlp/token_classification.py index c63e8037..e58967a5 100644 --- a/modelscope/models/nlp/token_classification.py +++ b/modelscope/models/nlp/token_classification.py @@ -176,7 +176,7 @@ class SbertForTokenClassification(TokenClassification, SbertPreTrainedModel): @MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert) @MODELS.register_module(Tasks.token_classification, module_name=Models.bert) -class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): +class BertForTokenClassification(TokenClassification, BertPreTrainedModel): """Bert token classification model. Inherited from TokenClassificationBase. @@ -187,7 +187,7 @@ class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): def __init__(self, config, model_dir): if hasattr(config, 'base_model_prefix'): - BertForSequenceClassification.base_model_prefix = config.base_model_prefix + BertForTokenClassification.base_model_prefix = config.base_model_prefix super().__init__(config, model_dir) def build_base_model(self): @@ -218,3 +218,28 @@ class BertForSequenceClassification(TokenClassification, BertPreTrainedModel): output_hidden_states=output_hidden_states, return_dict=return_dict, **kwargs) + + @classmethod + def _instantiate(cls, **kwargs): + """Instantiate the model. + + @param kwargs: Input args. + model_dir: The model dir used to load the checkpoint and the label information. 
+ num_labels: An optional arg to tell the model how many classes to initialize. + Method will call utils.parse_label_mapping if num_labels not supplied. + If num_labels is not found, the model will use the default setting (2 classes). + @return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained + """ + model_dir = kwargs.get('model_dir') + num_labels = kwargs.get('num_labels') + if num_labels is None: + label2id = parse_label_mapping(model_dir) + if label2id is not None and len(label2id) > 0: + num_labels = len(label2id) + + model_args = {} if num_labels is None else {'num_labels': num_labels} + return super(BertPreTrainedModel, + BertForTokenClassification).from_pretrained( + pretrained_model_name_or_path=kwargs.get('model_dir'), + model_dir=kwargs.get('model_dir'), + **model_args) diff --git a/modelscope/pipelines/nlp/token_classification_pipeline.py b/modelscope/pipelines/nlp/token_classification_pipeline.py index 5367c1a8..c57dbf20 100644 --- a/modelscope/pipelines/nlp/token_classification_pipeline.py +++ b/modelscope/pipelines/nlp/token_classification_pipeline.py @@ -40,7 +40,12 @@ class TokenClassificationPipeline(Pipeline): sequence_length=kwargs.pop('sequence_length', 128)) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.id2label = getattr(model, 'id2label') + if hasattr(model, 'id2label'): + self.id2label = getattr(model, 'id2label') + else: + model_config = getattr(model, 'config') + self.id2label = getattr(model_config, 'id2label') + assert self.id2label is not None, 'Cannot convert id to the original label, please pass in the mapping ' \ 'as a parameter or make sure the preprocessor has the attribute.' From 383452b0a4be12d3a5d15417042d7ccf3e285301 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Thu, 13 Oct 2022 17:16:17 +0800 Subject: [PATCH 31/57] [to #45452180] python 3.10.x compatible Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10394282 * python 3.10.x compatible --- modelscope/utils/tensor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/utils/tensor_utils.py b/modelscope/utils/tensor_utils.py index b68a639c..406d671f 100644 --- a/modelscope/utils/tensor_utils.py +++ b/modelscope/utils/tensor_utils.py @@ -1,6 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. # Part of the implementation is borrowed from huggingface/transformers. -from collections import Mapping +from collections.abc import Mapping def torch_nested_numpify(tensors): From 5bdb8fb78b5cb0d01431891d8e55cb5510a4ece4 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Thu, 13 Oct 2022 18:30:06 +0800 Subject: [PATCH 32/57] [to #45451935]fix: add create model detail log for create failed. 
Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10382795 --- modelscope/hub/api.py | 24 +++++++++++------------- modelscope/hub/errors.py | 17 +++++++++++++++-- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 8dcfa5b0..214045dd 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -24,8 +24,8 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, DownloadMode) from modelscope.utils.logger import get_logger from .errors import (InvalidParameter, NotExistError, RequestError, - datahub_raise_on_error, handle_http_response, is_ok, - raise_on_error) + datahub_raise_on_error, handle_http_post_error, + handle_http_response, is_ok, raise_on_error) from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, model_id_to_group_owner_name) @@ -105,17 +105,15 @@ class HubApi: path = f'{self.endpoint}/api/v1/models' owner_or_group, name = model_id_to_group_owner_name(model_id) - r = requests.post( - path, - json={ - 'Path': owner_or_group, - 'Name': name, - 'ChineseName': chinese_name, - 'Visibility': visibility, # server check - 'License': license - }, - cookies=cookies) - r.raise_for_status() + body = { + 'Path': owner_or_group, + 'Name': name, + 'ChineseName': chinese_name, + 'Visibility': visibility, # server check + 'License': license + } + r = requests.post(path, json=body, cookies=cookies) + handle_http_post_error(r, path, body) raise_on_error(r.json()) model_repo_url = f'{get_endpoint()}/{model_id}' return model_repo_url diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index c095a6ec..fb483287 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -4,6 +4,10 @@ from http import HTTPStatus from requests.exceptions import HTTPError +from modelscope.utils.logger import get_logger + +logger = get_logger() + class NotExistError(Exception): pass @@ -45,15 +49,24 @@ def is_ok(rsp): return rsp['Code'] == HTTPStatus.OK and rsp['Success'] +def handle_http_post_error(response, url, request_body): + try: + response.raise_for_status() + except HTTPError as error: + logger.error('Request %s with body: %s exception, respoonse body: %s' % + (url, request_body, response.body)) + raise error + + def handle_http_response(response, logger, cookies, model_id): try: response.raise_for_status() - except HTTPError: + except HTTPError as error: if cookies is None: # code in [403] and logger.error( f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \ private. Please login first.') - raise + raise error def raise_on_error(rsp): From 6818ffdc8e598b5a8aeb525c05549b9bce5b3784 Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Thu, 13 Oct 2022 19:42:19 +0800 Subject: [PATCH 33/57] [to #42322933] feat: optimize ANS metric value Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10399100 --- modelscope/metrics/audio_noise_metric.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelscope/metrics/audio_noise_metric.py b/modelscope/metrics/audio_noise_metric.py index f26db46d..8555e95b 100644 --- a/modelscope/metrics/audio_noise_metric.py +++ b/modelscope/metrics/audio_noise_metric.py @@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric): total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr return { 'total_loss': total_loss.item(), - 'avg_sisnr': avg_sisnr.item(), + # model use opposite number of sisnr as a calculation shortcut. 
+ # revert it in evaluation result + 'avg_sisnr': -avg_sisnr.item(), MetricKeys.AVERAGE_LOSS: avg_loss.item() } From c5c14ad60a8ba573263078892ada19f47698fc1c Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Thu, 13 Oct 2022 22:25:57 +0800 Subject: [PATCH 34/57] [to #42322933]fix psnr/ssim metrics for NAFNet (image denoise) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10403246 --- modelscope/metrics/image_denoise_metric.py | 203 +++++++++++++----- .../image_denoise/nafnet_for_image_denoise.py | 10 +- .../msdatasets/task_datasets/__init__.py | 1 + 3 files changed, 154 insertions(+), 60 deletions(-) diff --git a/modelscope/metrics/image_denoise_metric.py b/modelscope/metrics/image_denoise_metric.py index c6df8df1..1692f299 100644 --- a/modelscope/metrics/image_denoise_metric.py +++ b/modelscope/metrics/image_denoise_metric.py @@ -1,14 +1,16 @@ -# The code is modified based on BasicSR metrics: -# https://github.com/XPixelGroup/BasicSR/blob/master/basicsr/metrics/psnr_ssim.py +# ------------------------------------------------------------------------ +# Copyright (c) Alibaba, Inc. and its affiliates. +# ------------------------------------------------------------------------ +# modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py +# ------------------------------------------------------------------------ from typing import Dict import cv2 import numpy as np +import torch from modelscope.metainfo import Metrics from modelscope.utils.registry import default_group -from modelscope.utils.tensor_utils import (torch_nested_detach, - torch_nested_numpify) from .base import Metric from .builder import METRICS, MetricKeys @@ -22,16 +24,15 @@ class ImageDenoiseMetric(Metric): label_name = 'target' def __init__(self): + super(ImageDenoiseMetric, self).__init__() self.preds = [] self.labels = [] def add(self, outputs: Dict, inputs: Dict): ground_truths = outputs[ImageDenoiseMetric.label_name] eval_results = outputs[ImageDenoiseMetric.pred_name] - self.preds.append( - torch_nested_numpify(torch_nested_detach(eval_results))) - self.labels.append( - torch_nested_numpify(torch_nested_detach(ground_truths))) + self.preds.append(eval_results) + self.labels.append(ground_truths) def evaluate(self): psnr_list, ssim_list = [], [] @@ -69,80 +70,117 @@ def reorder_image(img, input_order='HWC'): return img -def calculate_psnr(img, img2, crop_border, input_order='HWC', **kwargs): +def calculate_psnr(img1, img2, crop_border, input_order='HWC'): """Calculate PSNR (Peak Signal-to-Noise Ratio). - Reference: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio Args: - img (ndarray): Images with range [0, 255]. - img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. - input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + img1 (ndarray/tensor): Images with range [0, 255]/[0, 1]. + img2 (ndarray/tensor): Images with range [0, 255]/[0, 1]. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the PSNR calculation. + input_order (str): Whether the input order is 'HWC' or 'CHW'. + Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. Returns: - float: PSNR result. + float: psnr result. 
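+
+    Example (an illustrative sanity check; identical inputs yield an
+    infinite PSNR):
+        >>> img = np.random.rand(64, 64, 3)
+        >>> calculate_psnr(img, img, crop_border=0)
+        inf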
""" - assert img.shape == img2.shape, ( - f'Image shapes are different: {img.shape}, {img2.shape}.') + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') if input_order not in ['HWC', 'CHW']: raise ValueError( - f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' - ) - img = reorder_image(img, input_order=input_order) + f'Wrong input_order {input_order}. Supported input_orders are ' + '"HWC" and "CHW"') + if type(img1) == torch.Tensor: + if len(img1.shape) == 4: + img1 = img1.squeeze(0) + img1 = img1.detach().cpu().numpy().transpose(1, 2, 0) + if type(img2) == torch.Tensor: + if len(img2.shape) == 4: + img2 = img2.squeeze(0) + img2 = img2.detach().cpu().numpy().transpose(1, 2, 0) + + img1 = reorder_image(img1, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) - - if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] - img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - - img = img.astype(np.float64) + img1 = img1.astype(np.float64) img2 = img2.astype(np.float64) - mse = np.mean((img - img2)**2) - if mse == 0: - return float('inf') - return 10. * np.log10(255. * 255. / mse) + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + def _psnr(img1, img2): + + mse = np.mean((img1 - img2)**2) + if mse == 0: + return float('inf') + max_value = 1. if img1.max() <= 1 else 255. + return 20. * np.log10(max_value / np.sqrt(mse)) + + return _psnr(img1, img2) -def calculate_ssim(img, img2, crop_border, input_order='HWC', **kwargs): +def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True): """Calculate SSIM (structural similarity). - ``Paper: Image quality assessment: From error visibility to structural similarity`` + Ref: + Image quality assessment: From error visibility to structural similarity The results are the same as that of the official released MATLAB code in https://ece.uwaterloo.ca/~z70wang/research/ssim/. For three-channel images, SSIM is calculated for each channel and then averaged. Args: - img (ndarray): Images with range [0, 255]. + img1 (ndarray): Images with range [0, 255]. img2 (ndarray): Images with range [0, 255]. - crop_border (int): Cropped pixels in each edge of an image. These pixels are not involved in the calculation. + crop_border (int): Cropped pixels in each edge of an image. These + pixels are not involved in the SSIM calculation. input_order (str): Whether the input order is 'HWC' or 'CHW'. Default: 'HWC'. + test_y_channel (bool): Test on Y channel of YCbCr. Default: False. Returns: - float: SSIM result. + float: ssim result. """ - assert img.shape == img2.shape, ( - f'Image shapes are different: {img.shape}, {img2.shape}.') + assert img1.shape == img2.shape, ( + f'Image shapes are differnet: {img1.shape}, {img2.shape}.') if input_order not in ['HWC', 'CHW']: raise ValueError( - f'Wrong input_order {input_order}. Supported input_orders are "HWC" and "CHW"' - ) - img = reorder_image(img, input_order=input_order) + f'Wrong input_order {input_order}. 
Supported input_orders are ' + '"HWC" and "CHW"') + + if type(img1) == torch.Tensor: + if len(img1.shape) == 4: + img1 = img1.squeeze(0) + img1 = img1.detach().cpu().numpy().transpose(1, 2, 0) + if type(img2) == torch.Tensor: + if len(img2.shape) == 4: + img2 = img2.squeeze(0) + img2 = img2.detach().cpu().numpy().transpose(1, 2, 0) + + img1 = reorder_image(img1, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) - if crop_border != 0: - img = img[crop_border:-crop_border, crop_border:-crop_border, ...] - img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - - img = img.astype(np.float64) + img1 = img1.astype(np.float64) img2 = img2.astype(np.float64) - ssims = [] - for i in range(img.shape[2]): - ssims.append(_ssim(img[..., i], img2[..., i])) - return np.array(ssims).mean() + if crop_border != 0: + img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...] + img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] + + def _cal_ssim(img1, img2): + ssims = [] + + max_value = 1 if img1.max() <= 1 else 255 + with torch.no_grad(): + final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim( + img1, img2, max_value) + ssims.append(final_ssim) + + return np.array(ssims).mean() + + return _cal_ssim(img1, img2) -def _ssim(img, img2): +def _ssim(img, img2, max_value): """Calculate SSIM (structural similarity) for one channel images. It is called by func:`calculate_ssim`. Args: @@ -152,8 +190,11 @@ def _ssim(img, img2): float: SSIM result. """ - c1 = (0.01 * 255)**2 - c2 = (0.03 * 255)**2 + c1 = (0.01 * max_value)**2 + c2 = (0.03 * max_value)**2 + + img = img.astype(np.float64) + img2 = img2.astype(np.float64) kernel = cv2.getGaussianKernel(11, 1.5) window = np.outer(kernel, kernel.transpose()) @@ -171,3 +212,61 @@ def _ssim(img, img2): tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) ssim_map = tmp1 / tmp2 return ssim_map.mean() + + +def _3d_gaussian_calculator(img, conv3d): + out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0) + return out + + +def _generate_3d_gaussian_kernel(): + kernel = cv2.getGaussianKernel(11, 1.5) + window = np.outer(kernel, kernel.transpose()) + kernel_3 = cv2.getGaussianKernel(11, 1.5) + kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0)) + conv3d = torch.nn.Conv3d( + 1, + 1, (11, 11, 11), + stride=1, + padding=(5, 5, 5), + bias=False, + padding_mode='replicate') + conv3d.weight.requires_grad = False + conv3d.weight[0, 0, :, :, :] = kernel + return conv3d + + +def _ssim_3d(img1, img2, max_value): + assert len(img1.shape) == 3 and len(img2.shape) == 3 + """Calculate SSIM (structural similarity) for one channel images. + It is called by func:`calculate_ssim`. + Args: + img1 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'. + img2 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'. + Returns: + float: ssim result. 
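+
+    Example (an illustrative sanity check on the numpy/cv2 path; the
+    default ssim3d=True path additionally requires a CUDA device):
+        >>> img = np.random.rand(64, 64, 3) * 255
+        >>> calculate_ssim(img, img, crop_border=0, ssim3d=False)  # ~1.0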
+ """ + C1 = (0.01 * max_value)**2 + C2 = (0.03 * max_value)**2 + img1 = img1.astype(np.float64) + img2 = img2.astype(np.float64) + + kernel = _generate_3d_gaussian_kernel().cuda() + + img1 = torch.tensor(img1).float().cuda() + img2 = torch.tensor(img2).float().cuda() + + mu1 = _3d_gaussian_calculator(img1, kernel) + mu2 = _3d_gaussian_calculator(img2, kernel) + + mu1_sq = mu1**2 + mu2_sq = mu2**2 + mu1_mu2 = mu1 * mu2 + sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq + sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq + sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2 + + tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2) + tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2) + ssim_map = tmp1 / tmp2 + return float(ssim_map.mean()) diff --git a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py index a6fbf22f..4e8fc0ed 100644 --- a/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py +++ b/modelscope/models/cv/image_denoise/nafnet_for_image_denoise.py @@ -3,7 +3,6 @@ import os from copy import deepcopy from typing import Any, Dict, Union -import numpy as np import torch.cuda from torch.nn.parallel import DataParallel, DistributedDataParallel @@ -78,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel): def _evaluate_postprocess(self, input: Tensor, target: Tensor) -> Dict[str, list]: preds = self.model(input) - preds = list(torch.split(preds, 1, 0)) - targets = list(torch.split(target, 1, 0)) - - preds = [(pred.data * 255.).squeeze(0).permute( - 1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds] - targets = [(target.data * 255.).squeeze(0).permute( - 1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets] + preds = list(torch.split(preds.clamp(0, 1), 1, 0)) + targets = list(torch.split(target.clamp(0, 1), 1, 0)) return {'pred': preds, 'target': targets} diff --git a/modelscope/msdatasets/task_datasets/__init__.py b/modelscope/msdatasets/task_datasets/__init__.py index 35c060f0..7c31969a 100644 --- a/modelscope/msdatasets/task_datasets/__init__.py +++ b/modelscope/msdatasets/task_datasets/__init__.py @@ -26,6 +26,7 @@ else: 'video_summarization_dataset': ['VideoSummarizationDataset'], 'movie_scene_segmentation': ['MovieSceneSegmentationDataset'], 'image_inpainting': ['ImageInpaintingDataset'], + 'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'], } import sys From 275f8b432328cfe9df38a105e616233c63efb6a1 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Fri, 14 Oct 2022 13:55:09 +0800 Subject: [PATCH 35/57] Revert "[to #45071449] fix setup error " This reverts commit a26e6e38697a8795b99de4c7929b415baef78268. 
--- modelscope/models/audio/tts/models/datasets/__init__.py | 0 requirements/framework.txt | 1 - 2 files changed, 1 deletion(-) mode change 100755 => 100644 modelscope/models/audio/tts/models/datasets/__init__.py diff --git a/modelscope/models/audio/tts/models/datasets/__init__.py b/modelscope/models/audio/tts/models/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/requirements/framework.txt b/requirements/framework.txt index aae200da..b51faeda 100644 --- a/requirements/framework.txt +++ b/requirements/framework.txt @@ -15,7 +15,6 @@ pyyaml requests scipy setuptools -setuptools_scm tensorboard tqdm>=4.64.0 yapf From 155856301f0e4f61be0d4753734f1496e7cbf7ce Mon Sep 17 00:00:00 2001 From: "bin.xue" Date: Fri, 14 Oct 2022 14:00:57 +0800 Subject: [PATCH 36/57] [to #42322933] do not check training config in pipeline() Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10407849 --- modelscope/utils/config.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modelscope/utils/config.py b/modelscope/utils/config.py index c4fa3c1b..e46da7df 100644 --- a/modelscope/utils/config.py +++ b/modelscope/utils/config.py @@ -609,11 +609,12 @@ class Config: return parse_fn(args) -def check_config(cfg: Union[str, ConfigDict]): +def check_config(cfg: Union[str, ConfigDict], is_training=False): """ Check whether configuration file is valid, If anything wrong, exception will be raised. Args: cfg (str or ConfigDict): Config file path or config object. + is_training: indicate if checking training related elements """ if isinstance(cfg, str): @@ -627,8 +628,9 @@ def check_config(cfg: Union[str, ConfigDict]): check_attr(ConfigFields.task) check_attr(ConfigFields.pipeline) - if hasattr(cfg, ConfigFields.train): + if is_training: check_attr(ConfigFields.model) + check_attr(ConfigFields.train) check_attr(ConfigFields.preprocessor) check_attr(ConfigFields.evaluation) From 355da866c553216a2b45b5f1ae68a27eebcf62ec Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Fri, 14 Oct 2022 18:07:29 +0800 Subject: [PATCH 37/57] [to #42322933] limit tranformers version temporarily --- requirements/nlp.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index f18dde2e..2e0838fc 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -3,7 +3,7 @@ fasttext jieba>=0.42.1 megatron_util pai-easynlp -# “protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged.” +# protobuf version beyond 3.20.0 is not compatible with TensorFlow 1.x, therefore is discouraged. protobuf>=3.19.0,<3.21.0 # rough-score was just recently updated from 0.0.4 to 0.0.7 # which introduced compatability issues that are being investigated @@ -14,4 +14,5 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -transformers>=4.12.0 +# recent 4.23.1 update introduce breaking api change, limit upper version temporarily. 
+transformers>=4.12.0,<=4.22.0 From 876058556deabcdf1a399e79983444d97ec790f2 Mon Sep 17 00:00:00 2001 From: hemu Date: Fri, 14 Oct 2022 18:15:52 +0800 Subject: [PATCH 38/57] fix generate --- modelscope/models/nlp/gpt3/modeling_gpt3.py | 3 +++ requirements/nlp.txt | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index 498d15de..ade36e36 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -346,3 +346,6 @@ class GPT3Model(PreTrainedModel): } model.load_state_dict(state_dict) return model + + def prepare_inputs_for_generation(self, input_ids, *args, **kwargs): + return {'input_ids': input_ids} diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 2e0838fc..123c238e 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -14,5 +14,4 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -# recent 4.23.1 update introduce breaking api change, limit upper version temporarily. -transformers>=4.12.0,<=4.22.0 +transformers From 1b4d5ccb9c8b7a7d93c91aa85e43b017826df2c0 Mon Sep 17 00:00:00 2001 From: "xingjun.wxj" Date: Fri, 14 Oct 2022 18:32:38 +0800 Subject: [PATCH 39/57] [to #42322933]MsDataset upload and load supports directory. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 上传和下载支持多文件操作 --- modelscope/hub/api.py | 34 ++++--- modelscope/hub/utils/utils.py | 8 +- modelscope/msdatasets/ms_dataset.py | 52 +++++++---- modelscope/msdatasets/utils/dataset_utils.py | 90 ++++++++++++++++++- modelscope/msdatasets/utils/download_utils.py | 18 ++-- modelscope/msdatasets/utils/oss_utils.py | 9 +- modelscope/msdatasets/utils/upload_utils.py | 40 ++++++++- modelscope/utils/constant.py | 7 ++ tests/msdatasets/test_dataset_upload.py | 43 ++++++++- 9 files changed, 250 insertions(+), 51 deletions(-) diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index 214045dd..dc4d0ab2 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -26,18 +26,15 @@ from modelscope.utils.logger import get_logger from .errors import (InvalidParameter, NotExistError, RequestError, datahub_raise_on_error, handle_http_post_error, handle_http_response, is_ok, raise_on_error) -from .utils.utils import (get_dataset_hub_endpoint, get_endpoint, - model_id_to_group_owner_name) +from .utils.utils import get_endpoint, model_id_to_group_owner_name logger = get_logger() class HubApi: - def __init__(self, endpoint=None, dataset_endpoint=None): + def __init__(self, endpoint=None): self.endpoint = endpoint if endpoint is not None else get_endpoint() - self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint( - ) def login( self, @@ -288,7 +285,7 @@ class HubApi: return files def list_datasets(self): - path = f'{self.dataset_endpoint}/api/v1/datasets' + path = f'{self.endpoint}/api/v1/datasets' headers = None params = {} r = requests.get(path, params=params, headers=headers) @@ -315,13 +312,13 @@ class HubApi: cache_dir): shutil.rmtree(cache_dir) os.makedirs(cache_dir, exist_ok=True) - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}' + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}' r = requests.get(datahub_url) resp = r.json() datahub_raise_on_error(datahub_url, resp) dataset_id = resp['Data']['Id'] dataset_type = resp['Data']['Type'] - datahub_url = 
f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' r = requests.get(datahub_url) resp = r.json() datahub_raise_on_error(datahub_url, resp) @@ -339,7 +336,7 @@ class HubApi: file_path = file_info['Path'] extension = os.path.splitext(file_path)[-1] if extension in dataset_meta_format: - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ f'Revision={revision}&FilePath={file_path}' r = requests.get(datahub_url) r.raise_for_status() @@ -363,7 +360,7 @@ class HubApi: namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): if file_name.endswith('.csv'): - file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ + file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \ f'Revision={revision}&FilePath={file_name}' return file_name @@ -372,7 +369,7 @@ class HubApi: dataset_name: str, namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ f'ststoken?Revision={revision}' return self.datahub_remote_call(datahub_url) @@ -383,7 +380,7 @@ class HubApi: namespace: str, revision: Optional[str] = DEFAULT_DATASET_REVISION): - datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ + datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \ f'ststoken?Revision={revision}' cookies = requests.utils.dict_from_cookiejar(cookies) @@ -392,6 +389,19 @@ class HubApi: raise_on_error(resp) return resp['Data'] + def list_oss_dataset_objects(self, dataset_name, namespace, max_limit, + is_recursive, is_filter_dir, revision, + cookies): + url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' 
\ + f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}' + cookies = requests.utils.dict_from_cookiejar(cookies) + + resp = requests.get(url=url, cookies=cookies) + resp = resp.json() + raise_on_error(resp) + resp = resp['Data'] + return resp + def on_dataset_download(self, dataset_name: str, namespace: str) -> None: url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase' r = requests.post(url) diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index d84b78ea..7d3c2499 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -4,8 +4,7 @@ import hashlib import os from typing import Optional -from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT, - DEFAULT_MODELSCOPE_DOMAIN, +from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN, DEFAULT_MODELSCOPE_GROUP, MODEL_ID_SEPARATOR, MODELSCOPE_URL_SCHEME) @@ -44,11 +43,6 @@ def get_endpoint(): return MODELSCOPE_URL_SCHEME + modelscope_domain -def get_dataset_hub_endpoint(): - return os.environ.get('HUB_DATASET_ENDPOINT', - DEFAULT_MODELSCOPE_DATA_ENDPOINT) - - def compute_hash(file_path): BUFFER_SIZE = 1024 * 64 # 64k buffer size sha256_hash = hashlib.sha256() diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 361b8ae0..cf055d6d 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -1,6 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import math import os from typing import (Any, Callable, Dict, Iterable, List, Mapping, Optional, Sequence, Union) @@ -17,19 +16,18 @@ from datasets.utils.file_utils import (is_relative_path, relative_to_absolute_path) from modelscope.hub.repository import DatasetRepository +from modelscope.msdatasets.task_datasets.builder import build_task_dataset +from modelscope.msdatasets.utils.dataset_builder import ExternalDataset +from modelscope.msdatasets.utils.dataset_utils import ( + get_dataset_files, get_target_dataset_structure, load_dataset_builder) +from modelscope.msdatasets.utils.download_utils import DatasetDownloadManager +from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager from modelscope.utils.config import ConfigDict from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION, DatasetFormations, DownloadMode, Hubs) from modelscope.utils.logger import get_logger -from .task_datasets.builder import build_task_dataset -from .utils.dataset_builder import ExternalDataset -from .utils.dataset_utils import (get_dataset_files, - get_target_dataset_structure, - load_dataset_builder) -from .utils.download_utils import DatasetDownloadManager -from .utils.upload_utils import DatasetUploadManager logger = get_logger() @@ -234,7 +232,6 @@ class MsDataset: # dataset organized to be compatible with hf format if dataset_formation == DatasetFormations.hf_compatible: dataset_name = dataset_scripts['.py'][0] - download_dataset = dataset_name else: raise FileNotFoundError( f"Couldn't find a dataset script at {relative_to_absolute_path(dataset_name)} " @@ -270,7 +267,8 @@ class MsDataset: raise TypeError('path must be a str or a list, but got' f' {type(dataset_name)}') - if download_dataset: + is_ci_test = os.getenv('CI_TEST') == 'True' + if download_dataset and not is_ci_test: try: api.on_dataset_download( dataset_name=download_dataset, namespace=namespace) @@ -570,15 +568,26 @@ class 
MsDataset: local_file_path: str, dataset_name: str, namespace: Optional[str] = DEFAULT_DATASET_NAMESPACE, - version: Optional[str] = DEFAULT_DATASET_REVISION) -> None: - """Upload dataset file to the ModelScope Hub. Please login to the ModelScope Hub first. + version: Optional[str] = DEFAULT_DATASET_REVISION, + num_processes: Optional[int] = None, + chunksize: Optional[int] = 1, + filter_hidden_files: Optional[bool] = True) -> None: + """Upload dataset file or directory to the ModelScope Hub. Please login to the ModelScope Hub first. Args: - object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip - local_file_path (str): Local file to upload + object_name (str): The object name on ModelScope, in the form of your-dataset-name.zip or your-dataset-name + local_file_path (str): Local file or directory to upload dataset_name (str): Name of the dataset namespace(str, optional): Namespace of the dataset version: Optional[str]: Version of the dataset + num_processes: Optional[int]: The number of processes used for multi-process uploading. + This is only applicable when local_file_path is a directory, and we are uploading mutliple-files + insided the directory. When None provided, the number returned by os.cpu_count() is used as default. + chunksize: Optional[int]: The chunksize of objects to upload. + For very long iterables using a large value for chunksize can make the job complete much faster than + using the default value of 1. Available if local_file_path is a directory. + filter_hidden_files: Optional[bool]: Whether to filter hidden files. + Available if local_file_path is a directory. Returns: None @@ -586,7 +595,20 @@ class MsDataset: """ _upload_manager = DatasetUploadManager( dataset_name=dataset_name, namespace=namespace, version=version) - _upload_manager.upload(object_name, local_file_path) + + if os.path.isfile(local_file_path): + _upload_manager.upload( + object_name=object_name, local_file_path=local_file_path) + elif os.path.isdir(local_file_path): + _upload_manager.upload_dir( + object_dir_name=object_name, + local_dir_path=local_file_path, + num_processes=num_processes, + chunksize=chunksize, + filter_hidden_files=filter_hidden_files) + else: + raise ValueError( + f'{local_file_path} is not a valid file path or directory') @staticmethod def clone_meta(dataset_work_dir: str, diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index ef42f75f..db9d1fee 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -6,7 +6,8 @@ from typing import Any, Mapping, Optional, Sequence, Union from datasets.builder import DatasetBuilder -from modelscope.utils.constant import DEFAULT_DATASET_REVISION +from modelscope.hub.api import HubApi +from modelscope.utils.constant import DEFAULT_DATASET_REVISION, DownloadParams from modelscope.utils.logger import get_logger from .dataset_builder import MsCsvDatasetBuilder, TaskSpecificDatasetBuilder @@ -77,6 +78,81 @@ def get_target_dataset_structure(dataset_structure: dict, return target_subset_name, target_dataset_structure +def list_dataset_objects(hub_api: HubApi, max_limit: int, is_recursive: bool, + dataset_name: str, namespace: str, + version: str) -> list: + """ + List all of objects for specific dataset. + + Args: + hub_api (class HubApi): HubApi instance. + max_limit (int): Max number of objects. + is_recursive (bool): Whether to list objects recursively. + dataset_name (str): Dataset name. 
+ namespace (str): Namespace. + version (str): Dataset version. + Returns: + res (list): List of objects, i.e., ['train/images/001.png', 'train/images/002.png', 'val/images/001.png', ...] + """ + res = [] + cookies = hub_api.check_cookies_upload_data(use_cookies=True) + objects = hub_api.list_oss_dataset_objects( + dataset_name=dataset_name, + namespace=namespace, + max_limit=max_limit, + is_recursive=is_recursive, + is_filter_dir=True, + revision=version, + cookies=cookies) + + for item in objects: + object_key = item.get('Key') + res.append(object_key) + + return res + + +def contains_dir(file_map) -> bool: + """ + To check whether input contains at least one directory. + + Args: + file_map (dict): Structure of data files. e.g., {'train': 'train.zip', 'validation': 'val.zip'} + Returns: + True if input contains at least one directory, False otherwise. + """ + res = False + for k, v in file_map.items(): + if isinstance(v, str) and not v.endswith('.zip'): + res = True + break + return res + + +def get_split_objects_map(file_map, objects): + """ + Get the map between dataset split and oss objects. + + Args: + file_map (dict): Structure of data files. e.g., {'train': 'train', 'validation': 'val'}, both of train and val + are dirs. + objects (list): List of oss objects. e.g., ['train/001/1_123.png', 'train/001/1_124.png', 'val/003/3_38.png'] + Returns: + A map of split-objects. e.g., {'train': ['train/001/1_123.png', 'train/001/1_124.png'], + 'validation':['val/003/3_38.png']} + """ + res = {} + for k, v in file_map.items(): + res[k] = [] + + for obj_key in objects: + for k, v in file_map.items(): + if obj_key.startswith(v): + res[k].append(obj_key) + + return res + + def get_dataset_files(subset_split_into: dict, dataset_name: str, namespace: str, @@ -95,14 +171,24 @@ def get_dataset_files(subset_split_into: dict, meta_map = defaultdict(dict) file_map = defaultdict(dict) args_map = defaultdict(dict) - from modelscope.hub.api import HubApi modelscope_api = HubApi() + objects = list_dataset_objects( + hub_api=modelscope_api, + max_limit=DownloadParams.MAX_LIST_OBJECTS_NUM.value, + is_recursive=True, + dataset_name=dataset_name, + namespace=namespace, + version=revision) + for split, info in subset_split_into.items(): meta_map[split] = modelscope_api.get_dataset_file_url( info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] args_map[split] = info.get('args') + + if contains_dir(file_map): + file_map = get_split_objects_map(file_map, objects) return meta_map, file_map, args_map diff --git a/modelscope/msdatasets/utils/download_utils.py b/modelscope/msdatasets/utils/download_utils.py index 2e21bf50..b1c7a5ab 100644 --- a/modelscope/msdatasets/utils/download_utils.py +++ b/modelscope/msdatasets/utils/download_utils.py @@ -10,16 +10,14 @@ from .oss_utils import OssUtilities class DatasetDownloadManager(DownloadManager): - def __init__( - self, - dataset_name: str, - namespace: str, - version: str, - data_dir: Optional[str] = None, - download_config: Optional[DownloadConfig] = None, - base_path: Optional[str] = None, - record_checksums=True, - ): + def __init__(self, + dataset_name: str, + namespace: str, + version: str, + data_dir: Optional[str] = None, + download_config: Optional[DownloadConfig] = None, + base_path: Optional[str] = None, + record_checksums=True): super().__init__(dataset_name, data_dir, download_config, base_path, record_checksums) self._namespace = namespace diff --git a/modelscope/msdatasets/utils/oss_utils.py 
b/modelscope/msdatasets/utils/oss_utils.py index 4a403876..d7d61e89 100644 --- a/modelscope/msdatasets/utils/oss_utils.py +++ b/modelscope/msdatasets/utils/oss_utils.py @@ -50,11 +50,16 @@ class OssUtilities: progress_callback=self._percentage) return local_path - def upload(self, oss_object_name: str, local_file_path: str) -> str: + def upload(self, oss_object_name: str, local_file_path: str, + indicate_individual_progress: bool) -> str: retry_count = 0 object_key = os.path.join(self.oss_dir, oss_object_name) resumable_store = oss2.ResumableStore( root=self.upload_resumable_tmp_store) + if indicate_individual_progress: + progress_callback = self._percentage + else: + progress_callback = None while True: try: @@ -66,7 +71,7 @@ class OssUtilities: store=resumable_store, multipart_threshold=self.upload_multipart_threshold, part_size=self.upload_part_size, - progress_callback=self._percentage, + progress_callback=progress_callback, num_threads=self.upload_num_threads) break except Exception: diff --git a/modelscope/msdatasets/utils/upload_utils.py b/modelscope/msdatasets/utils/upload_utils.py index 4813b89f..2b4422b2 100644 --- a/modelscope/msdatasets/utils/upload_utils.py +++ b/modelscope/msdatasets/utils/upload_utils.py @@ -1,5 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os +from multiprocessing.dummy import Pool as ThreadPool + +from tqdm import tqdm + from .oss_utils import OssUtilities @@ -19,5 +24,38 @@ class DatasetUploadManager(object): def upload(self, object_name: str, local_file_path: str) -> str: object_key = self.oss_utilities.upload( - oss_object_name=object_name, local_file_path=local_file_path) + oss_object_name=object_name, + local_file_path=local_file_path, + indicate_individual_progress=True) return object_key + + def upload_dir(self, object_dir_name: str, local_dir_path: str, + num_processes: int, chunksize: int, + filter_hidden_files: bool) -> int: + + def run_upload(args): + self.oss_utilities.upload( + oss_object_name=args[0], + local_file_path=args[1], + indicate_individual_progress=False) + + files_list = [] + for root, dirs, files in os.walk(local_dir_path): + for file_name in files: + if filter_hidden_files and file_name.startswith('.'): + continue + # Concatenate directory name and relative path into a oss object key. e.g., train/001/1_1230.png + object_name = os.path.join( + object_dir_name, + root.replace(local_dir_path, '', 1).strip('/'), file_name) + + local_file_path = os.path.join(root, file_name) + files_list.append((object_name, local_file_path)) + + with ThreadPool(processes=num_processes) as pool: + result = list( + tqdm( + pool.imap(run_upload, files_list, chunksize=chunksize), + total=len(files_list))) + + return len(result) diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 5f0532ce..9e10e802 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -227,6 +227,13 @@ class DownloadMode(enum.Enum): FORCE_REDOWNLOAD = 'force_redownload' +class DownloadParams(enum.Enum): + """ + Parameters for downloading dataset. 
+ """ + MAX_LIST_OBJECTS_NUM = 50000 + + class DatasetFormations(enum.Enum): """ How a dataset is organized and interpreted """ diff --git a/tests/msdatasets/test_dataset_upload.py b/tests/msdatasets/test_dataset_upload.py index 1179414d..3d35d480 100644 --- a/tests/msdatasets/test_dataset_upload.py +++ b/tests/msdatasets/test_dataset_upload.py @@ -6,9 +6,13 @@ import unittest import zipfile from modelscope.msdatasets import MsDataset -from modelscope.utils.constant import ModelFile +from modelscope.msdatasets.utils.dataset_utils import list_dataset_objects +from modelscope.utils import logger as logging +from modelscope.utils.constant import DEFAULT_DATASET_REVISION, ModelFile from modelscope.utils.test_utils import test_level +logger = logging.get_logger(__name__) + KEY_EXTRACTED = 'extracted' @@ -39,7 +43,8 @@ class DatasetUploadTest(unittest.TestCase): def tearDown(self): os.chdir(self.old_dir) shutil.rmtree(self.temp_dir, ignore_errors=True) - print('The test dir successfully removed!') + logger.info( + f'Temporary directory {self.temp_dir} successfully removed!') @staticmethod def get_raw_downloaded_file_path(extracted_path): @@ -68,6 +73,40 @@ class DatasetUploadTest(unittest.TestCase): dataset_name=self.dataset_name, namespace=self.namespace) + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_upload_dir(self): + ms_ds_train = MsDataset.load(self.prepared_dataset_name, split='train') + config_train = ms_ds_train._hf_ds.config_kwargs + extracted_path_train = config_train.get('split_config').get('train') + + MsDataset.upload( + object_name='train', + local_file_path=os.path.join(extracted_path_train, + 'Pets/images/train'), + dataset_name=self.dataset_name, + namespace=self.namespace) + MsDataset.upload( + object_name='val', + local_file_path=os.path.join(extracted_path_train, + 'Pets/images/val'), + dataset_name=self.dataset_name, + namespace=self.namespace) + + objects = list_dataset_objects( + hub_api=self.api, + max_limit=-1, + is_recursive=True, + dataset_name=self.dataset_name, + namespace=self.namespace, + version=DEFAULT_DATASET_REVISION) + + logger.info(f'{len(objects)} objects have been uploaded: {objects}') + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_ds_download_dir(self): + test_ds = MsDataset.load(self.dataset_name, self.namespace) + assert test_ds.config_kwargs['split_config'].values() + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') def test_ds_clone_meta(self): MsDataset.clone_meta( From deb847614a518537a22567209519dbea89feabcd Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Fri, 14 Oct 2022 21:59:52 +0800 Subject: [PATCH 40/57] [to #42322933] limit espnet version --- requirements/audio.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/audio.txt b/requirements/audio.txt index 742cf166..bef32121 100644 --- a/requirements/audio.txt +++ b/requirements/audio.txt @@ -1,5 +1,5 @@ easyasr>=0.0.2 -espnet>=202204 +espnet==202204 h5py inflect keras From 202fcdf2984a214e8d4a55b11607eefafa77af0f Mon Sep 17 00:00:00 2001 From: "caorongyu.cry" Date: Fri, 14 Oct 2022 23:11:19 +0800 Subject: [PATCH 41/57] [to #42322933] change tableqa output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修改output的结构,直接返回可转化成json format的结构 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10415403 --- .../models/nlp/table_question_answering.py | 6 +++--- modelscope/outputs.py | 7 ++++++- 
.../nlp/table_question_answering_pipeline.py | 3 ++- .../pipelines/test_table_question_answering.py | 18 +++++++++++------- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/modelscope/models/nlp/table_question_answering.py b/modelscope/models/nlp/table_question_answering.py index c6a03ef3..c2134df2 100644 --- a/modelscope/models/nlp/table_question_answering.py +++ b/modelscope/models/nlp/table_question_answering.py @@ -691,11 +691,11 @@ class TableQuestionAnswering(Model): sels.append(l_hs[ib] - 1) aggs.append(sql['agg'][ia]) continue - sels.append(sel) + sels.append(int(sel)) if sql['agg'][ia] == -1: aggs.append(0) else: - aggs.append(sql['agg'][ia]) + aggs.append(int(sql['agg'][ia])) if len(sels) == 0: sels.append(l_hs[ib] - 1) aggs.append(0) @@ -712,7 +712,7 @@ class TableQuestionAnswering(Model): for i in range(wl): if wc_os[i] == -1: continue - conds.append([wc_os[i], wo_os[i], pr_wvi_str[ib][i]]) + conds.append([int(wc_os[i]), int(wo_os[i]), pr_wvi_str[ib][i]]) if len(conds) == 0: conds.append([l_hs[ib] - 1, 2, 'Nulll']) sql['conds'] = conds diff --git a/modelscope/outputs.py b/modelscope/outputs.py index 3001c03c..c08779b4 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -36,6 +36,8 @@ class OutputKeys(object): UUID = 'uuid' WORD = 'word' KWS_LIST = 'kws_list' + SQL_STRING = 'sql_string' + SQL_QUERY = 'sql_query' HISTORY = 'history' QUERT_RESULT = 'query_result' TIMESTAMPS = 'timestamps' @@ -583,7 +585,10 @@ TASK_OUTPUTS = { # "sql": "SELECT shop.Name FROM shop." # "sql_history": {sel: 0, agg: 0, conds: [[0, 0, 'val']]} # } - Tasks.table_question_answering: [OutputKeys.OUTPUT, OutputKeys.HISTORY], + Tasks.table_question_answering: [ + OutputKeys.SQL_STRING, OutputKeys.SQL_QUERY, OutputKeys.HISTORY, + OutputKeys.QUERT_RESULT + ], # ============ audio tasks =================== # asr result for single sample diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index e1b2b07b..ca17c9b1 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -311,7 +311,8 @@ class TableQuestionAnsweringPipeline(Pipeline): tabledata = {'headers': [], 'cells': []} output = { - OutputKeys.OUTPUT: sql, + OutputKeys.SQL_STRING: sql.string, + OutputKeys.SQL_QUERY: sql.query, OutputKeys.HISTORY: result['sql'], OutputKeys.QUERT_RESULT: json.dumps(tabledata, ensure_ascii=False), } diff --git a/tests/pipelines/test_table_question_answering.py b/tests/pipelines/test_table_question_answering.py index 68e0564f..3d943e51 100644 --- a/tests/pipelines/test_table_question_answering.py +++ b/tests/pipelines/test_table_question_answering.py @@ -3,10 +3,12 @@ import os import unittest from typing import List +import json from transformers import BertTokenizer from modelscope.hub.snapshot_download import snapshot_download from modelscope.models import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline from modelscope.pipelines.nlp import TableQuestionAnsweringPipeline from modelscope.preprocessors import TableQuestionAnsweringPreprocessor @@ -38,11 +40,12 @@ def tableqa_tracking_and_print_results_with_history( 'history_sql': historical_queries }) print('question', question) - print('sql text:', output_dict['output'].string) - print('sql query:', output_dict['output'].query) - print('query result:', output_dict['query_result']) + print('sql text:', 
output_dict[OutputKeys.SQL_STRING]) + print('sql query:', output_dict[OutputKeys.SQL_QUERY]) + print('query result:', output_dict[OutputKeys.QUERT_RESULT]) + print('json dumps', json.dumps(output_dict)) print() - historical_queries = output_dict['history'] + historical_queries = output_dict[OutputKeys.HISTORY] def tableqa_tracking_and_print_results_without_history( @@ -60,9 +63,10 @@ def tableqa_tracking_and_print_results_without_history( for question in test_case['utterance']: output_dict = p({'question': question}) print('question', question) - print('sql text:', output_dict['output'].string) - print('sql query:', output_dict['output'].query) - print('query result:', output_dict['query_result']) + print('sql text:', output_dict[OutputKeys.SQL_STRING]) + print('sql query:', output_dict[OutputKeys.SQL_QUERY]) + print('query result:', output_dict[OutputKeys.QUERT_RESULT]) + print('json dumps', json.dumps(output_dict)) print() From 7e7303a658fae50a027420547035bb7319c64c76 Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Sat, 15 Oct 2022 08:51:06 +0800 Subject: [PATCH 42/57] [to #42322933] remove fasttext from nlp requirements --- modelscope/utils/error.py | 9 +++++++++ modelscope/utils/import_utils.py | 1 + requirements/nlp.txt | 3 +-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/modelscope/utils/error.py b/modelscope/utils/error.py index a6bbc8b3..a894063c 100644 --- a/modelscope/utils/error.py +++ b/modelscope/utils/error.py @@ -111,3 +111,12 @@ You can install it with pip on linux: On windows, please checkout the instructions on the installation page: https://github.com/facebookresearch/fairseq and follow the ones that match your environment. """ + +# docstyle-ignore +FASTTEXT_IMPORT_ERROR = """ +{0} requires the fasttext library but it was not found in your environment. +You can install it with pip on linux or mac: +`pip install fasttext` +Or you can checkout the instructions on the +installation page: https://github.com/facebookresearch/fastText and follow the ones that match your environment. 
+""" diff --git a/modelscope/utils/import_utils.py b/modelscope/utils/import_utils.py index 2a6fdc80..5db5ea98 100644 --- a/modelscope/utils/import_utils.py +++ b/modelscope/utils/import_utils.py @@ -292,6 +292,7 @@ REQUIREMENTS_MAAPING = OrderedDict([ ('decord', (is_package_available('decord'), DECORD_IMPORT_ERROR)), ('deepspeed', (is_package_available('deepspeed'), DEEPSPEED_IMPORT_ERROR)), ('fairseq', (is_package_available('fairseq'), FAIRSEQ_IMPORT_ERROR)), + ('fasttext', (is_package_available('fasttext'), FASTTEXT_IMPORT_ERROR)), ]) SYSTEM_PACKAGE = set(['os', 'sys', 'typing']) diff --git a/requirements/nlp.txt b/requirements/nlp.txt index 123c238e..a5f3cbd9 100644 --- a/requirements/nlp.txt +++ b/requirements/nlp.txt @@ -1,5 +1,4 @@ en_core_web_sm>=2.3.5 -fasttext jieba>=0.42.1 megatron_util pai-easynlp @@ -14,4 +13,4 @@ spacy>=2.3.5 subword_nmt>=0.3.8 text2sql_lgesql tokenizers -transformers +transformers>=4.12.0 From 4682783619f86726d0bb3c880c2100e91d126355 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Sat, 15 Oct 2022 20:33:55 +0800 Subject: [PATCH 43/57] [to #44902165] bump version to 0.5.0 --- modelscope/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelscope/version.py b/modelscope/version.py index 1e4826d6..2b8877c5 100644 --- a/modelscope/version.py +++ b/modelscope/version.py @@ -1 +1 @@ -__version__ = '0.4.7' +__version__ = '0.5.0' From f6e542cdcb6c1a1be690750bebda791ed5c90589 Mon Sep 17 00:00:00 2001 From: "wenmeng.zwm" Date: Mon, 17 Oct 2022 10:40:08 +0800 Subject: [PATCH 44/57] refine pipeline input to support demo service * image_captioninig support single image and dict input * image_style_transfer use dict input Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10417330 --- modelscope/pipeline_inputs.py | 16 ++++++++++------ modelscope/pipelines/base.py | 6 +++++- tests/pipelines/test_image_style_transfer.py | 15 +++++++++------ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 2b14c278..34b731c6 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -97,8 +97,10 @@ TASK_INPUTS = { InputType.IMAGE, Tasks.image_to_image_translation: InputType.IMAGE, - Tasks.image_style_transfer: - InputType.IMAGE, + Tasks.image_style_transfer: { + 'content': InputType.IMAGE, + 'style': InputType.IMAGE, + }, Tasks.image_portrait_stylization: InputType.IMAGE, Tasks.live_category: @@ -147,8 +149,9 @@ TASK_INPUTS = { InputType.TEXT, Tasks.translation: InputType.TEXT, - Tasks.word_segmentation: - InputType.TEXT, + Tasks.word_segmentation: [InputType.TEXT, { + 'text': InputType.TEXT, + }], Tasks.part_of_speech: InputType.TEXT, Tasks.named_entity_recognition: @@ -194,8 +197,9 @@ TASK_INPUTS = { InputType.AUDIO, # ============ multi-modal tasks =================== - Tasks.image_captioning: - InputType.IMAGE, + Tasks.image_captioning: [InputType.IMAGE, { + 'image': InputType.IMAGE, + }], Tasks.visual_grounding: { 'image': InputType.IMAGE, 'text': InputType.TEXT diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 5732a9d7..ea329be4 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -236,7 +236,11 @@ class Pipeline(ABC): if isinstance(input_type, list): matched_type = None for t in input_type: - if type(t) == type(input): + if isinstance(input, (dict, tuple)): + if type(t) == type(input): + matched_type = t + break + elif isinstance(t, str): matched_type = t break if matched_type is 
None: diff --git a/tests/pipelines/test_image_style_transfer.py b/tests/pipelines/test_image_style_transfer.py index a02d5308..5f37f204 100644 --- a/tests/pipelines/test_image_style_transfer.py +++ b/tests/pipelines/test_image_style_transfer.py @@ -25,8 +25,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_style_transfer, model=snapshot_path) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer1.png', result[OutputKeys.OUTPUT_IMG]) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') @@ -35,8 +36,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): Tasks.image_style_transfer, model=self.model_id) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer2.png', result[OutputKeys.OUTPUT_IMG]) print('style_transfer.test_run_modelhub done') @@ -45,8 +47,9 @@ class ImageStyleTransferTest(unittest.TestCase, DemoCompatibilityCheck): image_style_transfer = pipeline(Tasks.image_style_transfer) result = image_style_transfer( - 'data/test/images/style_transfer_content.jpg', - style='data/test/images/style_transfer_style.jpg') + dict( + content='data/test/images/style_transfer_content.jpg', + style='data/test/images/style_transfer_style.jpg')) cv2.imwrite('result_styletransfer3.png', result[OutputKeys.OUTPUT_IMG]) print('style_transfer.test_run_modelhub_default_model done') From 88a7599efb0168cd1914e19d0990ab1b37fc7406 Mon Sep 17 00:00:00 2001 From: "wenqi.oywq" Date: Mon, 17 Oct 2022 14:05:12 +0800 Subject: [PATCH 45/57] [to #42322933]change output channels from RGB to BGR, to consistent with demo-service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 默认输出为array的,通道格式统一为BGR格式,本次修改是为了与这个格式一致 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10422508 --- modelscope/pipelines/cv/image_color_enhance_pipeline.py | 2 +- tests/pipelines/test_image_color_enhance.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modelscope/pipelines/cv/image_color_enhance_pipeline.py b/modelscope/pipelines/cv/image_color_enhance_pipeline.py index d21d879c..3a4cf8bc 100644 --- a/modelscope/pipelines/cv/image_color_enhance_pipeline.py +++ b/modelscope/pipelines/cv/image_color_enhance_pipeline.py @@ -55,5 +55,5 @@ class ImageColorEnhancePipeline(Pipeline): def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: output_img = (inputs['outputs'].squeeze(0) * 255.).type( - torch.uint8).cpu().permute(1, 2, 0).numpy() + torch.uint8).cpu().permute(1, 2, 0).numpy()[:, :, ::-1] return {OutputKeys.OUTPUT_IMG: output_img} diff --git a/tests/pipelines/test_image_color_enhance.py b/tests/pipelines/test_image_color_enhance.py index 9b72999e..7c3ae8c0 100644 --- a/tests/pipelines/test_image_color_enhance.py +++ b/tests/pipelines/test_image_color_enhance.py @@ -21,8 +21,7 @@ class ImageColorEnhanceTest(unittest.TestCase, DemoCompatibilityCheck): def pipeline_inference(self, pipeline: Pipeline, input_location: str): result = pipeline(input_location) if result is not None: - cv2.imwrite('result.png', 
result[OutputKeys.OUTPUT_IMG][:, :, - [2, 1, 0]]) + cv2.imwrite('result.png', result[OutputKeys.OUTPUT_IMG]) print(f'Output written to {osp.abspath("result.png")}') @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') From 674e1a7878f63603aa3bbc669fbac6a8b8a5b8a5 Mon Sep 17 00:00:00 2001 From: "wendi.hwd" Date: Mon, 17 Oct 2022 14:06:07 +0800 Subject: [PATCH 46/57] [to #42322933]cv/cvdet_fix_outputs->master fix outputs Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10421413 * fix outputs --- .../pipelines/cv/image_detection_pipeline.py | 8 ++++++-- tests/pipelines/test_object_detection.py | 20 ++++--------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/modelscope/pipelines/cv/image_detection_pipeline.py b/modelscope/pipelines/cv/image_detection_pipeline.py index f5554ca2..08633c35 100644 --- a/modelscope/pipelines/cv/image_detection_pipeline.py +++ b/modelscope/pipelines/cv/image_detection_pipeline.py @@ -43,11 +43,15 @@ class ImageDetectionPipeline(Pipeline): bboxes, scores, labels = self.model.postprocess(inputs['data']) if bboxes is None: - return None + outputs = { + OutputKeys.SCORES: [], + OutputKeys.LABELS: [], + OutputKeys.BOXES: [] + } + return outputs outputs = { OutputKeys.SCORES: scores, OutputKeys.LABELS: labels, OutputKeys.BOXES: bboxes } - return outputs diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 2cb217d9..00a71371 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -19,20 +19,14 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): model_id = 'damo/cv_vit_object-detection_coco' object_detect = pipeline(Tasks.image_object_detection, model=model_id) result = object_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_object_detection_with_default_task(self): input_location = 'data/test/images/image_detection.jpg' object_detect = pipeline(Tasks.image_object_detection) result = object_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_human_detection(self): @@ -40,20 +34,14 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): model_id = 'damo/cv_resnet18_human-detection' human_detect = pipeline(Tasks.human_detection, model=model_id) result = human_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_human_detection_with_default_task(self): input_location = 'data/test/images/image_detection.jpg' human_detect = pipeline(Tasks.human_detection) result = human_detect(input_location) - if result: - print(result) - else: - raise ValueError('process error') + print(result) @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): From 542c4ce1b3433ca1d51ac0c0349b3d8f87c51f41 Mon Sep 17 00:00:00 2001 From: "shichen.fsc" Date: Mon, 17 Oct 2022 14:07:05 +0800 Subject: [PATCH 47/57] [to #42322933] Fix bug in KWS when setting customized keyword Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10412829 --- .../audio/asr/generic_automatic_speech_recognition.py | 2 ++ 
modelscope/models/audio/kws/generic_key_word_spotting.py | 2 ++ modelscope/pipelines/audio/kws_kwsbp_pipeline.py | 9 +++++++++ modelscope/preprocessors/asr.py | 2 ++ modelscope/preprocessors/kws.py | 2 ++ tests/pipelines/test_key_word_spotting.py | 2 +- 6 files changed, 18 insertions(+), 1 deletion(-) diff --git a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py index 11accf0a..aebc6751 100644 --- a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py +++ b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict diff --git a/modelscope/models/audio/kws/generic_key_word_spotting.py b/modelscope/models/audio/kws/generic_key_word_spotting.py index c1b7a0e4..2f70327d 100644 --- a/modelscope/models/audio/kws/generic_key_word_spotting.py +++ b/modelscope/models/audio/kws/generic_key_word_spotting.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict diff --git a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py index 450a12bb..5555c9e6 100644 --- a/modelscope/pipelines/audio/kws_kwsbp_pipeline.py +++ b/modelscope/pipelines/audio/kws_kwsbp_pipeline.py @@ -37,6 +37,12 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): **kwargs) -> Dict[str, Any]: if 'keywords' in kwargs.keys(): self.keywords = kwargs['keywords'] + if isinstance(self.keywords, str): + word_list = [] + word = {} + word['keyword'] = self.keywords + word_list.append(word) + self.keywords = word_list else: self.keywords = None @@ -96,6 +102,9 @@ class KeyWordSpottingKwsbpPipeline(Pipeline): pos_list=pos_kws_list, neg_list=neg_kws_list) + if 'kws_list' not in rst_dict: + rst_dict['kws_list'] = [] + return rst_dict def run_with_kwsbp(self, inputs: Dict[str, Any]) -> Dict[str, Any]: diff --git a/modelscope/preprocessors/asr.py b/modelscope/preprocessors/asr.py index d58383d7..facaa132 100644 --- a/modelscope/preprocessors/asr.py +++ b/modelscope/preprocessors/asr.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + import os from typing import Any, Dict, List, Union diff --git a/modelscope/preprocessors/kws.py b/modelscope/preprocessors/kws.py index 9c370ed5..6f09d545 100644 --- a/modelscope/preprocessors/kws.py +++ b/modelscope/preprocessors/kws.py @@ -1,3 +1,5 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+ import os from typing import Any, Dict, List, Union diff --git a/tests/pipelines/test_key_word_spotting.py b/tests/pipelines/test_key_word_spotting.py index 91f9f566..f31d212b 100644 --- a/tests/pipelines/test_key_word_spotting.py +++ b/tests/pipelines/test_key_word_spotting.py @@ -245,7 +245,7 @@ class KeyWordSpottingTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_wav_by_customized_keywords(self): - keywords = [{'keyword': '播放音乐'}] + keywords = '播放音乐' kws_result = self.run_pipeline( model_id=self.model_id, From 8fa385e27cc9a949c8544e838a6250ab527b0685 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Mon, 17 Oct 2022 15:42:24 +0800 Subject: [PATCH 48/57] [to #42322933] Add upload in hub api Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10386689 --- modelscope/hub/git.py | 10 +++ modelscope/hub/upload.py | 117 +++++++++++++++++++++++++ tests/hub/test_hub_upload.py | 164 +++++++++++++++++++++++++++++++++++ 3 files changed, 291 insertions(+) create mode 100644 modelscope/hub/upload.py create mode 100644 tests/hub/test_hub_upload.py diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index a149ede1..db76506e 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import re import subprocess from typing import List from xmlrpc.client import Boolean @@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton): cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision] return self._run_git_command(*cmds) + def get_remote_branches(self, repo_dir: str): + cmds = ['-C', '%s' % repo_dir, 'branch', '-r'] + rsp = self._run_git_command(*cmds) + info = [ + line.strip() + for line in rsp.stdout.decode('utf8').strip().split(os.linesep) + ][1:] + return ['/'.join(line.split('/')[1:]) for line in info] + def pull(self, repo_dir: str): cmds = ['-C', repo_dir, 'pull'] return self._run_git_command(*cmds) diff --git a/modelscope/hub/upload.py b/modelscope/hub/upload.py new file mode 100644 index 00000000..9dffc60e --- /dev/null +++ b/modelscope/hub/upload.py @@ -0,0 +1,117 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import datetime +import os +import shutil +import tempfile +import uuid +from typing import Dict, Optional +from uuid import uuid4 + +from filelock import FileLock + +from modelscope import __version__ +from modelscope.hub.api import HubApi, ModelScopeConfig +from modelscope.hub.errors import InvalidParameter, NotLoginException +from modelscope.hub.git import GitCommandWrapper +from modelscope.hub.repository import Repository +from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +def upload_folder(model_id: str, + model_dir: str, + visibility: int = 0, + license: str = None, + chinese_name: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION): + """ + Upload model from a given directory to given repository. A valid model directory + must contain a configuration.json file. + + This function upload the files in given directory to given repository. If the + given repository is not exists in remote, it will automatically create it with + given visibility, license and chinese_name parameters. If the revision is also + not exists in remote repository, it will create a new branch for it. 
+ + This function must be called before calling HubApi's login with a valid token + which can be obtained from ModelScope's website. + + Args: + model_id (`str`): + The model id to be uploaded, caller must have write permission for it. + model_dir(`str`): + The Absolute Path of the finetune result. + visibility(`int`, defaults to `0`): + Visibility of the new created model(1-private, 5-public). If the model is + not exists in ModelScope, this function will create a new model with this + visibility and this parameter is required. You can ignore this parameter + if you make sure the model's existence. + license(`str`, defaults to `None`): + License of the new created model(see License). If the model is not exists + in ModelScope, this function will create a new model with this license + and this parameter is required. You can ignore this parameter if you + make sure the model's existence. + chinese_name(`str`, *optional*, defaults to `None`): + chinese name of the new created model. + commit_message(`str`, *optional*, defaults to `None`): + commit message of the push request. + revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION): + which branch to push. If the branch is not exists, It will create a new + branch and push to it. + """ + if model_id is None: + raise InvalidParameter('model_id cannot be empty!') + if model_dir is None: + raise InvalidParameter('model_dir cannot be empty!') + if not os.path.exists(model_dir) or os.path.isfile(model_dir): + raise InvalidParameter('model_dir must be a valid directory.') + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + if not os.path.exists(cfg_file): + raise ValueError(f'{model_dir} must contain a configuration.json.') + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise NotLoginException('Must login before upload!') + files_to_save = os.listdir(model_dir) + api = HubApi() + try: + api.get_model(model_id=model_id) + except Exception: + if visibility is None or license is None: + raise InvalidParameter( + 'visibility and license cannot be empty if want to create new repo' + ) + logger.info('Create new model %s' % model_id) + api.create_model( + model_id=model_id, + visibility=visibility, + license=license, + chinese_name=chinese_name) + tmp_dir = tempfile.mkdtemp() + git_wrapper = GitCommandWrapper() + try: + repo = Repository(model_dir=tmp_dir, clone_from=model_id) + branches = git_wrapper.get_remote_branches(tmp_dir) + if revision not in branches: + logger.info('Create new branch %s' % revision) + git_wrapper.new_branch(tmp_dir, revision) + git_wrapper.checkout(tmp_dir, revision) + for f in files_to_save: + if f[0] != '.': + src = os.path.join(model_dir, f) + if os.path.isdir(src): + shutil.copytree(src, os.path.join(tmp_dir, f)) + else: + shutil.copy(src, tmp_dir) + if not commit_message: + date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + commit_message = '[automsg] push model %s to hub at %s' % ( + model_id, date) + repo.push(commit_message=commit_message, branch=revision) + except Exception: + raise + finally: + shutil.rmtree(tmp_dir, ignore_errors=True) diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py new file mode 100644 index 00000000..d7e6e439 --- /dev/null +++ b/tests/hub/test_hub_upload.py @@ -0,0 +1,164 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os +import shutil +import tempfile +import unittest + +from modelscope.hub.api import HubApi +from modelscope.hub.constants import Licenses, ModelVisibility +from modelscope.hub.repository import Repository +from modelscope.hub.upload import upload_folder +from modelscope.utils.constant import ModelFile +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level +from .test_utils import TEST_ACCESS_TOKEN1, delete_credential + +logger = get_logger() + + +class HubUploadTest(unittest.TestCase): + + def setUp(self): + logger.info('SetUp') + self.api = HubApi() + self.user = os.environ.get('TEST_MODEL_ORG', 'citest') + logger.info(self.user) + self.create_model_name = '%s/%s' % (self.user, 'test_model_upload') + temporary_dir = tempfile.mkdtemp() + self.work_dir = temporary_dir + self.model_dir = os.path.join(temporary_dir, self.create_model_name) + self.finetune_path = os.path.join(self.work_dir, 'finetune_path') + self.repo_path = os.path.join(self.work_dir, 'repo_path') + os.mkdir(self.finetune_path) + os.system("echo '{}'>%s" + % os.path.join(self.finetune_path, ModelFile.CONFIGURATION)) + + def tearDown(self): + logger.info('TearDown') + shutil.rmtree(self.model_dir, ignore_errors=True) + self.api.delete_model(model_id=self.create_model_name) + + def test_upload_exits_repo_master(self): + logger.info('basic test for upload!') + self.api.login(TEST_ACCESS_TOKEN1) + self.api.create_model( + model_id=self.create_model_name, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + os.system("echo '111'>%s" + % os.path.join(self.finetune_path, 'add1.py')) + upload_folder( + model_id=self.create_model_name, model_dir=self.finetune_path) + Repository(model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists(os.path.join(self.repo_path, 'add1.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + os.system("echo '222'>%s" + % os.path.join(self.finetune_path, 'add2.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version1') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version1') + assert os.path.exists(os.path.join(self.repo_path, 'add2.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + os.system("echo '333'>%s" + % os.path.join(self.finetune_path, 'add3.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version2', + commit_message='add add3.py') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version2') + assert os.path.exists(os.path.join(self.repo_path, 'add2.py')) + assert os.path.exists(os.path.join(self.repo_path, 'add3.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + add4_path = os.path.join(self.finetune_path, 'temp') + os.mkdir(add4_path) + os.system("echo '444'>%s" % os.path.join(add4_path, 'add4.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_revision/version1') + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_revision/version1') + assert os.path.exists(os.path.join(add4_path, 'add4.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_non_exists_repo(self): + logger.info('test upload non exists repo!') + 
self.api.login(TEST_ACCESS_TOKEN1) + os.system("echo '111'>%s" + % os.path.join(self.finetune_path, 'add1.py')) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + revision='new_model_new_revision', + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, + clone_from=self.create_model_name, + revision='new_model_new_revision') + assert os.path.exists(os.path.join(self.repo_path, 'add1.py')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_without_token(self): + logger.info('test upload without login!') + self.api.login(TEST_ACCESS_TOKEN1) + delete_credential() + try: + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + except Exception as e: + logger.info(e) + self.api.login(TEST_ACCESS_TOKEN1) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists( + os.path.join(self.repo_path, 'configuration.json')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_upload_invalid_repo(self): + logger.info('test upload to invalid repo!') + self.api.login(TEST_ACCESS_TOKEN1) + try: + upload_folder( + model_id='%s/%s' % ('speech_tts', 'invalid_model_test'), + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + except Exception as e: + logger.info(e) + upload_folder( + model_id=self.create_model_name, + model_dir=self.finetune_path, + visibility=ModelVisibility.PUBLIC, + license=Licenses.APACHE_V2) + Repository( + model_dir=self.repo_path, clone_from=self.create_model_name) + assert os.path.exists( + os.path.join(self.repo_path, 'configuration.json')) + shutil.rmtree(self.repo_path, ignore_errors=True) + + +if __name__ == '__main__': + unittest.main() From 7720ae50e241ed3a5cf319d9410b774228d8126c Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Mon, 17 Oct 2022 20:30:42 +0800 Subject: [PATCH 49/57] return dict values when input single sample for easycv pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10423383 --- .../pipelines/cv/easycv_pipelines/base.py | 18 +++++++++++++++++- .../cv/easycv_pipelines/detection_pipeline.py | 3 +++ .../face_2d_keypoints_pipeline.py | 3 +++ .../human_wholebody_keypoint_pipeline.py | 3 +++ tests/pipelines/test_face_2d_keypoints.py | 2 +- tests/pipelines/test_hand_2d_keypoints.py | 9 ++------- .../pipelines/test_human_wholebody_keypoint.py | 2 +- tests/pipelines/test_object_detection.py | 2 +- 8 files changed, 31 insertions(+), 11 deletions(-) diff --git a/modelscope/pipelines/cv/easycv_pipelines/base.py b/modelscope/pipelines/cv/easycv_pipelines/base.py index 8aea1146..c130aea0 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/base.py +++ b/modelscope/pipelines/cv/easycv_pipelines/base.py @@ -4,7 +4,9 @@ import os import os.path as osp from typing import Any +import numpy as np from easycv.utils.ms_utils import EasyCVMeta +from PIL import ImageFile from modelscope.hub.snapshot_download import snapshot_download from modelscope.pipelines.util import is_official_hub_path @@ -94,5 +96,19 @@ class EasyCVPipeline(object): return 
easycv_config + def _is_single_inputs(self, inputs): + if isinstance(inputs, str) or (isinstance(inputs, list) + and len(inputs) == 1) or isinstance( + inputs, np.ndarray) or isinstance( + inputs, ImageFile.ImageFile): + return True + + return False + def __call__(self, inputs) -> Any: - return self.predict_op(inputs) + outputs = self.predict_op(inputs) + + if self._is_single_inputs(inputs): + outputs = outputs[0] + + return outputs diff --git a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py index 0c2058d5..a1173bc4 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/detection_pipeline.py @@ -57,4 +57,7 @@ class EasyCVDetectionPipeline(EasyCVPipeline): OutputKeys.BOXES: boxes } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py index 7c32e0fc..b48d013e 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/face_2d_keypoints_pipeline.py @@ -40,4 +40,7 @@ class Face2DKeypointsPipeline(EasyCVPipeline): OutputKeys.POSES: output['pose'] } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py index 263f8225..936accbf 100644 --- a/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py +++ b/modelscope/pipelines/cv/easycv_pipelines/human_wholebody_keypoint_pipeline.py @@ -62,4 +62,7 @@ class HumanWholebodyKeypointsPipeline(EasyCVPipeline): OutputKeys.BOXES: output['boxes'] } for output in outputs] + if self._is_single_inputs(inputs): + results = results[0] + return results diff --git a/tests/pipelines/test_face_2d_keypoints.py b/tests/pipelines/test_face_2d_keypoints.py index 667ecddc..a5e347e8 100644 --- a/tests/pipelines/test_face_2d_keypoints.py +++ b/tests/pipelines/test_face_2d_keypoints.py @@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): face_2d_keypoints_align = pipeline( task=Tasks.face_2d_keypoints, model=model_id) - output = face_2d_keypoints_align(img_path)[0] + output = face_2d_keypoints_align(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] output_pose = output[OutputKeys.POSES] diff --git a/tests/pipelines/test_hand_2d_keypoints.py b/tests/pipelines/test_hand_2d_keypoints.py index 86cd2d06..43b569d0 100644 --- a/tests/pipelines/test_hand_2d_keypoints.py +++ b/tests/pipelines/test_hand_2d_keypoints.py @@ -15,10 +15,8 @@ class Hand2DKeypointsPipelineTest(unittest.TestCase): model_id = 'damo/cv_hrnetw18_hand-pose-keypoints_coco-wholebody' hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints, model=model_id) - outputs = hand_keypoint(img_path) - self.assertEqual(len(outputs), 1) + results = hand_keypoint(img_path) - results = outputs[0] self.assertIn(OutputKeys.KEYPOINTS, results.keys()) self.assertIn(OutputKeys.BOXES, results.keys()) self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) @@ -30,10 +28,7 @@ class Hand2DKeypointsPipelineTest(unittest.TestCase): img_path = 'data/test/images/hand_keypoints.jpg' hand_keypoint = pipeline(task=Tasks.hand_2d_keypoints) - outputs = 
hand_keypoint(img_path) - self.assertEqual(len(outputs), 1) - - results = outputs[0] + results = hand_keypoint(img_path) self.assertIn(OutputKeys.KEYPOINTS, results.keys()) self.assertIn(OutputKeys.BOXES, results.keys()) self.assertEqual(results[OutputKeys.KEYPOINTS].shape[1], 21) diff --git a/tests/pipelines/test_human_wholebody_keypoint.py b/tests/pipelines/test_human_wholebody_keypoint.py index b214f4e1..7c5946cc 100644 --- a/tests/pipelines/test_human_wholebody_keypoint.py +++ b/tests/pipelines/test_human_wholebody_keypoint.py @@ -18,7 +18,7 @@ class EasyCVFace2DKeypointsPipelineTest(unittest.TestCase): human_wholebody_keypoint_pipeline = pipeline( task=Tasks.human_wholebody_keypoint, model=model_id) - output = human_wholebody_keypoint_pipeline(img_path)[0] + output = human_wholebody_keypoint_pipeline(img_path) output_keypoints = output[OutputKeys.KEYPOINTS] output_pose = output[OutputKeys.BOXES] diff --git a/tests/pipelines/test_object_detection.py b/tests/pipelines/test_object_detection.py index 00a71371..64766c77 100644 --- a/tests/pipelines/test_object_detection.py +++ b/tests/pipelines/test_object_detection.py @@ -55,7 +55,7 @@ class ObjectDetectionTest(unittest.TestCase, DemoCompatibilityCheck): image_object_detection_auto = pipeline( Tasks.image_object_detection, model=model_id) - result = image_object_detection_auto(test_image)[0] + result = image_object_detection_auto(test_image) image_object_detection_auto.show_result(test_image, result, 'auto_demo_ret.jpg') From ac07b719e9b83c5da6c108e75e0767211f343016 Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 17 Oct 2022 20:51:58 +0800 Subject: [PATCH 50/57] [to #45546922]feat: add fasttext package Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10431169 * [to #45546922]feat: add fasttext package --- docker/Dockerfile.ubuntu | 2 +- modelscope/hub/errors.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index a9a409b5..6dafbc3e 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \ ENV SHELL=/bin/bash # install special package -RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq +RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl RUN if [ "$USE_GPU" = "True" ] ; then \ pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \ diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index fb483287..bd7a20ac 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -53,8 +53,8 @@ def handle_http_post_error(response, url, request_body): try: response.raise_for_status() except HTTPError as error: - logger.error('Request %s with body: %s exception, respoonse body: %s' % - (url, request_body, response.body)) + logger.error('Request %s with body: %s exception' % + (url, request_body)) raise error From 271e2a2a9916de3bd64e40dd4c836d341fed4b77 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Mon, 17 Oct 2022 20:54:29 +0800 Subject: [PATCH 51/57] [to #42322933] Add gpt_neo model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. 添加 gpt_neo 模型,因 checkpoint 归属于 Langboat 还未上传到模型库,已线下完成测试 2. 
添加 text-generation task models 与 head,后续会将 gpt3,palm 等已上线文本生成模型统一为 backbone + head 结构的 task models Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10404249 --- modelscope/metainfo.py | 5 ++ modelscope/models/nlp/__init__.py | 4 +- modelscope/models/nlp/backbones/gpt_neo.py | 15 ++++ .../models/nlp/heads/text_generation_head.py | 35 ++++++++ modelscope/models/nlp/task_models/__init__.py | 2 + .../models/nlp/task_models/text_generation.py | 79 +++++++++++++++++++ .../pipelines/nlp/text_generation_pipeline.py | 38 ++++++--- modelscope/preprocessors/__init__.py | 2 + modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 21 +++++ tests/pipelines/test_text_generation.py | 13 +++ tests/utils/test_ast.py | 2 +- 12 files changed, 207 insertions(+), 11 deletions(-) create mode 100644 modelscope/models/nlp/backbones/gpt_neo.py create mode 100644 modelscope/models/nlp/heads/text_generation_head.py create mode 100644 modelscope/models/nlp/task_models/text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 2e3fed98..fb99bc71 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -71,6 +71,7 @@ class Models(object): gcnncrf = 'gcnn-crf' bart = 'bart' gpt3 = 'gpt3' + gpt_neo = 'gpt-neo' plug = 'plug' bert_for_ds = 'bert-for-document-segmentation' ponet = 'ponet' @@ -101,6 +102,7 @@ class TaskModels(object): information_extraction = 'information-extraction' fill_mask = 'fill-mask' feature_extraction = 'feature-extraction' + text_generation = 'text-generation' class Heads(object): @@ -116,6 +118,8 @@ class Heads(object): token_classification = 'token-classification' # extraction information_extraction = 'information-extraction' + # text gen + text_generation = 'text-generation' class Pipelines(object): @@ -341,6 +345,7 @@ class Preprocessors(object): re_tokenizer = 're-tokenizer' document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' + sentence_piece = 'sentence-piece' # audio preprocessor linear_aec_fbank = 'linear-aec-fbank' diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 8ef96365..9e830d17 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -30,7 +30,8 @@ if TYPE_CHECKING: InformationExtractionModel, SequenceClassificationModel, SingleBackboneTaskModelBase, - TokenClassificationModel) + TokenClassificationModel, + TaskModelForTextGeneration) from .token_classification import SbertForTokenClassification from .sentence_embedding import SentenceEmbedding from .passage_ranking import PassageRanking @@ -69,6 +70,7 @@ else: 'SequenceClassificationModel', 'SingleBackboneTaskModelBase', 'TokenClassificationModel', + 'TaskModelForTextGeneration', ], 'token_classification': ['SbertForTokenClassification'], 'table_question_answering': ['TableQuestionAnswering'], diff --git a/modelscope/models/nlp/backbones/gpt_neo.py b/modelscope/models/nlp/backbones/gpt_neo.py new file mode 100644 index 00000000..a2d0c374 --- /dev/null +++ b/modelscope/models/nlp/backbones/gpt_neo.py @@ -0,0 +1,15 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
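The new backbone below wraps the Hugging Face GPT-Neo implementation and builds its configuration directly from keyword arguments, which is what allows it to be registered and constructed from a plain configuration dict. A minimal illustrative sketch of that construction pattern follows; the vocab_size value is a placeholder, not the setting of any released checkpoint.

from transformers import GPTNeoConfig, GPTNeoModel

# Keyword arguments become a GPTNeoConfig; unspecified fields keep the HF defaults.
config = GPTNeoConfig(vocab_size=50257)
backbone = GPTNeoModel(config)
print(backbone.config.num_layers)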
+from transformers import GPTNeoConfig +from transformers import GPTNeoModel as GPTNeoModelTransform + +from modelscope.metainfo import Models +from modelscope.models.builder import BACKBONES +from modelscope.utils.constant import Fields + + +@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.gpt_neo) +class GPTNeoModel(GPTNeoModelTransform): + + def __init__(self, **kwargs): + config = GPTNeoConfig(**kwargs) + super().__init__(config) diff --git a/modelscope/models/nlp/heads/text_generation_head.py b/modelscope/models/nlp/heads/text_generation_head.py new file mode 100644 index 00000000..606d5a1f --- /dev/null +++ b/modelscope/models/nlp/heads/text_generation_head.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Dict + +import torch +import torch.nn.functional as F +from torch import nn + +from modelscope.metainfo import Heads +from modelscope.models.base import TorchHead +from modelscope.models.builder import HEADS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@HEADS.register_module( + Tasks.text_generation, module_name=Heads.text_generation) +class TextGenerationHead(TorchHead): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + config = self.config + self.linear = nn.Linear( + config['hidden_size'], config['vocab_size'], bias=False) + + def get_output_embeddings(self): + return self.linear + + def forward(self, inputs=None): + logits = self.linear(inputs) + return {OutputKeys.LOGITS: logits} + + def compute_loss(self, outputs: Dict[str, torch.Tensor], + labels) -> Dict[str, torch.Tensor]: + logits = outputs[OutputKeys.LOGITS] + return {OutputKeys.LOSS: F.cross_entropy(logits, labels)} diff --git a/modelscope/models/nlp/task_models/__init__.py b/modelscope/models/nlp/task_models/__init__.py index 90f22aa1..38359044 100644 --- a/modelscope/models/nlp/task_models/__init__.py +++ b/modelscope/models/nlp/task_models/__init__.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: from .sequence_classification import SequenceClassificationModel from .task_model import SingleBackboneTaskModelBase from .token_classification import TokenClassificationModel + from .text_generation import TaskModelForTextGeneration else: _import_structure = { @@ -19,6 +20,7 @@ else: 'sequence_classification': ['SequenceClassificationModel'], 'task_model': ['SingleBackboneTaskModelBase'], 'token_classification': ['TokenClassificationModel'], + 'text_generation': ['TaskModelForTextGeneration'], } import sys diff --git a/modelscope/models/nlp/task_models/text_generation.py b/modelscope/models/nlp/task_models/text_generation.py new file mode 100644 index 00000000..973198ae --- /dev/null +++ b/modelscope/models/nlp/task_models/text_generation.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
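The task model defined below composes a registered backbone with the text-generation head added above, and when `shared_embedding` is enabled it ties the head's output projection to the backbone's input embedding. A minimal sketch of that weight-tying step, using plain PyTorch modules as stand-ins (the sizes are arbitrary):

import torch.nn as nn

vocab_size, hidden_size = 100, 16
embedding = nn.Embedding(vocab_size, hidden_size)          # stands in for the backbone's input embedding
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)   # stands in for the head's output projection
lm_head.weight = embedding.weight                          # the two modules now share one parameter
assert lm_head.weight.data_ptr() == embedding.weight.data_ptr()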
+from typing import Any, Dict + +import addict +import numpy as np +from transformers.modeling_utils import PreTrainedModel + +from modelscope.metainfo import TaskModels +from modelscope.models.builder import MODELS +from modelscope.models.nlp.task_models.task_model import \ + SingleBackboneTaskModelBase +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + +__all__ = ['TaskModelForTextGeneration'] + + +@MODELS.register_module( + Tasks.text_generation, module_name=TaskModels.text_generation) +class TaskModelForTextGeneration(SingleBackboneTaskModelBase, PreTrainedModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the text generation model from the `model_dir` path. + + Args: + model_dir (str): the model path. + """ + super().__init__(model_dir, *args, **kwargs) + if 'base_model_prefix' in kwargs: + self._base_model_prefix = kwargs['base_model_prefix'] + + self.build_backbone(self.backbone_cfg) + self.build_head(self.head_cfg) + if self.config.get('shared_embedding', False): + input_embeddings = self.backbone.get_input_embeddings() + output_embeddings = self.head.get_output_embeddings() + output_embeddings.weight = input_embeddings.weight + + def forward(self, **input: Dict[str, Any]) -> Dict[str, np.ndarray]: + # backbone do not need labels, only head need for loss compute + labels = input.pop(OutputKeys.LABELS, None) + + backbone_outputs = super().forward(input) + hidden_states = backbone_outputs[0] + + outputs = self.head.forward(hidden_states) + if labels is not None: + input[OutputKeys.LABELS] = labels + loss = self.compute_loss(outputs, labels) + outputs.update(loss) + return addict.Dict(outputs) + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + token_type_ids = kwargs.get('token_type_ids', None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get('attention_mask', None) + position_ids = kwargs.get('position_ids', None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + 'input_ids': input_ids, + 'past_key_values': past, + 'use_cache': kwargs.get('use_cache'), + 'position_ids': position_ids, + 'attention_mask': attention_mask, + 'token_type_ids': token_type_ids, + } diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index ea35763f..ae92f26a 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -6,10 +6,12 @@ import torch from modelscope.metainfo import Pipelines from modelscope.models.base import Model +from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline, Tensor from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import TextGenerationPreprocessor -from modelscope.utils.constant import Tasks +from modelscope.preprocessors import Preprocessor, build_preprocessor +from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.hub import read_config __all__ = ['TextGenerationPipeline'] @@ -20,7 +22,7 @@ class 
TextGenerationPipeline(Pipeline): def __init__(self, model: Union[Model, str], - preprocessor: Optional[TextGenerationPreprocessor] = None, + preprocessor: Optional[Preprocessor] = None, first_sequence='sentence', **kwargs): """Use `model` and `preprocessor` to create a generation pipeline for prediction. @@ -50,19 +52,34 @@ class TextGenerationPipeline(Pipeline): """ model = model if isinstance(model, Model) else Model.from_pretrained(model) + cfg = read_config(model.model_dir) + self.postprocessor = cfg.pop('postprocessor', None) if preprocessor is None: - preprocessor = TextGenerationPreprocessor( + preprocessor_cfg = cfg.preprocessor + preprocessor_cfg.update({ + 'model_dir': model.model_dir, - first_sequence=first_sequence, - second_sequence=None, - sequence_length=kwargs.pop('sequence_length', 128)) + 'first_sequence': + first_sequence, + 'second_sequence': + None, + 'sequence_length': + kwargs.pop('sequence_length', 128) + }) + preprocessor = build_preprocessor(preprocessor_cfg, Fields.nlp) model.eval() super().__init__(model=model, preprocessor=preprocessor, **kwargs) + def _sanitize_parameters(self, **pipeline_parameters): + return {}, pipeline_parameters, {} + def forward(self, inputs: Dict[str, Any], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model.generate(inputs) + return self.model.generate(inputs, **forward_params) + + def sentence_piece(self, inputs) -> Dict[str, Tensor]: + return self.preprocessor.tokenizer.decode(inputs.tolist())[0] def postprocess(self, inputs: Dict[str, Tensor], **postprocess_params) -> Dict[str, str]: @@ -74,4 +91,7 @@ class TextGenerationPipeline(Pipeline): Returns: Dict[str, str]: the prediction results """ - return inputs + return inputs if self.postprocessor is None else { + OutputKeys.TEXT: + getattr(self, self.postprocessor.replace('-', '_'))(inputs) + } diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 90303b65..43fa64a7 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + SentencePiecePreprocessor, ) from .space import (DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, @@ -71,6 +72,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'SentencePiecePreprocessor', ], 'space': [ 'DialogIntentPredictionPreprocessor', 'DialogModelingPreprocessor', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index dfbb5c81..a753fe6c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + SentencePiecePreprocessor, ) else: @@ -41,6 +42,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'SentencePiecePreprocessor', ], 'text_error_correction': [ 'TextErrorCorrectionPreprocessor', diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index bec7e4e1..3d708634 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -5,6 +5,7 @@ import re from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np +import sentencepiece as spm 
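The generation pipeline above dispatches postprocessing by name, so a `sentence_piece` postprocessor decodes generated ids back into text, while the SentencePiecePreprocessor added further down in this file encodes the prompt with the same SentencePiece model. A small sketch of that encode/decode round trip; 'tokenizer.model' is a placeholder path to a trained SentencePiece model and the prompt is arbitrary.

import sentencepiece as spm
import torch

sp = spm.SentencePieceProcessor(model_file='tokenizer.model')   # placeholder path
ids = torch.tensor(sp.encode(['我是']), dtype=torch.long)        # shape [1, seq_len], as the preprocessor returns
text = sp.decode(ids.tolist())[0]                                # back to a plain string, as in postprocessing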
import torch from transformers import AutoTokenizer @@ -1160,3 +1161,23 @@ class FillMaskPoNetPreprocessor(NLPTokenizerPreprocessorBase): self.labels_to_id(labels, output) return output + + +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.sentence_piece) +class SentencePiecePreprocessor(Preprocessor): + + def __init__(self, model_dir: str, *args, **kwargs): + import os + + super().__init__(*args, **kwargs) + self.tokenizer = None + for file_name in os.listdir(model_dir): + if file_name.endswith('.model'): + m_file = osp.join(model_dir, file_name) + self.tokenizer = spm.SentencePieceProcessor(model_file=m_file) + break + assert self.tokenizer is not None, 'Can not find .model file' + + def __call__(self, data: str) -> Dict[str, Any]: + return torch.tensor(self.tokenizer.encode([data]), dtype=torch.long) diff --git a/tests/pipelines/test_text_generation.py b/tests/pipelines/test_text_generation.py index 66f9c9da..5a270f83 100644 --- a/tests/pipelines/test_text_generation.py +++ b/tests/pipelines/test_text_generation.py @@ -133,6 +133,19 @@ class TextGenerationTest(unittest.TestCase, DemoCompatibilityCheck): def test_demo_compatibility(self): self.compatibility_check() + @unittest.skip("Langboat's checkpoint has not been uploaded to modelhub") + def test_gpt_neo(self): + pipe = pipeline( + task=Tasks.text_generation, model='Langboat/mengzi-gpt-neo-base') + print( + pipe( + '我是', + do_sample=True, + top_k=5, + top_p=1, + max_length=20, + repetition_penalty=0.5)) + if __name__ == '__main__': unittest.main() diff --git a/tests/utils/test_ast.py b/tests/utils/test_ast.py index 9a8ab828..c0624679 100644 --- a/tests/utils/test_ast.py +++ b/tests/utils/test_ast.py @@ -41,7 +41,7 @@ class AstScaningTest(unittest.TestCase): self.assertIsInstance(from_imports, dict) self.assertIsInstance(decorators, list) self.assertListEqual(list(set(imports.keys()) - set(['torch'])), []) - self.assertEqual(len(from_imports.keys()), 7) + self.assertEqual(len(from_imports.keys()), 9) self.assertTrue(from_imports['modelscope.metainfo'] is not None) self.assertEqual(from_imports['modelscope.metainfo'], ['Pipelines']) self.assertEqual(decorators, From 172522d19654a9e6c3d872170753086cf2452411 Mon Sep 17 00:00:00 2001 From: "leyuan.hjy" Date: Mon, 17 Oct 2022 20:58:23 +0800 Subject: [PATCH 52/57] [to #42322933]video-object-detection init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增video-object-detection 算法 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10247489 --- data/test/videos/test_realtime_vod.mp4 | 3 + modelscope/metainfo.py | 2 + .../cv/realtime_object_detection/__init__.py | 2 + .../realtime_video_detector.py | 117 +++++++ .../yolox/exp/build.py | 2 + .../yolox/exp/default/__init__.py | 2 +- .../yolox/exp/default/streamyolo.py | 43 +++ .../yolox/exp/yolox_base.py | 1 - .../yolox/models/__init__.py | 3 + .../yolox/models/dfp_pafpn.py | 307 ++++++++++++++++++ .../yolox/models/network_blocks.py | 1 - .../yolox/models/streamyolo.py | 41 +++ .../yolox/models/tal_head.py | 170 ++++++++++ modelscope/outputs.py | 31 +- ...ealtime_video_object_detection_pipeline.py | 59 ++++ modelscope/utils/constant.py | 1 + modelscope/utils/cv/image_utils.py | 60 ++++ .../test_realtime_video_object_detection.py | 46 +++ 18 files changed, 886 insertions(+), 5 deletions(-) create mode 100644 data/test/videos/test_realtime_vod.mp4 create mode 100644 modelscope/models/cv/realtime_object_detection/realtime_video_detector.py create mode 100644 
modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py create mode 100644 modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py create mode 100644 modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py create mode 100644 tests/pipelines/test_realtime_video_object_detection.py diff --git a/data/test/videos/test_realtime_vod.mp4 b/data/test/videos/test_realtime_vod.mp4 new file mode 100644 index 00000000..a0e44852 --- /dev/null +++ b/data/test/videos/test_realtime_vod.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87 +size 4068783 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index fb99bc71..e4a26303 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -14,6 +14,7 @@ class Models(object): # vision models detection = 'detection' realtime_object_detection = 'realtime-object-detection' + realtime_video_object_detection = 'realtime-video-object-detection' scrfd = 'scrfd' classification_model = 'ClassificationModel' nafnet = 'nafnet' @@ -170,6 +171,7 @@ class Pipelines(object): face_image_generation = 'gan-face-image-generation' product_retrieval_embedding = 'resnet50-product-retrieval-embedding' realtime_object_detection = 'cspnet_realtime-object-detection_yolox' + realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo' face_recognition = 'ir101-face-recognition-cfglint' image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation' image2image_translation = 'image-to-image-translation' diff --git a/modelscope/models/cv/realtime_object_detection/__init__.py b/modelscope/models/cv/realtime_object_detection/__init__.py index aed13cec..66156977 100644 --- a/modelscope/models/cv/realtime_object_detection/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/__init__.py @@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .realtime_detector import RealtimeDetector + from .realtime_video_detector import RealtimeVideoDetector else: _import_structure = { 'realtime_detector': ['RealtimeDetector'], + 'realtime_video_detector': ['RealtimeVideoDetector'], } import sys diff --git a/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py new file mode 100644 index 00000000..fc7339b3 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/realtime_video_detector.py @@ -0,0 +1,117 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
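The detector implemented below is exposed through the new video-object-detection pipeline. Its intended calling pattern, mirroring the unit test added at the end of this patch (the model id and sample video path are taken from that test and assume the model is available on the hub):

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

vod = pipeline(Tasks.video_object_detection,
               model='damo/cv_cspnet_video-object-detection_streamyolo')
result = vod('data/test/videos/test_realtime_vod.mp4')
print(len(result[OutputKeys.BOXES]))  # one entry per decoded frame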
+import argparse +import logging as logger +import os +import os.path as osp +import time + +import cv2 +import json +import torch +from tqdm import tqdm + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.preprocessors import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from .yolox.data.data_augment import ValTransform +from .yolox.exp import get_exp_by_name +from .yolox.utils import postprocess + + +@MODELS.register_module( + group_key=Tasks.video_object_detection, + module_name=Models.realtime_video_object_detection) +class RealtimeVideoDetector(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + self.config = Config.from_file( + os.path.join(self.model_dir, ModelFile.CONFIGURATION)) + + # model type + self.exp = get_exp_by_name(self.config.model_type) + + # build model + self.model = self.exp.get_model() + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE) + ckpt = torch.load(model_path, map_location='cpu') + + # load the model state dict + self.model.load_state_dict(ckpt['model']) + self.model.eval() + + # params setting + self.exp.num_classes = self.config.num_classes + self.confthre = self.config.conf_thr + self.num_classes = self.exp.num_classes + self.nmsthre = self.exp.nmsthre + self.test_size = self.exp.test_size + self.preproc = ValTransform(legacy=False) + self.current_buffer = None + self.label_mapping = self.config['labels'] + + def inference(self, img): + with torch.no_grad(): + outputs, self.current_buffer = self.model( + img, buffer=self.current_buffer, mode='on_pipe') + return outputs + + def forward(self, inputs): + return self.inference_video(inputs) + + def preprocess(self, img): + img = LoadImage.convert_to_ndarray(img) + height, width = img.shape[:2] + self.ratio = min(self.test_size[0] / img.shape[0], + self.test_size[1] / img.shape[1]) + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + + # Video decoding and preprocessing automatically are not supported by Pipeline/Model + # Sending preprocessed video frame tensor to GPU buffer self-adaptively + if next(self.model.parameters()).is_cuda: + img = img.to(next(self.model.parameters()).device) + return img + + def postprocess(self, input): + outputs = postprocess( + input, + self.num_classes, + self.confthre, + self.nmsthre, + class_agnostic=True) + + if len(outputs) == 1: + bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio + scores = outputs[0][:, 5].cpu().numpy() + labels = outputs[0][:, 6].cpu().int().numpy() + pred_label_names = [] + for lab in labels: + pred_label_names.append(self.label_mapping[lab]) + + return bboxes, scores, pred_label_names + + def inference_video(self, v_path): + outputs = [] + desc = 'Detecting video: {}'.format(v_path) + for frame, result in tqdm( + self.inference_video_iter(v_path), desc=desc): + outputs.append(result) + + return outputs + + def inference_video_iter(self, v_path): + capture = cv2.VideoCapture(v_path) + while capture.isOpened(): + ret, frame = capture.read() + if not ret: + break + output = self.preprocess(frame) + output = self.inference(output) + output = self.postprocess(output) + yield frame, output diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py 
b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py index 4858100c..5865c53b 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/build.py @@ -13,6 +13,8 @@ def get_exp_by_name(exp_name): from .default import YoloXNanoExp as YoloXExp elif exp == 'yolox_tiny': from .default import YoloXTinyExp as YoloXExp + elif exp == 'streamyolo': + from .default import StreamYoloExp as YoloXExp else: pass return YoloXExp() diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py index 552bbccd..cfec836c 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/__init__.py @@ -1,5 +1,5 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - +from .streamyolo import StreamYoloExp from .yolox_nano import YoloXNanoExp from .yolox_s import YoloXSExp from .yolox_tiny import YoloXTinyExp diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py new file mode 100644 index 00000000..5a62c8fc --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/default/streamyolo.py @@ -0,0 +1,43 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import os +import sys + +import torch + +from ..yolox_base import Exp as YoloXExp + + +class StreamYoloExp(YoloXExp): + + def __init__(self): + super(YoloXExp, self).__init__() + self.depth = 1.0 + self.width = 1.0 + self.num_classes = 8 + self.test_size = (600, 960) + self.test_conf = 0.3 + self.nmsthre = 0.65 + + def get_model(self): + from ...models import StreamYOLO, DFPPAFPN, TALHead + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if getattr(self, 'model', None) is None: + in_channels = [256, 512, 1024] + backbone = DFPPAFPN( + self.depth, self.width, in_channels=in_channels) + head = TALHead( + self.num_classes, + self.width, + in_channels=in_channels, + gamma=1.0, + ignore_thr=0.5, + ignore_value=1.6) + self.model = StreamYOLO(backbone, head) + + return self.model diff --git a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py index a2a41535..c5159a9f 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/exp/yolox_base.py @@ -1,5 +1,4 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - import os import random diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py index 20b1a0d1..d2e889f1 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/__init__.py @@ -1,6 +1,9 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX from .darknet import CSPDarknet, Darknet +from .dfp_pafpn import DFPPAFPN +from .streamyolo import StreamYOLO +from .tal_head import TALHead from .yolo_fpn import YOLOFPN 
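DFPPAFPN, added below, fuses the FPN features of the current frame with buffered features of the previous frame: each is reduced to half its channels by a 1x1 convolution, the two halves are concatenated, and the result is added back to the current features. A toy sketch of that fusion step; the channel count and spatial size are arbitrary, and `reduce` stands in for the 1x1 `jian*` convolutions.

import torch
import torch.nn as nn

channels = 64
reduce = nn.Conv2d(channels, channels // 2, kernel_size=1)
current = torch.randn(1, channels, 20, 20)    # features of the current frame
previous = torch.randn(1, channels, 20, 20)   # buffered features of the previous frame
fused = torch.cat([reduce(current), reduce(previous)], dim=1) + current   # same shape as `current`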
from .yolo_head import YOLOXHead from .yolo_pafpn import YOLOPAFPN diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py b/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py new file mode 100644 index 00000000..01284791 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/dfp_pafpn.py @@ -0,0 +1,307 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .darknet import CSPDarknet +from .network_blocks import BaseConv, CSPLayer, DWConv + + +class DFPPAFPN(nn.Module): + """ + YOLOv3 model. Darknet 53 is the default backbone of this model. + """ + + def __init__( + self, + depth=1.0, + width=1.0, + in_features=('dark3', 'dark4', 'dark5'), + in_channels=[256, 512, 1024], + depthwise=False, + act='silu', + ): + super().__init__() + self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.lateral_conv0 = BaseConv( + int(in_channels[2] * width), + int(in_channels[1] * width), + 1, + 1, + act=act) + self.C3_p4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) # cat + + self.reduce_conv1 = BaseConv( + int(in_channels[1] * width), + int(in_channels[0] * width), + 1, + 1, + act=act) + self.C3_p3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv2 = Conv( + int(in_channels[0] * width), + int(in_channels[0] * width), + 3, + 2, + act=act) + self.C3_n3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + # bottom-up conv + self.bu_conv1 = Conv( + int(in_channels[1] * width), + int(in_channels[1] * width), + 3, + 2, + act=act) + self.C3_n4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + act=act, + ) + + self.jian2 = Conv( + in_channels=int(in_channels[0] * width), + out_channels=int(in_channels[0] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + self.jian1 = Conv( + in_channels=int(in_channels[1] * width), + out_channels=int(in_channels[1] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + self.jian0 = Conv( + in_channels=int(in_channels[2] * width), + out_channels=int(in_channels[2] * width) // 2, + ksize=1, + stride=1, + act=act, + ) + + def off_forward(self, input): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + rurrent_out_features = self.backbone(torch.split(input, 3, dim=1)[0]) + rurrent_features = [rurrent_out_features[f] for f in self.in_features] + [rurrent_x2, rurrent_x1, rurrent_x0] = rurrent_features + + rurrent_fpn_out0 = self.lateral_conv0(rurrent_x0) # 1024->512/32 + rurrent_f_out0 = F.interpolate( + rurrent_fpn_out0, size=rurrent_x1.shape[2:4], + mode='nearest') # 512/16 + rurrent_f_out0 = torch.cat([rurrent_f_out0, rurrent_x1], + 1) # 512->1024/16 + rurrent_f_out0 = self.C3_p4(rurrent_f_out0) # 1024->512/16 + + rurrent_fpn_out1 = self.reduce_conv1(rurrent_f_out0) # 512->256/16 + rurrent_f_out1 = F.interpolate( + rurrent_fpn_out1, size=rurrent_x2.shape[2:4], + mode='nearest') # 256/8 + rurrent_f_out1 = torch.cat([rurrent_f_out1, rurrent_x2], + 1) # 256->512/8 + rurrent_pan_out2 = self.C3_p3(rurrent_f_out1) # 512->256/8 + + rurrent_p_out1 = self.bu_conv2(rurrent_pan_out2) # 256->256/16 + rurrent_p_out1 = torch.cat([rurrent_p_out1, rurrent_fpn_out1], + 1) # 256->512/16 + rurrent_pan_out1 = self.C3_n3(rurrent_p_out1) # 512->512/16 + + rurrent_p_out0 = self.bu_conv1(rurrent_pan_out1) # 512->512/32 + rurrent_p_out0 = torch.cat([rurrent_p_out0, rurrent_fpn_out0], + 1) # 512->1024/32 + rurrent_pan_out0 = self.C3_n4(rurrent_p_out0) # 1024->1024/32 + + ##### + + support_out_features = self.backbone(torch.split(input, 3, dim=1)[1]) + support_features = [support_out_features[f] for f in self.in_features] + [support_x2, support_x1, support_x0] = support_features + + support_fpn_out0 = self.lateral_conv0(support_x0) # 1024->512/32 + support_f_out0 = F.interpolate( + support_fpn_out0, size=support_x1.shape[2:4], + mode='nearest') # 512/16 + support_f_out0 = torch.cat([support_f_out0, support_x1], + 1) # 512->1024/16 + support_f_out0 = self.C3_p4(support_f_out0) # 1024->512/16 + + support_fpn_out1 = self.reduce_conv1(support_f_out0) # 512->256/16 + support_f_out1 = F.interpolate( + support_fpn_out1, size=support_x2.shape[2:4], + mode='nearest') # 256/8 + support_f_out1 = torch.cat([support_f_out1, support_x2], + 1) # 256->512/8 + support_pan_out2 = self.C3_p3(support_f_out1) # 512->256/8 + + support_p_out1 = self.bu_conv2(support_pan_out2) # 256->256/16 + support_p_out1 = torch.cat([support_p_out1, support_fpn_out1], + 1) # 256->512/16 + support_pan_out1 = self.C3_n3(support_p_out1) # 512->512/16 + + support_p_out0 = self.bu_conv1(support_pan_out1) # 512->512/32 + support_p_out0 = torch.cat([support_p_out0, support_fpn_out0], + 1) # 512->1024/32 + support_pan_out0 = self.C3_n4(support_p_out0) # 1024->1024/32 + + # 0.5 channel + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(support_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(support_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(support_pan_out0)], + dim=1) + rurrent_pan_out0 + + outputs = (pan_out2, pan_out1, pan_out0) + + return outputs + + def online_forward(self, input, buffer=None, node='star'): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + rurrent_out_features = self.backbone(input) + rurrent_features = [rurrent_out_features[f] for f in self.in_features] + [rurrent_x2, rurrent_x1, rurrent_x0] = rurrent_features + + rurrent_fpn_out0 = self.lateral_conv0(rurrent_x0) # 1024->512/32 + rurrent_f_out0 = F.interpolate( + rurrent_fpn_out0, size=rurrent_x1.shape[2:4], + mode='nearest') # 512/16 + rurrent_f_out0 = torch.cat([rurrent_f_out0, rurrent_x1], + 1) # 512->1024/16 + rurrent_f_out0 = self.C3_p4(rurrent_f_out0) # 1024->512/16 + + rurrent_fpn_out1 = self.reduce_conv1(rurrent_f_out0) # 512->256/16 + rurrent_f_out1 = F.interpolate( + rurrent_fpn_out1, size=rurrent_x2.shape[2:4], + mode='nearest') # 256/8 + rurrent_f_out1 = torch.cat([rurrent_f_out1, rurrent_x2], + 1) # 256->512/8 + rurrent_pan_out2 = self.C3_p3(rurrent_f_out1) # 512->256/8 + + rurrent_p_out1 = self.bu_conv2(rurrent_pan_out2) # 256->256/16 + rurrent_p_out1 = torch.cat([rurrent_p_out1, rurrent_fpn_out1], + 1) # 256->512/16 + rurrent_pan_out1 = self.C3_n3(rurrent_p_out1) # 512->512/16 + + rurrent_p_out0 = self.bu_conv1(rurrent_pan_out1) # 512->512/32 + rurrent_p_out0 = torch.cat([rurrent_p_out0, rurrent_fpn_out0], + 1) # 512->1024/32 + rurrent_pan_out0 = self.C3_n4(rurrent_p_out0) # 1024->1024/32 + + ##### + if node == 'star': + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(rurrent_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(rurrent_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(rurrent_pan_out0)], + dim=1) + rurrent_pan_out0 + elif node == 'buffer': + + [support_pan_out2, support_pan_out1, support_pan_out0] = buffer + + pan_out2 = torch.cat( + [self.jian2(rurrent_pan_out2), + self.jian2(support_pan_out2)], + dim=1) + rurrent_pan_out2 + pan_out1 = torch.cat( + [self.jian1(rurrent_pan_out1), + self.jian1(support_pan_out1)], + dim=1) + rurrent_pan_out1 + pan_out0 = torch.cat( + [self.jian0(rurrent_pan_out0), + self.jian0(support_pan_out0)], + dim=1) + rurrent_pan_out0 + + outputs = (pan_out2, pan_out1, pan_out0) + + buffer_ = (rurrent_pan_out2, rurrent_pan_out1, rurrent_pan_out0) + + return outputs, buffer_ + + def forward(self, input, buffer=None, mode='off_pipe'): + + if mode == 'off_pipe': + # Glops caculate mode + if input.size()[1] == 3: + input = torch.cat([input, input], dim=1) + output = self.off_forward(input) + # offline train mode + elif input.size()[1] == 6: + output = self.off_forward(input) + + return output + + elif mode == 'on_pipe': + # online star state + if buffer is None: + output, buffer_ = self.online_forward(input, node='star') + # online inference + else: + assert len(buffer) == 3 + assert input.size()[1] == 3 + output, buffer_ = self.online_forward( + input, buffer=buffer, node='buffer') + + return output, buffer_ diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py index fd15c1c1..88bd55c7 100644 --- a/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/network_blocks.py @@ -1,5 +1,4 @@ # The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX - import torch import torch.nn as nn diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py 
b/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py new file mode 100644 index 00000000..b3ec3504 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/streamyolo.py @@ -0,0 +1,41 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch.nn as nn + +from .dfp_pafpn import DFPPAFPN +from .tal_head import TALHead + + +class StreamYOLO(nn.Module): + """ + YOLOX model module. The module list is defined by create_yolov3_modules function. + The network returns loss values from three YOLO layers during training + and detection results during test. + """ + + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is None: + backbone = DFPPAFPN() + if head is None: + head = TALHead(20) + + self.backbone = backbone + self.head = head + + def forward(self, x, targets=None, buffer=None, mode='off_pipe'): + # fpn output content features of [dark3, dark4, dark5] + assert mode in ['off_pipe', 'on_pipe'] + + if mode == 'off_pipe': + fpn_outs = self.backbone(x, buffer=buffer, mode='off_pipe') + if self.training: + pass + else: + outputs = self.head(fpn_outs, imgs=x) + + return outputs + elif mode == 'on_pipe': + fpn_outs, buffer_ = self.backbone(x, buffer=buffer, mode='on_pipe') + outputs = self.head(fpn_outs) + + return outputs, buffer_ diff --git a/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py b/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py new file mode 100644 index 00000000..7a82f8c6 --- /dev/null +++ b/modelscope/models/cv/realtime_object_detection/yolox/models/tal_head.py @@ -0,0 +1,170 @@ +# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .network_blocks import BaseConv, DWConv + + +class TALHead(nn.Module): + + def __init__( + self, + num_classes, + width=1.0, + strides=[8, 16, 32], + in_channels=[256, 512, 1024], + act='silu', + depthwise=False, + gamma=1.5, + ignore_thr=0.2, + ignore_value=0.2, + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): wheather apply depthwise conv in conv branch. Defalut value: False. 
+ """ + super().__init__() + + self.gamma = gamma + self.ignore_thr = ignore_thr + self.ignore_value = ignore_value + + self.n_anchors = 1 + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + )) + self.cls_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.reg_convs.append( + nn.Sequential(*[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ])) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * self.num_classes, + kernel_size=1, + stride=1, + padding=0, + )) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + )) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * 1, + kernel_size=1, + stride=1, + padding=0, + )) + + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + self.expanded_strides = [None] * len(in_channels) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin)): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + pass + + else: + output = torch.cat( + [reg_output, + obj_output.sigmoid(), + cls_output.sigmoid()], 1) + + outputs.append(output) + + if self.training: + pass + else: + self.hw = [x.shape[-2:] for x in outputs] + outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], + dim=2).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + outputs[..., :2] = (outputs[..., :2] + grids) * strides + outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + return outputs diff --git a/modelscope/outputs.py b/modelscope/outputs.py index c08779b4..a49ddacf 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -165,6 +165,32 @@ TASK_OUTPUTS = { Tasks.image_object_detection: [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], + # video object 
detection result for single sample + # { + + # "scores": [[0.8, 0.25, 0.05, 0.05], [0.9, 0.1, 0.05, 0.05]] + # "labels": [["person", "traffic light", "car", "bus"], + # ["person", "traffic light", "car", "bus"]] + # "boxes": + # [ + # [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ], + # [ + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # [x1, y1, x2, y2], + # ] + # ], + + # } + Tasks.video_object_detection: + [OutputKeys.SCORES, OutputKeys.LABELS, OutputKeys.BOXES], + # instance segmentation result for single sample # { # "scores": [0.9, 0.1, 0.05, 0.05], @@ -676,8 +702,9 @@ TASK_OUTPUTS = { # "text_embedding": np.array with shape [1, D], # "similarity": float # } - Tasks.multi_modal_similarity: - [OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES], + Tasks.multi_modal_similarity: [ + OutputKeys.IMG_EMBEDDING, OutputKeys.TEXT_EMBEDDING, OutputKeys.SCORES + ], # VQA result for a sample # {"text": "this is a text answser. "} diff --git a/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py new file mode 100644 index 00000000..3686c50a --- /dev/null +++ b/modelscope/pipelines/cv/realtime_video_object_detection_pipeline.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os.path as osp +from typing import Any, Dict, List, Union + +import cv2 +import json +import numpy as np +import torch +from PIL import Image +from torchvision import transforms + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.realtime_object_detection import \ + RealtimeVideoDetector +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Input, Model, Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import load_image +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_object_detection, + module_name=Pipelines.realtime_video_object_detection) +class RealtimeVideoObjectDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + super().__init__(model=model, **kwargs) + self.model = RealtimeVideoDetector(model) + + def preprocess(self, input: Input) -> Dict[Tensor, Union[str, np.ndarray]]: + return input + + def forward(self, input: Input) -> Dict[Tensor, Dict[str, np.ndarray]]: + self.video_path = input + # Processing the whole video and return results for each frame + forward_output = self.model.inference_video(self.video_path) + return {'forward_output': forward_output} + + def postprocess(self, input: Dict[Tensor, Dict[str, np.ndarray]], + **kwargs) -> str: + forward_output = input['forward_output'] + + scores, boxes, labels = [], [], [] + for result in forward_output: + box, score, label = result + scores.append(score) + boxes.append(box) + labels.append(label) + + return { + OutputKeys.BOXES: boxes, + OutputKeys.SCORES: scores, + OutputKeys.LABELS: labels, + } diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 9e10e802..0eb369da 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -38,6 +38,7 @@ class CVTasks(object): image_classification_dailylife = 'image-classification-dailylife' image_object_detection = 'image-object-detection' + video_object_detection = 'video-object-detection' 
image_segmentation = 'image-segmentation' semantic_segmentation = 'semantic-segmentation' diff --git a/modelscope/utils/cv/image_utils.py b/modelscope/utils/cv/image_utils.py index 2d420892..34dc2348 100644 --- a/modelscope/utils/cv/image_utils.py +++ b/modelscope/utils/cv/image_utils.py @@ -231,6 +231,66 @@ def show_video_tracking_result(video_in_path, bboxes, video_save_path): cap.release() +def show_video_object_detection_result(video_in_path, bboxes_list, labels_list, + video_save_path): + + PALETTE = { + 'person': [128, 0, 0], + 'bicycle': [128, 128, 0], + 'car': [64, 0, 0], + 'motorcycle': [0, 128, 128], + 'bus': [64, 128, 0], + 'truck': [192, 128, 0], + 'traffic light': [64, 0, 128], + 'stop sign': [192, 0, 128], + } + from tqdm import tqdm + import math + cap = cv2.VideoCapture(video_in_path) + with tqdm(total=len(bboxes_list)) as pbar: + pbar.set_description( + 'Writing results to video: {}'.format(video_save_path)) + for i in range(len(bboxes_list)): + bboxes = bboxes_list[i].astype(int) + labels = labels_list[i] + success, frame = cap.read() + if success is False: + raise Exception(video_in_path, + ' can not be correctly decoded by OpenCV.') + if i == 0: + size = (frame.shape[1], frame.shape[0]) + fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G') + video_writer = cv2.VideoWriter(video_save_path, fourcc, + cap.get(cv2.CAP_PROP_FPS), size, + True) + + FONT_SCALE = 1e-3 # Adjust for larger font size in all images + THICKNESS_SCALE = 1e-3 # Adjust for larger thickness in all images + TEXT_Y_OFFSET_SCALE = 1e-2 # Adjust for larger Y-offset of text and bounding box + H, W, _ = frame.shape + zeros_mask = np.zeros((frame.shape)).astype(np.uint8) + for bbox, l in zip(bboxes, labels): + cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), + PALETTE[l], 1) + cv2.putText( + frame, + l, (bbox[0], bbox[1] - int(TEXT_Y_OFFSET_SCALE * H)), + fontFace=cv2.FONT_HERSHEY_TRIPLEX, + fontScale=min(H, W) * FONT_SCALE, + thickness=math.ceil(min(H, W) * THICKNESS_SCALE), + color=PALETTE[l]) + zeros_mask = cv2.rectangle( + zeros_mask, (bbox[0], bbox[1]), (bbox[2], bbox[3]), + color=PALETTE[l], + thickness=-1) + + frame = cv2.addWeighted(frame, 1., zeros_mask, .65, 0) + video_writer.write(frame) + pbar.update(1) + video_writer.release + cap.release() + + def panoptic_seg_masks_to_image(masks): draw_img = np.zeros([masks[0].shape[0], masks[0].shape[1], 3]) from mmdet.core.visualization.palette import get_palette diff --git a/tests/pipelines/test_realtime_video_object_detection.py b/tests/pipelines/test_realtime_video_object_detection.py new file mode 100644 index 00000000..d65313a3 --- /dev/null +++ b/tests/pipelines/test_realtime_video_object_detection.py @@ -0,0 +1,46 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
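The visualization helper above draws label-colored boxes twice: as thin outlines on the frame and as filled rectangles on a separate mask, which is then blended in with cv2.addWeighted to produce translucent fills. A self-contained sketch of that blending step on a synthetic frame (the color and blend weights mirror the helper; the geometry is arbitrary):

import cv2
import numpy as np

frame = np.full((240, 320, 3), 255, dtype=np.uint8)     # blank white frame
mask = np.zeros_like(frame)
cv2.rectangle(frame, (40, 40), (160, 120), color=(128, 0, 0), thickness=1)    # thin outline on the frame
cv2.rectangle(mask, (40, 40), (160, 120), color=(128, 0, 0), thickness=-1)    # filled box on the mask
blended = cv2.addWeighted(frame, 1.0, mask, 0.65, 0)    # translucent fill over the frame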
+import unittest + +import cv2 +import numpy as np + +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.pipelines.base import Pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import show_video_object_detection_result +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.logger import get_logger +from modelscope.utils.test_utils import test_level + +logger = get_logger() + + +class RealtimeVideoObjectDetectionTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.model_id = 'damo/cv_cspnet_video-object-detection_streamyolo' + self.test_video = 'data/test/videos/test_realtime_vod.mp4' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_modelhub(self): + realtime_video_object_detection = pipeline( + Tasks.video_object_detection, model=self.model_id) + result = realtime_video_object_detection(self.test_video) + if result: + logger.info('Video output to test_vod_results.avi') + show_video_object_detection_result(self.test_video, + result[OutputKeys.BOXES], + result[OutputKeys.LABELS], + 'test_vod_results.avi') + else: + raise ValueError('process error') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main() From e3eb01f4cee8c39a66c797cfdf8e29917011424a Mon Sep 17 00:00:00 2001 From: "zhangzhicheng.zzc" Date: Mon, 17 Oct 2022 23:31:44 +0800 Subject: [PATCH 53/57] [to #42322933]update word-segmentation regression results Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10432186 --- data/test/regression/sbert_ws_en.bin | 4 ++-- data/test/regression/sbert_ws_zh.bin | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/test/regression/sbert_ws_en.bin b/data/test/regression/sbert_ws_en.bin index 4eb562d6..6e441f7f 100644 --- a/data/test/regression/sbert_ws_en.bin +++ b/data/test/regression/sbert_ws_en.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572 -size 60801 +oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c +size 61239 diff --git a/data/test/regression/sbert_ws_zh.bin b/data/test/regression/sbert_ws_zh.bin index 555f640d..b1841351 100644 --- a/data/test/regression/sbert_ws_zh.bin +++ b/data/test/regression/sbert_ws_zh.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c -size 60801 +oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1 +size 61115 From 2eb835aca489f5b7dcdfb6199d34f1bfc85f6d7c Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Tue, 18 Oct 2022 11:12:12 +0800 Subject: [PATCH 54/57] [to #42322933]Add uuid to model which created by ut test Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10434107 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10434107 * [Update] update finetune_result_upload * [Update] rename finetune_result_upload to model_dir_upload * Merge branch 'master' into feat/upload_ckpt * Merge branch 'master' into feat/upload_ckpt * [Fix] fix import error * [Fix] fix import error * Merge branch 'master' into feat/upload_ckpt * [Update] changes name to upload_folder and using tempfile to save repo * Merge branch 'master' into feat/upload_ckpt * [Fix] 
fix commit * Merge branch 'master' into feat/upload_ckpt * [Fix] fix format * Merge branch 'master' into feat/upload_ckpt * [Fix] add uuid after model created from upload ut --- tests/hub/test_hub_upload.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/hub/test_hub_upload.py b/tests/hub/test_hub_upload.py index d7e6e439..2250164b 100644 --- a/tests/hub/test_hub_upload.py +++ b/tests/hub/test_hub_upload.py @@ -3,6 +3,7 @@ import os import shutil import tempfile import unittest +import uuid from modelscope.hub.api import HubApi from modelscope.hub.constants import Licenses, ModelVisibility @@ -23,7 +24,9 @@ class HubUploadTest(unittest.TestCase): self.api = HubApi() self.user = os.environ.get('TEST_MODEL_ORG', 'citest') logger.info(self.user) - self.create_model_name = '%s/%s' % (self.user, 'test_model_upload') + self.create_model_name = '%s/%s_%s' % (self.user, 'test_model_upload', + uuid.uuid4().hex) + logger.info('create %s' % self.create_model_name) temporary_dir = tempfile.mkdtemp() self.work_dir = temporary_dir self.model_dir = os.path.join(temporary_dir, self.create_model_name) From c0b546a96eaaaaef2e9ab1bf32b1abe9092d33e1 Mon Sep 17 00:00:00 2001 From: "huizheng.hz" Date: Tue, 18 Oct 2022 14:34:26 +0800 Subject: [PATCH 55/57] [to #42322933]add subset_name when loading dataset (NAFNet image denoising) Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10427797 --- tests/trainers/test_image_denoise_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/trainers/test_image_denoise_trainer.py b/tests/trainers/test_image_denoise_trainer.py index 0bcb8930..68ddf616 100644 --- a/tests/trainers/test_image_denoise_trainer.py +++ b/tests/trainers/test_image_denoise_trainer.py @@ -33,11 +33,13 @@ class ImageDenoiseTrainerTest(unittest.TestCase): dataset_train = MsDataset.load( 'SIDD', namespace='huizheng', + subset_name='default', split='validation', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds dataset_val = MsDataset.load( 'SIDD', namespace='huizheng', + subset_name='default', split='test', download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS)._hf_ds self.dataset_train = SiddImageDenoisingDataset( From 3b1f1a0252d4fee7ecd15ac8dc7c04ec0535add0 Mon Sep 17 00:00:00 2001 From: "hemu.zp" Date: Tue, 18 Oct 2022 15:58:33 +0800 Subject: [PATCH 56/57] [to #42322933] Add GPT3 tensor parallel inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加基于 Megatron-v3 的 GPT3 tensor 并行的推理代码 复用 DistributedPipeline 与 megatron-util 适用模型:1.3B/2.7B/13B 参数的 GPT-3 预训练生成模型 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10416721 --- modelscope/metainfo.py | 2 + modelscope/models/nlp/gpt3/__init__.py | 2 + .../models/nlp/gpt3/configuration_gpt3.py | 88 +- .../models/nlp/gpt3/distributed_gpt3.py | 1057 +++++++++++++++++ modelscope/models/nlp/gpt3/modeling_gpt3.py | 54 +- modelscope/models/nlp/gpt3/tokenizer_gpt3.py | 69 ++ .../nlp/distributed_gpt3_pipeline.py | 54 + modelscope/preprocessors/__init__.py | 2 + modelscope/preprocessors/nlp/__init__.py | 2 + modelscope/preprocessors/nlp/nlp_base.py | 35 + modelscope/utils/nlp/distributed.py | 5 +- tests/pipelines/test_gpt3_text_generation.py | 58 + 12 files changed, 1387 insertions(+), 41 deletions(-) create mode 100644 modelscope/models/nlp/gpt3/distributed_gpt3.py create mode 100644 modelscope/models/nlp/gpt3/tokenizer_gpt3.py create mode 100644 modelscope/pipelines/nlp/distributed_gpt3_pipeline.py create mode 100644 
tests/pipelines/test_gpt3_text_generation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index e4a26303..2dbff948 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -227,6 +227,7 @@ class Pipelines(object): zero_shot_classification = 'zero-shot-classification' text_error_correction = 'text-error-correction' plug_generation = 'plug-generation' + gpt3_generation = 'gpt3-generation' faq_question_answering = 'faq-question-answering' conversational_text_to_sql = 'conversational-text-to-sql' table_question_answering_pipeline = 'table-question-answering-pipeline' @@ -324,6 +325,7 @@ class Preprocessors(object): bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer' text_gen_tokenizer = 'text-gen-tokenizer' text2text_gen_preprocessor = 'text2text-gen-preprocessor' + text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer' text2text_translate_preprocessor = 'text2text-translate-preprocessor' token_cls_tokenizer = 'token-cls-tokenizer' ner_tokenizer = 'ner-tokenizer' diff --git a/modelscope/models/nlp/gpt3/__init__.py b/modelscope/models/nlp/gpt3/__init__.py index 076a0c6b..9cae8cc8 100644 --- a/modelscope/models/nlp/gpt3/__init__.py +++ b/modelscope/models/nlp/gpt3/__init__.py @@ -7,11 +7,13 @@ if TYPE_CHECKING: from .configuration_gpt3 import GPT3Config from .modeling_gpt3 import GPT3Model from .gpt3_for_text_generation import GPT3ForTextGeneration + from .tokenizer_gpt3 import JiebaBPETokenizer else: _import_structure = { 'configuration_gpt3': ['GPT3Config'], 'modeling_gpt3': ['GPT3Model'], 'gpt3_for_text_generation': ['GPT3ForTextGeneration'], + 'tokenizer_gpt3': ['JiebaBPETokenizer'], } import sys diff --git a/modelscope/models/nlp/gpt3/configuration_gpt3.py b/modelscope/models/nlp/gpt3/configuration_gpt3.py index d5a054fd..66e8b836 100644 --- a/modelscope/models/nlp/gpt3/configuration_gpt3.py +++ b/modelscope/models/nlp/gpt3/configuration_gpt3.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
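# --- Illustrative sketch (hypothetical values, not part of the committed diff) ---
# The GPT3Config changes below add Megatron-style parallelism and generation
# fields. A minimal usage sketch, assuming the defaults defined below:
#
#     from modelscope.models.nlp.gpt3 import GPT3Config
#
#     cfg = GPT3Config(hidden_size=2048, num_attention_heads=32, fp16=True)
#     cfg.ffn_hidden_size   # 8192  -- defaults to 4 * hidden_size
#     cfg.kv_channels       # 64    -- hidden_size // num_attention_heads
#     cfg.params_dtype      # torch.float16, because fp16=True and bf16 is unset
# ----------------------------------------------------------------------------------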
+import torch from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -21,25 +22,48 @@ logger = logging.get_logger(__name__) class GPT3Config(PretrainedConfig): - model_type = 'gpt' + model_type = 'gpt3' - def __init__(self, - vocab_size=25600, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act='gelu', - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=2048, - type_vocab_size=2, - layernorm_epsilon=1e-12, - **kwargs): + def __init__( + self, + vocab_size=25600, + hidden_size=768, + ffn_hidden_size=None, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=2048, + type_vocab_size=2, + layernorm_epsilon=1e-12, + bias_gelu_fusion=True, + fp32_residual_connection=False, + sequence_parallel=False, + fp16=False, + bf16=False, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=False, + kv_channels=None, + masked_softmax_fusion=True, + attention_dropout=0.1, + bias_dropout_fusion=True, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.1, + init_method_std=0.02, + # generate + eod_id=7, + tokens_to_generate=100, + top_k=0, + top_p=0.9, + **kwargs): super().__init__(layer_norm_eps=layernorm_epsilon, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size + self.ffn_hidden_size = 4 * hidden_size \ + if ffn_hidden_size is None else ffn_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act @@ -49,3 +73,39 @@ class GPT3Config(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.layernorm_epsilon = layernorm_epsilon + self.bias_gelu_fusion = bias_gelu_fusion + self.fp32_residual_connection = fp32_residual_connection + self.sequence_parallel = sequence_parallel + self.fp16 = fp16 + self.bf16 = bf16 + assert not (fp16 and bf16) + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + if kv_channels is None: + assert hidden_size % num_attention_heads == 0 + self.kv_channels = hidden_size // num_attention_heads + self.masked_softmax_fusion = masked_softmax_fusion + self.attention_dropout = attention_dropout + self.bias_dropout_fusion = bias_dropout_fusion + self.apply_residual_connection_post_layernorm = \ + apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.init_method_std = init_method_std + self.eod_id = eod_id + self.tokens_to_generate = tokens_to_generate + self.top_k = top_k + self.top_p = top_p + + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + self.no_persist_layer_norm = \ + TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11) + + @property + def params_dtype(self): + if self.fp16: + return torch.half + elif self.bf16: + return torch.bfloat16 + else: + return torch.float diff --git a/modelscope/models/nlp/gpt3/distributed_gpt3.py b/modelscope/models/nlp/gpt3/distributed_gpt3.py new file mode 100644 index 00000000..a0091259 --- /dev/null +++ b/modelscope/models/nlp/gpt3/distributed_gpt3.py @@ -0,0 +1,1057 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from megatron import mpu +from megatron.global_vars import get_global_memory_buffer, set_global_variables +from megatron.model import (AttnMaskType, Float16Module, LayerNorm, + bias_gelu_impl) +from megatron.model.fused_softmax import FusedScaleMaskSoftmax +from torch import nn +from torch.nn import functional as F +from transformers.modeling_utils import PreTrainedModel + +from modelscope.models import TorchModel +from modelscope.models.nlp.gpt3 import GPT3Config +from modelscope.utils.nlp.distributed import initialize_distributed +from modelscope.utils.nlp.load_checkpoint import pre_load +from modelscope.utils.torch_utils import set_random_seed_mpu + + +class GPT3ParallelMLP(nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config, init_method, output_layer_init_method): + super().__init__() + + # Project to 4h. + self.dense_h_to_4h = mpu.ColumnParallelLinearV3( + config, + config.hidden_size, + config.ffn_hidden_size, + gather_output=False, + init_method=init_method, + skip_bias_add=True) + + self.bias_gelu_fusion = config.bias_gelu_fusion + self.activation_func = F.gelu + + # Project back to h. + self.dense_4h_to_h = mpu.RowParallelLinearV3( + config, + config.ffn_hidden_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True) + + def forward(self, hidden_states): + + # [s, b, 4hp] + intermediate_parallel, bias_parallel = self.dense_h_to_4h( + hidden_states) + + if self.bias_gelu_fusion: + intermediate_parallel = \ + bias_gelu_impl(intermediate_parallel, bias_parallel) + else: + intermediate_parallel = \ + self.activation_func(intermediate_parallel + bias_parallel) + + # [s, b, h] + output, output_bias = self.dense_4h_to_h(intermediate_parallel) + return output, output_bias + + +class GPT3Embedding(nn.Module): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + init_method: weight initialization method + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, config, init_method): + super().__init__() + + self.hidden_size = config.hidden_size + self.init_method = init_method + + # Word embeddings (parallel). + self.word_embeddings = mpu.VocabParallelEmbedding( + config.vocab_size, self.hidden_size, init_method=self.init_method) + + # Position embedding (serial). + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + self.hidden_size) + # Initialize the position embeddings. 
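        # Note the split above: the vocabulary table is sharded across
        # tensor-parallel ranks (VocabParallelEmbedding), while the position
        # table is a plain nn.Embedding replicated on every rank; the call
        # below applies the shared init method to that replicated table.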
+ self.init_method(self.position_embeddings.weight) + + self.fp32_residual_connection = config.fp32_residual_connection + self.sequence_parallel = config.sequence_parallel + # Embeddings dropout + self.embedding_dropout = nn.Dropout(config.hidden_dropout) + + def zero_parameters(self): + """Zero out all parameters in embedding.""" + self.word_embeddings.weight.data.fill_(0) + self.word_embeddings.weight.shared = True + self.position_embeddings.weight.data.fill_(0) + self.position_embeddings.weight.shared = True + + def forward(self, input_ids, position_ids): + # Embeddings. + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + + # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. + embeddings = embeddings.transpose(0, 1).contiguous() + + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.float() + + # Dropout. + if self.sequence_parallel: + embeddings = mpu.scatter_to_sequence_parallel_region(embeddings) + with mpu.get_cuda_rng_tracker().fork(): + embeddings = self.embedding_dropout(embeddings) + else: + embeddings = self.embedding_dropout(embeddings) + return embeddings + + +class NoopTransformerLayer(nn.Module): + + def __init__(self, layer_number): + super().__init__() + self.layer_number = layer_number + + def forward(self, + hidden_states, + attention_mask, + encoder_output=None, + enc_dec_attn_mask=None, + inference_params=None): + return hidden_states.clone() + + +def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + +class GPT3CoreAttention(nn.Module): + + def __init__(self, + config, + layer_number, + attn_mask_type=AttnMaskType.padding): + super().__init__() + self.fp16 = config.fp16 + self.bf16 = config.bf16 + + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.attn_mask_type = attn_mask_type + self.sequence_parallel = config.sequence_parallel + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_partition = mpu.divide(projection_size, + world_size) + self.hidden_size_per_attention_head = mpu.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( + config.num_attention_heads, world_size) + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + + self.scale_mask_softmax = FusedScaleMaskSoftmax( + self.fp16, self.bf16, self.attn_mask_type, + config.masked_softmax_fusion, attention_mask_func, + self.attention_softmax_in_fp32, coeff) + + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), + query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], + output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], + output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), + query_layer.dtype, 'mpu') + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor)) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs = self.scale_mask_softmax(attention_scores, + attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.sequence_parallel: + with mpu.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), + query_layer.size(0), value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class GPT3ParallelAttention(nn.Module): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config, init_method, output_layer_init_method, + layer_number): + super().__init__() + self.layer_number = max(1, layer_number) + self.params_dtype = config.params_dtype + + projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_model_parallel_world_size() + self.hidden_size_per_attention_head = mpu.divide( + projection_size, config.num_attention_heads) + self.num_attention_heads_per_partition = mpu.divide( + config.num_attention_heads, world_size) + + # Strided linear layer. 
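        # Worked example of the partitioning above (hypothetical sizes): with
        # hidden_size=2048, num_attention_heads=32 and a tensor-parallel world
        # size of 8, projection_size is 2048, each head is 64-dim and each rank
        # owns 4 heads, so the fused QKV layer built below produces
        # 3 * 4 * 64 = 768 output columns per rank (6144 across all ranks).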
+ self.query_key_value = mpu.ColumnParallelLinearV3( + config, + config.hidden_size, + 3 * projection_size, + gather_output=False, + init_method=init_method) + + self.core_attention = GPT3CoreAttention(config, self.layer_number) + + # Output. + self.dense = mpu.RowParallelLinearV3( + config, + projection_size, + config.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + skip_bias_add=True) + + def _allocate_memory(self, inference_max_sequence_len, batch_size): + return torch.empty( + inference_max_sequence_len, + batch_size, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + dtype=self.params_dtype, + device=torch.cuda.current_device()) + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [sq, b, h] + + # ================================================= + # Pre-allocate memory for key-values for inference. + # ================================================= + if inference_params: + if self.layer_number not in inference_params.key_value_memory_dict: + inf_max_seq_len = inference_params.max_sequence_len + inf_max_batch_size = inference_params.max_batch_size + inference_key_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_value_memory = self._allocate_memory( + inf_max_seq_len, inf_max_batch_size) + inference_params.key_value_memory_dict[self.layer_number] = ( + inference_key_memory, inference_value_memory) + else: + inference_key_memory, inference_value_memory = \ + inference_params.key_value_memory_dict[self.layer_number] + + # ===================== + # Query, Key, and Value + # ===================== + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, + value_layer) = mpu.split_tensor_along_last_dim(mixed_x_layer, 3) + + # ================================== + # Adjust key and value for inference + # ================================== + + if inference_params: + batch_start = inference_params.batch_size_offset + batch_end = batch_start + key_layer.size(1) + assert batch_end <= inference_key_memory.size(1) + sequence_start = inference_params.sequence_len_offset + sequence_end = sequence_start + key_layer.size(0) + assert sequence_end <= inference_key_memory.size(0) + # Copy key and values. + inference_key_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = key_layer + inference_value_memory[sequence_start:sequence_end, + batch_start:batch_end, ...] = value_layer + key_layer = inference_key_memory[:sequence_end, + batch_start:batch_end, ...] + value_layer = inference_value_memory[:sequence_end, + batch_start:batch_end, ...] + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, + value_layer, attention_mask) + + # ================= + # Output. 
[sq, b, h] + # ================= + + output, bias = self.dense(context_layer) + + return output, bias + + +class nullcontext: + + def __init__(self, enter_result=None): + self.enter_result = enter_result + + def __enter__(self): + return self.enter_result + + def __exit__(self, *excinfo): + pass + + +def bias_dropout_add(x, bias, residual, prob, training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + out = torch.nn.functional.dropout(x + bias, p=prob, training=training) + out = residual + out + return out + + +def get_bias_dropout_add(training): + + def _bias_dropout_add(x, bias, residual, prob): + return bias_dropout_add(x, bias, residual, prob, training) + + return _bias_dropout_add + + +@torch.jit.script +def bias_dropout_add_fused_train(x: torch.Tensor, bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, True) + + +@torch.jit.script +def bias_dropout_add_fused_inference(x: torch.Tensor, bias: torch.Tensor, + residual: torch.Tensor, + prob: float) -> torch.Tensor: + return bias_dropout_add(x, bias, residual, prob, False) + + +class GPT3ParallelTransformerLayer(nn.Module): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config, init_method, output_layer_init_method, + layer_number): + + super().__init__() + self.layer_number = layer_number + + self.apply_residual_connection_post_layernorm \ + = config.apply_residual_connection_post_layernorm + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + + # Layernorm on the input data. + self.input_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + # Self attention. + self.self_attention = GPT3ParallelAttention(config, init_method, + output_layer_init_method, + layer_number) + self.hidden_dropout = config.hidden_dropout + self.bias_dropout_fusion = config.bias_dropout_fusion + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + # MLP + self.mlp = GPT3ParallelMLP(config, init_method, + output_layer_init_method) + + # Set bias+dropout+add fusion grad_enable execution handler. + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + TORCH_MINOR = int(torch.__version__.split('.')[1]) + use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 + and TORCH_MINOR >= 10) + self.bias_dropout_add_exec_handler = \ + nullcontext if use_nvfuser else torch.enable_grad + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + # Self attention. + attention_output, attention_bias = \ + self.self_attention( + layernorm_output, + attention_mask, + inference_params=inference_params) + # Residual connection. 
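        # In the default pre-LN configuration this layer computes
        #   y   = x + dropout(Attention(LN1(x)) + bias)
        #   out = y + dropout(MLP(LN2(y)) + bias)
        # With apply_residual_connection_post_layernorm=True the residual is
        # taken from the layer-norm output instead, which is what the branch
        # below selects.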
+ if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + if self.bias_dropout_fusion: + if self.training: + bias_dropout_add_func = bias_dropout_add_fused_train + else: + bias_dropout_add_func = bias_dropout_add_fused_inference + else: + bias_dropout_add_func = get_bias_dropout_add(self.training) + + with self.bias_dropout_add_exec_handler(): + layernorm_input = bias_dropout_add_func( + attention_output, attention_bias.expand_as(residual), residual, + self.hidden_dropout) + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output, mlp_bias = self.mlp(layernorm_output) + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + with self.bias_dropout_add_exec_handler(): + output = bias_dropout_add_func(mlp_output, + mlp_bias.expand_as(residual), + residual, self.hidden_dropout) + + # Jit compiled function creates 'view' tensor. This tensor + # potentially gets saved in the MPU checkpoint function context, + # which rejects view tensors. While making a viewless tensor here + # won't result in memory savings (like the data loader, or + # p2p_communication), it serves to document the origin of this + # 'view' tensor. + output = mpu.make_viewless_tensor( + inp=output, requires_grad=output.requires_grad, keep_graph=True) + + return output + + +class GPT3ParallelTransformer(nn.Module): + """Transformer class.""" + + def __init__(self, + config, + init_method, + output_layer_init_method, + post_layer_norm=True, + pre_process=True, + post_process=True): + super().__init__() + + self.bf16 = config.bf16 + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = post_layer_norm + self.pre_process = pre_process + self.post_process = post_process + self.input_tensor = None + + self.sequence_parallel = config.sequence_parallel + + # Number of layers. + self.num_layers = config.num_hidden_layers + + # Transformer layers. + def build_layer(layer_number): + return GPT3ParallelTransformerLayer(config, init_method, + output_layer_init_method, + layer_number) + + if self.num_layers == 0: + self.num_layers = 1 + self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)]) + else: + self.layers = torch.nn.ModuleList( + [build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + # Final layer norm before output. + self.final_layernorm = LayerNorm( + config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm, + sequence_parallel=config.sequence_parallel) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + def forward(self, hidden_states, attention_mask, inference_params=None): + # hidden_states: [s, b, h] + + if not self.pre_process: + # See set_input_tensor() + hidden_states = self.input_tensor + + # Viewless tensor. + # - We only need to create a viewless tensor in the case of micro batch + # size (mbs) == 1, since in this case, 'hidden_states.transpose()' + # above creates a view tensor, and '.contiguous()' is a pass-through. + # For mbs >= 2, '.contiguous()' creates a new tensor, eliminating + # the need to make it viewless. + # + # However, we don't explicitly check mbs == 1 here because + # make_viewless_tensor() has negligible overhead when its input + # is already viewless. 
+ # + # - For the 'else' case above, calling make_viewless_tensor() here is + # likely redundant, since p2p_communication.py (likely originator) + # already creates viewless tensors. That said, make_viewless_tensor() + # is called here to be future-proof and corner-case-proof. + hidden_states = mpu.make_viewless_tensor( + hidden_states, + requires_grad=True, + keep_graph=True, + ) + + if self.sequence_parallel: + rng_context = mpu.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + + with rng_context: + # Forward pass. + for index in range(self.num_layers): + layer = self._get_layer(index) + hidden_states = layer( + hidden_states, + attention_mask, + inference_params=inference_params) + + # Final layer norm. + if self.post_process and self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states + + +class GPT3TransformerLanguageModel(nn.Module): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + embedding_dropout_prob: dropout probability for embeddings + num_tokentypes: size of the token-type embeddings. 0 value + will ignore this embedding + """ + + def __init__(self, config, init_method, output_layer_init_method): + super().__init__() + + self.hidden_size = config.hidden_size + self.init_method = init_method + self.encoder_hidden_state = None + + # Embeddings. + self.embedding = GPT3Embedding(config, self.init_method) + + # Transformer. + self.encoder = GPT3ParallelTransformer( + config, + self.init_method, + output_layer_init_method, + ) + + def forward(self, + enc_input_ids, + enc_position_ids, + enc_attn_mask, + inference_params=None, + enc_hidden_states=None): + + # Encoder embedding. + encoder_input = self.embedding(enc_input_ids, enc_position_ids) + + # Run encoder. 
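        # Everything past the embedding runs in Megatron's sequence-first
        # layout [s, b, h] (the embedding layer transposed from [b, s, h]
        # above). The block below either runs the transformer stack or, when
        # precomputed hidden states are supplied, substitutes them directly.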
+ if enc_hidden_states is None: + if self.encoder is not None: + encoder_output = self.encoder( + encoder_input, + enc_attn_mask, + inference_params=inference_params) + else: + encoder_output = self.encoder_hidden_state + else: + encoder_output = enc_hidden_states.to(encoder_input.dtype) + + return encoder_output + + +def init_method_normal(sigma): + """Init method based on N(0, sigma).""" + + def init_(tensor): + return nn.init.normal_(tensor, mean=0.0, std=sigma) + + return init_ + + +def scaled_init_method_normal(sigma, num_layers): + """Init method based on N(0, sigma/sqrt(2*num_layers).""" + std = sigma / math.sqrt(2.0 * num_layers) + + def init_(tensor): + return nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + + +class GPT3Model(PreTrainedModel): + + config_class = GPT3Config + + def __init__(self, config, parallel_output=False): + super().__init__(config) + + self.parallel_output = parallel_output + + self.language_model = GPT3TransformerLanguageModel( + config, init_method_normal(config.init_method_std), + scaled_init_method_normal(config.init_method_std, + config.num_hidden_layers)) + + def word_embeddings_weight(self): + return self.language_model.embedding.word_embeddings.weight + + @staticmethod + def build_attention_mask_and_position_ids(tokens): + seq_length = tokens.size(1) + attention_mask = torch.tril( + torch.ones((1, 1, seq_length, seq_length), + dtype=torch.long, + device=tokens.device)) + attention_mask = (attention_mask < 0.5) + + position_ids = torch.arange( + seq_length, dtype=torch.long, device=tokens.device) + position_ids = position_ids.unsqueeze(0).expand_as(tokens) + + return attention_mask, position_ids + + def forward(self, + input_ids, + attention_mask=None, + position_ids=None, + inference_params=None, + **kwargs): + if attention_mask is None and position_ids is None: + attention_mask, position_ids = \ + self.build_attention_mask_and_position_ids(input_ids) + + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + inference_params=inference_params) + + logits_parallel = mpu.LinearWithGradAccumulationAndAsyncCommunication.apply( + lm_output, self.word_embeddings_weight(), None, False, True, + self.config.sequence_parallel) + # Gather if needed. + + output = logits_parallel + if not self.parallel_output: + output = mpu.gather_from_model_parallel_region(logits_parallel) + return output.transpose(0, 1).contiguous() + + +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + +def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. 
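    # Clarification of the shift above: after it, a position is masked only
    # when the cumulative probability of the tokens *before* it already
    # exceeds top_p, so the first token that crosses the threshold is kept
    # (the usual nucleus-sampling convention). The line below additionally
    # guarantees that the single most probable token always survives filtering.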
+ filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + +def sample(logits, top_k=0, top_p=0.0, temperature=1.0, vocab_size=None): + """ Sample and generate a token. + Note: logits has the dimension [b, v] where b is the batch size + and v is the vocabulary size. + If vocab_size is provided, we will make sure the sample that is + generated is in [0, vocab-size). This will avoid out of vocabulary + generations due to padding. + """ + + # Check logits for consistency. + assert logits.ndim == 2, 'expected the logits to be of [b, v] shape.' + assert logits.type() == 'torch.cuda.FloatTensor', \ + 'input logits should be floats.' + + # Greedy is just simple argmax. + if top_k == 1: + assert top_p == 0.0, 'cannot set both greedy and top-p samplings.' + samples = torch.argmax(logits, dim=-1) + + # Top-k or top-p sampling. + else: + # Clone so we do not modify the inputs, + logits = logits.clone() + # Apply temperature in place. + if temperature != 1.0: + logits.div_(temperature) + + if top_k > 1: + assert top_p == 0.0, 'cannot set both top-k and top-p samplings.' + assert top_k <= logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(logits, top_k) + + elif top_p > 0.0: + assert top_p <= 1.0, 'top-p should be in (0, 1].' + modify_logits_for_top_p_filtering(logits, top_p) + + # After filtering, we need to recalculate the distribution. + probs = logits.softmax(dim=-1) + samples = torch.multinomial(probs, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in + # in the range [0, vocab-size). + if vocab_size: + samples = torch.clamp(samples, min=0, max=(vocab_size - 1)) + + return samples + + +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + def __init__(self, max_batch_size, max_sequence_len): + """Note that offsets are set to zero and we always set the + flag to allocate memory. After the first call, make sure to + set this flag to False.""" + self.max_sequence_len = max_sequence_len + self.max_batch_size = max_batch_size + self.sequence_len_offset = 0 + self.batch_size_offset = 0 + self.key_value_memory_dict = {} + + def swap_key_value_dict(self, batch_idx): + 'swap between batches' + if len(self.key_value_memory_dict) == 0: + raise ValueError('should not swap when dict in empty') + + for layer_number in self.key_value_memory_dict.keys(): + inference_key_memory, inference_value_memory = self.key_value_memory_dict[ + layer_number] + assert len(batch_idx) == inference_key_memory.shape[ + 1] # make sure batch size is the same + new_inference_key_memory = inference_key_memory[:, batch_idx] + new_inference_value_memory = inference_value_memory[:, batch_idx] + self.key_value_memory_dict[layer_number] = ( + new_inference_key_memory, new_inference_value_memory) + + +class DistributedGPT3(TorchModel): + + def __init__(self, + model_dir, + rank, + path_load_tag='model', + *args, + **kwargs): + super().__init__(model_dir, *args, **kwargs) + initialize_distributed(rank, mpu, kwargs['world_size'], + kwargs['model_parallel_size'], + kwargs['master_ip'], kwargs['master_port']) + seed = 0 if 'seed' not in kwargs else kwargs['seed'] + set_random_seed_mpu(seed) + set_global_variables() + + self.config = GPT3Config.from_pretrained(model_dir) + # Build model. 
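        # Each rank in the tensor-parallel group executes the construction
        # below identically: build the parallel GPT3Model, move it to the
        # rank's GPU, optionally wrap it in Float16Module, then load this
        # rank's own checkpoint shard via pre_load (e.g. the
        # model/mp_rank_0X_model_states.pt files listed in the 13B test
        # added later in this patch).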
+ model = GPT3Model(self.config) + + for param in model.parameters(): + mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param) + + # GPU allocation. + model.cuda(torch.cuda.current_device()) + + # Fp16 conversion. + if self.config.fp16 or self.config.bf16: + model = Float16Module(model, self.config) + + self.dist_model = model + load_model = pre_load(mpu, model_dir, tag=path_load_tag) + self.dist_model.load_state_dict(load_model) + + self.inference_params = None + + def forward_step(self, tokens, attention_mask, position_ids): + logits = self.dist_model( + tokens, + attention_mask, + position_ids, + inference_params=self.inference_params) + self.inference_params.sequence_len_offset += tokens.size(1) + return logits + + def generate(self, + tokens, + temperature=1.0, + use_eod_token_for_early_termination=True, + stop_on_double_eol=False, + stop_on_eol=False): + lengths = torch.tensor([tokens.size(1)], device=tokens.device) + pads = torch.ones( + 1, self.config.tokens_to_generate, + device=tokens.device).long() * self.config.eod_id + tokens = torch.cat((tokens, pads), dim=-1) + + batch_size = tokens.size(0) + min_prompt_length = lengths.min().item() + max_sequence_length = tokens.size(1) + max_sequence_length = min(max_sequence_length, + self.config.max_position_embeddings) + + # If the context is too big, this happens + if min_prompt_length >= max_sequence_length: + raise ValueError('context length + tokens_to_generate too large') + + # Initialize inference parameters. + self.inference_params = InferenceParams(batch_size, + max_sequence_length) + + # Added termination_id to support the case that we want to terminate the + # generation once that id is generated. + termination_id = self.config.eod_id + + # Whether we have reached a termination id. + is_generation_done = torch.zeros( + batch_size, dtype=torch.uint8, device=torch.cuda.current_device()) + + # ============= + # Run infernece + # ============= + + with torch.no_grad(): + attention_mask, position_ids = \ + GPT3Model.build_attention_mask_and_position_ids(tokens) + prev_context_length = 0 + for context_length in range(min_prompt_length, + max_sequence_length): + + # Pick the slice that we need to pass through the network. + tokens2use = tokens[:, prev_context_length:context_length] + positions2use = position_ids[:, prev_context_length: + context_length] + attention_mask2use = attention_mask[ + ..., prev_context_length:context_length, :context_length] + + # logits will be meanigful only in the last pipeline stage. + logits = self.forward_step(tokens2use, attention_mask2use, + positions2use) + + # Sample. + last_token_logits = logits[:, -1, :] + new_sample = sample( + last_token_logits, + top_k=self.config.top_k, + top_p=self.config.top_p, + temperature=temperature, + vocab_size=self.config.vocab_size) + + # If a prompt length is smaller or equal th current context + # length, it means we have started generating tokens + started = lengths <= context_length + # Update the tokens. + tokens[started, context_length] = new_sample[started] + + # Update the context length for the next token generation. 
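                # Incremental decoding: the first pass feeds the whole prompt,
                # after which each step feeds only the token generated since
                # the previous pass, because the per-layer key/value cache in
                # InferenceParams already holds all earlier positions. The
                # assignment below advances the window start accordingly.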
+ prev_context_length = context_length + + # instead tokenization should be in the inference loop so stop sequences can be used + if stop_on_double_eol: + hit_double_eol = (new_sample + == 628).byte() & started.byte() + hit_two_eols = (new_sample == 198).byte() & ( + tokens[:, context_length - 1] + == 198).byte() & started.byte() + done_token = hit_double_eol | hit_two_eols + elif stop_on_eol: + hit_double_eol = (new_sample + == 628).byte() & started.byte() + hit_eol = (new_sample == 198).byte() & started.byte() + done_token = hit_double_eol | hit_eol + else: + done_token = (new_sample == termination_id).byte() & \ + started.byte() + + is_generation_done = is_generation_done | done_token + done = torch.all(is_generation_done) + + if use_eod_token_for_early_termination and done: + break + + tokens = tokens[:, :(context_length + 1)] + return tokens diff --git a/modelscope/models/nlp/gpt3/modeling_gpt3.py b/modelscope/models/nlp/gpt3/modeling_gpt3.py index ade36e36..2c23f5db 100644 --- a/modelscope/models/nlp/gpt3/modeling_gpt3.py +++ b/modelscope/models/nlp/gpt3/modeling_gpt3.py @@ -19,8 +19,7 @@ from typing import Optional, Union import addict import torch -from torch.nn import (CrossEntropyLoss, Dropout, Embedding, LayerNorm, Linear, - Module, Softmax) +from torch import nn from torch.nn import functional as F from transformers.modeling_utils import PreTrainedModel @@ -28,7 +27,7 @@ from modelscope.utils.constant import ModelFile from .configuration_gpt3 import GPT3Config -class GPT3SelfAttention(Module): +class GPT3SelfAttention(nn.Module): """Parallel self-attention layer abstract class. Self-attention layer takes input with size [s, b, h] @@ -44,13 +43,15 @@ class GPT3SelfAttention(Module): self.hidden_size_per_attention_head = \ self.hidden_size // self.num_attention_heads - self.query_key_value = Linear(self.hidden_size, 3 * self.hidden_size) - self.softmax = Softmax(dim=-1) - self.attention_dropout = Dropout(config.attention_probs_dropout_prob) + self.query_key_value = nn.Linear(self.hidden_size, + 3 * self.hidden_size) + self.softmax = nn.Softmax(dim=-1) + self.attention_dropout = nn.Dropout( + config.attention_probs_dropout_prob) # Output. - self.dense = Linear(self.hidden_size, self.hidden_size) - self.output_dropout = torch.nn.Dropout(config.hidden_dropout_prob) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) def _transpose_for_scores(self, tensor): """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with @@ -133,7 +134,7 @@ class GPT3SelfAttention(Module): return output -class GPT3MLP(Module): +class GPT3MLP(nn.Module): """MLP. MLP will take the input with h hidden state, project it to 4*h @@ -146,12 +147,12 @@ class GPT3MLP(Module): hidden_size = config.hidden_size # Project to 4h. - self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size) + self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size) self.activation_func = F.gelu # Project back to h. - self.dense_4h_to_h = Linear(4 * hidden_size, hidden_size) + self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size) - self.dropout = Dropout(config.hidden_dropout_prob) + self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states): @@ -164,7 +165,7 @@ class GPT3MLP(Module): return output -class GPT3TransformerLayer(Module): +class GPT3TransformerLayer(nn.Module): """A single transformer layer. 
Transformer layer takes input with size [s, b, h] and returns an @@ -175,14 +176,14 @@ class GPT3TransformerLayer(Module): super().__init__() # Layernorm on the input data. - self.input_layernorm = LayerNorm( + self.input_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) # Self attention. self.attention = GPT3SelfAttention(config) # Layernorm on the attention output - self.post_attention_layernorm = LayerNorm( + self.post_attention_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) # MLP @@ -208,7 +209,7 @@ class GPT3TransformerLayer(Module): return output -class GPT3Transformer(Module): +class GPT3Transformer(nn.Module): """Transformer class.""" def __init__(self, config): @@ -223,7 +224,7 @@ class GPT3Transformer(Module): [GPT3TransformerLayer(config) for _ in range(self.num_layers)]) # Final layer norm before output. - self.final_layernorm = LayerNorm( + self.final_layernorm = nn.LayerNorm( config.hidden_size, eps=config.layernorm_epsilon) def _get_layer(self, layer_number): @@ -242,7 +243,7 @@ class GPT3Transformer(Module): return hidden_states -class GPT3TransformerLanguageModel(Module): +class GPT3TransformerLanguageModel(nn.Module): """Transformer language model. Arguments: @@ -259,10 +260,11 @@ class GPT3TransformerLanguageModel(Module): super().__init__() # Embeddings. - self.word_embeddings = Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = Embedding(config.max_position_embeddings, - config.hidden_size) - self.embedding_dropout = Dropout(config.hidden_dropout_prob) + self.word_embeddings = nn.Embedding(config.vocab_size, + config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.embedding_dropout = nn.Dropout(config.hidden_dropout_prob) # Transformer. self.transformer = GPT3Transformer(config) @@ -286,19 +288,19 @@ class GPT3Model(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, Linear): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_( mean=0.0, std=self.config.initializer_range) if module.bias is not None: module.bias.data.zero_() - elif isinstance(module, Embedding): + elif isinstance(module, nn.Embedding): module.weight.data.normal_( mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() - elif isinstance(module, LayerNorm): + elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -325,7 +327,7 @@ class GPT3Model(PreTrainedModel): logits = self.language_model(input_ids, attention_mask, position_ids) loss = None if labels is not None: - loss_fct = CrossEntropyLoss() + loss_fct = nn.CrossEntropyLoss() loss = loss_fct( logits.view(-1, self.config.vocab_size), labels.view(-1)) return addict.Dict(loss=loss, logits=logits) diff --git a/modelscope/models/nlp/gpt3/tokenizer_gpt3.py b/modelscope/models/nlp/gpt3/tokenizer_gpt3.py new file mode 100644 index 00000000..5780ddbd --- /dev/null +++ b/modelscope/models/nlp/gpt3/tokenizer_gpt3.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tokenizers import Tokenizer + + +class JiebaBPETokenizer: + """SentencePiece BPE tokenizer with Jieba integration""" + + def __init__(self, tokenizer_json_file): + self.name = 'Jieba BPE Tokenizer' + + self.tokenizer = Tokenizer.from_file(tokenizer_json_file) + self.eod_id = self.tokenizer.token_to_id('<|endoftext|>') + try: + import jieba + except ImportError: + raise ImportError( + 'You need to install rjieba to use JiebaTokenizer. ' + 'See https://pypi.org/project/rjieba/ for installation.') + self.jieba = jieba + self.new_line = self.vocab['\n'] + self.sep_token = self.vocab[''] + + @property + def vocab_size(self): + return self.tokenizer.get_vocab_size(with_added_tokens=True) + + @property + def vocab(self): + return self.tokenizer.get_vocab(with_added_tokens=True) + + @property + def inv_vocab(self): + vocab = self.vocab + inv_vocab = dict() + for key, val in vocab.items(): + inv_vocab[val] = key + return inv_vocab + + def tokenize(self, text, is_code=False): + """ + """ + if not is_code: + seg_list = [x for x in self.jieba.cut(text)] + return self.tokenizer.encode( + seg_list, is_pretokenized=True, add_special_tokens=True).ids + else: + return self.tokenizer.encode( + text, is_pretokenized=False, add_special_tokens=True).ids + + def detokenize(self, token_ids): + text = self.tokenizer.decode(token_ids, skip_special_tokens=False) + return text + + @property + def eod(self): + return self.eod_id diff --git a/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py new file mode 100644 index 00000000..325d3303 --- /dev/null +++ b/modelscope/pipelines/nlp/distributed_gpt3_pipeline.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.nlp.gpt3.distributed_gpt3 import DistributedGPT3 +from modelscope.pipelines.base import DistributedPipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import TextGenerationJiebaPreprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + Tasks.text_generation, module_name=Pipelines.gpt3_generation) +class DistributedGPT3Pipeline(DistributedPipeline): + """This class is used to instantiate the gpt3 model. 
+ """ + + model = None + + def __init__(self, model, preprocessor=None, **kwargs): + if preprocessor is None: + preprocessor = TextGenerationJiebaPreprocessor(model) + super().__init__(model, preprocessor=preprocessor, **kwargs) + assert hasattr(preprocessor, 'tokenizer') + + @classmethod + def _instantiate_one(cls, rank, model_dir, **kwargs): + cls.model = DistributedGPT3(model_dir, rank, **kwargs) + cls.model.eval() + + @classmethod + def _forward_one(cls, inputs: Dict[str, Any]) -> Dict[str, Any]: + tokens = inputs['inputs']['input_ids'].cuda( + torch.cuda.current_device()) + return cls.model.generate(tokens) + + def postprocess(self, inputs: Dict[str, Any], + **postprocess_params) -> Dict[str, str]: + """process the prediction results + + Args: + inputs (Dict[str, Any]): _description_ + + Returns: + Dict[str, str]: the prediction results + """ + from modelscope.outputs import OutputKeys + return { + OutputKeys.TEXT: + self.preprocessor.tokenizer.detokenize(inputs[0].tolist()) + } diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 43fa64a7..f7defd92 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, ) from .space import (DialogIntentPredictionPreprocessor, @@ -72,6 +73,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', ], 'space': [ diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index a753fe6c..f7478329 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -21,6 +21,7 @@ if TYPE_CHECKING: Tokenize, WordSegmentationBlankSetToLabelPreprocessor, ZeroShotClassificationPreprocessor, + TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, ) @@ -42,6 +43,7 @@ else: 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', 'ZeroShotClassificationPreprocessor', + 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', ], 'text_error_correction': [ diff --git a/modelscope/preprocessors/nlp/nlp_base.py b/modelscope/preprocessors/nlp/nlp_base.py index 3d708634..267dbb8c 100644 --- a/modelscope/preprocessors/nlp/nlp_base.py +++ b/modelscope/preprocessors/nlp/nlp_base.py @@ -494,6 +494,41 @@ class TextGenerationPreprocessor(NLPTokenizerPreprocessorBase): } +@PREPROCESSORS.register_module( + Fields.nlp, module_name=Preprocessors.text_gen_jieba_tokenizer) +class TextGenerationJiebaPreprocessor(Preprocessor): + """The jieba tokenizer preprocessor used in text generation. 
+ """ + + def __init__(self, model_dir: str, *args, **kwargs): + from modelscope.models.nlp.gpt3 import JiebaBPETokenizer + super().__init__(*args, **kwargs) + self.tokenizer = JiebaBPETokenizer( + osp.join(model_dir, 'tokenizer.json')) + + def __call__(self, data: str) -> Dict[str, Any]: + """process the raw input data + + Args: + data (str): a sentence + Example: + '深蓝的天空中挂着一轮金黄的圆月,下面是海边的沙地' + Returns: + Dict[str, Any]: the preprocessed data + Example: + {'net_input': + {'src_tokens':tensor([1,2,3,4]), + 'src_lengths': tensor([4])} + } + """ + import torch + + return { + 'input_ids': + torch.tensor(self.tokenizer.tokenize(data)).unsqueeze_(0) + } + + @PREPROCESSORS.register_module( Fields.nlp, module_name=Preprocessors.word_segment_text_to_label_preprocessor) diff --git a/modelscope/utils/nlp/distributed.py b/modelscope/utils/nlp/distributed.py index 2b590a10..53332c0f 100755 --- a/modelscope/utils/nlp/distributed.py +++ b/modelscope/utils/nlp/distributed.py @@ -35,7 +35,10 @@ def initialize_distributed(rank, mpu, world_size, model_parallel_size, init_method = 'tcp://' init_method += master_ip + ':' + master_port torch.distributed.init_process_group( - backend='nccl', world_size=8, rank=rank, init_method=init_method) + backend='nccl', + world_size=world_size, + rank=rank, + init_method=init_method) # Set the model-parallel communicators. mpu.initialize_model_parallel(model_parallel_size) diff --git a/tests/pipelines/test_gpt3_text_generation.py b/tests/pipelines/test_gpt3_text_generation.py new file mode 100644 index 00000000..413b5874 --- /dev/null +++ b/tests/pipelines/test_gpt3_text_generation.py @@ -0,0 +1,58 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.test_utils import test_level + + +class TextGPT3GenerationTest(unittest.TestCase): + + def setUp(self) -> None: + # please make sure this local path exists. 
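        # The three model ids below cover the 1.3B / 2.7B / 13B variants; only
        # the 13B checkpoint is snapshot-downloaded up front, since its test
        # expects a local directory populated with tensor-parallel shards (see
        # the test_gpt3_13B docstring below).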
+ self.model_id_1_3B = 'damo/nlp_gpt3_text-generation_1.3B' + self.model_id_2_7B = 'damo/nlp_gpt3_text-generation_2.7B' + self.model_id_13B = 'damo/nlp_gpt3_text-generation_13B' + self.model_dir_13B = snapshot_download(self.model_id_13B) + self.input = '好的' + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_gpt3_1_3B(self): + pipe = pipeline(Tasks.text_generation, model=self.model_id_1_3B) + print(pipe(self.input)) + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_gpt3_2_7B(self): + pipe = pipeline(Tasks.text_generation, model=self.model_id_2_7B) + print(pipe(self.input)) + + @unittest.skip('distributed gpt3 13B, skipped') + def test_gpt3_13B(self): + """ The model can be downloaded from the link on + TODO: add gpt3 checkpoint link + After downloading, you should have a gpt3 model structure like this: + nlp_gpt3_text-generation_13B + |_ config.json + |_ configuration.json + |_ tokenizer.json + |_ model <-- an empty directory + + Model binaries shall be downloaded separately to populate the model directory, so that + the model directory would contain the following binaries: + |_ model + |_ mp_rank_00_model_states.pt + |_ mp_rank_01_model_states.pt + |_ mp_rank_02_model_states.pt + |_ mp_rank_03_model_states.pt + |_ mp_rank_04_model_states.pt + |_ mp_rank_05_model_states.pt + |_ mp_rank_06_model_states.pt + |_ mp_rank_07_model_states.pt + """ + pipe = pipeline(Tasks.text_generation, model=self.model_dir_13B) + print(pipe(self.input)) + + +if __name__ == '__main__': + unittest.main() From cb570d586cb5f4a467de9aad1e058e3cd3276518 Mon Sep 17 00:00:00 2001 From: "shuying.shu" Date: Tue, 18 Oct 2022 16:10:10 +0800 Subject: [PATCH 57/57] add referring video object segmentation pipeline Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10400324 --- ...g_video_object_segmentation_test_video.mp4 | 3 + modelscope/metainfo.py | 2 + modelscope/models/cv/__init__.py | 3 +- .../__init__.py | 23 + .../model.py | 65 ++ .../utils/__init__.py | 4 + .../utils/backbone.py | 198 +++++ .../utils/misc.py | 234 ++++++ .../utils/mttr.py | 128 +++ .../utils/multimodal_transformer.py | 440 +++++++++++ .../utils/position_encoding_2d.py | 57 ++ .../utils/postprocessing.py | 119 +++ .../utils/segmentation.py | 137 ++++ .../utils/swin_transformer.py | 731 ++++++++++++++++++ modelscope/outputs.py | 6 + modelscope/pipelines/builder.py | 3 + modelscope/pipelines/cv/__init__.py | 4 + ...ring_video_object_segmentation_pipeline.py | 193 +++++ modelscope/utils/constant.py | 3 + requirements/cv.txt | 2 + ...est_referring_video_object_segmentation.py | 56 ++ 21 files changed, 2410 insertions(+), 1 deletion(-) create mode 100644 data/test/videos/referring_video_object_segmentation_test_video.mp4 create mode 100644 modelscope/models/cv/referring_video_object_segmentation/__init__.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/model.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/misc.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py 
create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py create mode 100644 modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py create mode 100644 modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py create mode 100644 tests/pipelines/test_referring_video_object_segmentation.py diff --git a/data/test/videos/referring_video_object_segmentation_test_video.mp4 b/data/test/videos/referring_video_object_segmentation_test_video.mp4 new file mode 100644 index 00000000..529595a5 --- /dev/null +++ b/data/test/videos/referring_video_object_segmentation_test_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb +size 2924691 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 2dbff948..fc18ead9 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -34,6 +34,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' resnet50_bert = 'resnet50-bert' + referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' fer = 'fer' retinaface = 'retinaface' shop_segmentation = 'shop-segmentation' @@ -203,6 +204,7 @@ class Pipelines(object): face_emotion = 'face-emotion' product_segmentation = 'product-segmentation' image_body_reshaping = 'flow-based-body-reshaping' + referring_video_object_segmentation = 'referring-video-object-segmentation' # nlp tasks automatic_post_editing = 'automatic-post-editing' diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index fd950f4c..64039863 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -12,7 +12,8 @@ from . import (action_recognition, animal_recognition, body_2d_keypoints, image_to_image_generation, image_to_image_translation, movie_scene_segmentation, object_detection, product_retrieval_embedding, realtime_object_detection, - salient_detection, shop_segmentation, super_resolution, + referring_video_object_segmentation, salient_detection, + shop_segmentation, super_resolution, video_single_object_tracking, video_summarization, virual_tryon) # yapf: enable diff --git a/modelscope/models/cv/referring_video_object_segmentation/__init__.py b/modelscope/models/cv/referring_video_object_segmentation/__init__.py new file mode 100644 index 00000000..58dbf7b0 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + + from .model import MovieSceneSegmentation + +else: + _import_structure = { + 'model': ['MovieSceneSegmentation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/referring_video_object_segmentation/model.py b/modelscope/models/cv/referring_video_object_segmentation/model.py new file mode 100644 index 00000000..902a3416 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/model.py @@ -0,0 +1,65 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
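# Note: this file defines ReferringVideoObjectSegmentation, registered below
# under Models.referring_video_object_segmentation; the lazy-import table in
# the package __init__.py above still references 'MovieSceneSegmentation',
# which looks like a copy-paste leftover from the movie-scene-segmentation model.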
+import os.path as osp +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .utils import (MTTR, A2DSentencesPostProcess, ReferYoutubeVOSPostProcess, + nested_tensor_from_videos_list) + +logger = get_logger() + + +@MODELS.register_module( + Tasks.referring_video_object_segmentation, + module_name=Models.referring_video_object_segmentation) +class ReferringVideoObjectSegmentation(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, *args, **kwargs) + + config_path = osp.join(model_dir, ModelFile.CONFIGURATION) + self.cfg = Config.from_file(config_path) + self.model = MTTR(**self.cfg.model) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + params_dict = torch.load(model_path, map_location='cpu') + if 'model_state_dict' in params_dict.keys(): + params_dict = params_dict['model_state_dict'] + self.model.load_state_dict(params_dict, strict=True) + + dataset_name = self.cfg.pipeline.dataset_name + if dataset_name == 'a2d_sentences' or dataset_name == 'jhmdb_sentences': + self.postprocessor = A2DSentencesPostProcess() + elif dataset_name == 'ref_youtube_vos': + self.postprocessor = ReferYoutubeVOSPostProcess() + else: + assert False, f'postprocessing for dataset: {dataset_name} is not supported' + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]: + return inputs + + def inference(self, **kwargs): + window = kwargs['window'] + text_query = kwargs['text_query'] + video_metadata = kwargs['metadata'] + + window = nested_tensor_from_videos_list([window]) + valid_indices = torch.arange(len(window.tensors)) + if self._device_name == 'gpu': + valid_indices = valid_indices.cuda() + outputs = self.model(window, valid_indices, [text_query]) + window_masks = self.postprocessor( + outputs, [video_metadata], + window.tensors.shape[-2:])[0]['pred_masks'] + return window_masks + + def postprocess(self, inputs: Dict[str, Any], **kwargs): + return inputs diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py b/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py new file mode 100644 index 00000000..796bd6f4 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
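+#
+# Re-exports the pieces of the MTTR implementation used by model.py: the MTTR network
+# itself, the video batching helper, and the dataset-specific postprocessors.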
+from .misc import nested_tensor_from_videos_list +from .mttr import MTTR +from .postprocessing import A2DSentencesPostProcess, ReferYoutubeVOSPostProcess diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py b/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py new file mode 100644 index 00000000..afa384c1 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/backbone.py @@ -0,0 +1,198 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import torch +import torch.nn.functional as F +import torchvision +from einops import rearrange +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + +from .misc import NestedTensor, is_main_process +from .swin_transformer import SwinTransformer3D + + +class VideoSwinTransformerBackbone(nn.Module): + """ + A wrapper which allows using Video-Swin Transformer as a temporal encoder for MTTR. + Check out video-swin's original paper at: https://arxiv.org/abs/2106.13230 for more info about this architecture. + Only the 'tiny' version of video swin was tested and is currently supported in our project. + Additionally, we slightly modify video-swin to make it output per-frame embeddings as required by MTTR (check our + paper's supplementary for more details), and completely discard of its 4th block. + """ + + def __init__(self, backbone_pretrained, backbone_pretrained_path, + train_backbone, running_mode, **kwargs): + super(VideoSwinTransformerBackbone, self).__init__() + # patch_size is (1, 4, 4) instead of the original (2, 4, 4). + # this prevents swinT's original temporal downsampling so we can get per-frame features. + swin_backbone = SwinTransformer3D( + patch_size=(1, 4, 4), + embed_dim=96, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + window_size=(8, 7, 7), + drop_path_rate=0.1, + patch_norm=True) + if backbone_pretrained and running_mode == 'train': + state_dict = torch.load(backbone_pretrained_path)['state_dict'] + # extract swinT's kinetics-400 pretrained weights and ignore the rest (prediction head etc.) + state_dict = { + k[9:]: v + for k, v in state_dict.items() if 'backbone.' 
in k + } + + # sum over the patch embedding weight temporal dim [96, 3, 2, 4, 4] --> [96, 3, 1, 4, 4] + patch_embed_weight = state_dict['patch_embed.proj.weight'] + patch_embed_weight = patch_embed_weight.sum(dim=2, keepdims=True) + state_dict['patch_embed.proj.weight'] = patch_embed_weight + swin_backbone.load_state_dict(state_dict) + + self.patch_embed = swin_backbone.patch_embed + self.pos_drop = swin_backbone.pos_drop + self.layers = swin_backbone.layers[:-1] + self.downsamples = nn.ModuleList() + for layer in self.layers: + self.downsamples.append(layer.downsample) + layer.downsample = None + self.downsamples[ + -1] = None # downsampling after the last layer is not necessary + + self.layer_output_channels = [ + swin_backbone.embed_dim * 2**i for i in range(len(self.layers)) + ] + self.train_backbone = train_backbone + if not train_backbone: + for parameter in self.parameters(): + parameter.requires_grad_(False) + + def forward(self, samples: NestedTensor): + vid_frames = rearrange(samples.tensors, 't b c h w -> b c t h w') + + vid_embeds = self.patch_embed(vid_frames) + vid_embeds = self.pos_drop(vid_embeds) + layer_outputs = [] # layer outputs before downsampling + for layer, downsample in zip(self.layers, self.downsamples): + vid_embeds = layer(vid_embeds.contiguous()) + layer_outputs.append(vid_embeds) + if downsample: + vid_embeds = rearrange(vid_embeds, 'b c t h w -> b t h w c') + vid_embeds = downsample(vid_embeds) + vid_embeds = rearrange(vid_embeds, 'b t h w c -> b c t h w') + layer_outputs = [ + rearrange(o, 'b c t h w -> t b c h w') for o in layer_outputs + ] + + outputs = [] + orig_pad_mask = samples.mask + for l_out in layer_outputs: + pad_mask = F.interpolate( + orig_pad_mask.float(), size=l_out.shape[-2:]).to(torch.bool) + outputs.append(NestedTensor(l_out, pad_mask)) + return outputs + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + Modified from DETR https://github.com/facebookresearch/detr + BatchNorm2d where the batch statistics and the affine parameters are fixed. + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer('weight', torch.ones(n)) + self.register_buffer('bias', torch.zeros(n)) + self.register_buffer('running_mean', torch.zeros(n)) + self.register_buffer('running_var', torch.ones(n)) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, + self)._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, + unexpected_keys, error_msgs) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class ResNetBackbone(nn.Module): + """ + Modified from DETR https://github.com/facebookresearch/detr + ResNet backbone with frozen BatchNorm. 
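+    Used by init_backbone (below) when a torchvision ResNet variant is requested instead
+    of 'swin-t'; the video frames are folded into the batch dimension and per-layer
+    feature maps are returned as NestedTensors.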
+ """ + + def __init__(self, + backbone_name: str = 'resnet50', + train_backbone: bool = True, + dilation: bool = True, + **kwargs): + super(ResNetBackbone, self).__init__() + backbone = getattr(torchvision.models, backbone_name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), + norm_layer=FrozenBatchNorm2d) + for name, parameter in backbone.named_parameters(): + if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: + parameter.requires_grad_(False) + return_layers = { + 'layer1': '0', + 'layer2': '1', + 'layer3': '2', + 'layer4': '3' + } + self.body = IntermediateLayerGetter( + backbone, return_layers=return_layers) + output_channels = 512 if backbone_name in ('resnet18', + 'resnet34') else 2048 + self.layer_output_channels = [ + output_channels // 8, output_channels // 4, output_channels // 2, + output_channels + ] + + def forward(self, tensor_list: NestedTensor): + t, b, _, _, _ = tensor_list.tensors.shape + video_frames = rearrange(tensor_list.tensors, + 't b c h w -> (t b) c h w') + padding_masks = rearrange(tensor_list.mask, 't b h w -> (t b) h w') + features_list = self.body(video_frames) + out = [] + for _, f in features_list.items(): + resized_padding_masks = F.interpolate( + padding_masks[None].float(), + size=f.shape[-2:]).to(torch.bool)[0] + f = rearrange(f, '(t b) c h w -> t b c h w', t=t, b=b) + resized_padding_masks = rearrange( + resized_padding_masks, '(t b) h w -> t b h w', t=t, b=b) + out.append(NestedTensor(f, resized_padding_masks)) + return out + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +def init_backbone(backbone_name, **kwargs): + if backbone_name == 'swin-t': + return VideoSwinTransformerBackbone(**kwargs) + elif 'resnet' in backbone_name: + return ResNetBackbone(backbone_name, **kwargs) + assert False, f'error: backbone "{backbone_name}" is not supported' diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py b/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py new file mode 100644 index 00000000..ecf34b8c --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/misc.py @@ -0,0 +1,234 @@ +# Modified from DETR https://github.com/facebookresearch/detr +# Misc functions. +# Mostly copy-paste from torchvision references. 
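+# NestedTensor and the padding/batching helpers below are used by the backbone and model;
+# the gather/reduce utilities are only needed for distributed (multi-GPU) training.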
+ +import pickle +from typing import List, Optional + +import torch +import torch.distributed as dist +# needed due to empty tensor bug in pytorch and torchvision 0.5 +import torchvision +from torch import Tensor + +if float(torchvision.__version__.split('.')[1]) < 7.0: + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to('cuda') + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device='cuda') + size_list = [torch.tensor([0], device='cuda') for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append( + torch.empty((max_size, ), dtype=torch.uint8, device='cuda')) + if local_size != max_size: + padding = torch.empty( + size=(max_size - local_size, ), dtype=torch.uint8, device='cuda') + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + """ + This function receives a list of image tensors and returns a NestedTensor of the padded images, along with their + padding masks (true for padding areas, false otherwise). 
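+    For example, two images of shapes [3, 480, 640] and [3, 360, 500] are padded into a
+    tensor of shape [2, 3, 480, 640] together with a boolean mask of shape [2, 480, 640].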
+ """ + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[:img.shape[0], :img.shape[1], :img.shape[2]].copy_(img) + m[:img.shape[1], :img.shape[2]] = False + return NestedTensor(tensor, mask) + + +def nested_tensor_from_videos_list(videos_list: List[Tensor]): + """ + This function receives a list of videos (each of shape [T, C, H, W]) and returns a NestedTensor of the padded + videos (shape [T, B, C, PH, PW], along with their padding masks (true for padding areas, false otherwise, of shape + [T, B, PH, PW]. + """ + max_size = _max_by_axis([list(img.shape) for img in videos_list]) + padded_batch_shape = [len(videos_list)] + max_size + b, t, c, h, w = padded_batch_shape + dtype = videos_list[0].dtype + device = videos_list[0].device + padded_videos = torch.zeros(padded_batch_shape, dtype=dtype, device=device) + videos_pad_masks = torch.ones((b, t, h, w), + dtype=torch.bool, + device=device) + for vid_frames, pad_vid_frames, vid_pad_m in zip(videos_list, + padded_videos, + videos_pad_masks): + pad_vid_frames[:vid_frames.shape[0], :, :vid_frames. + shape[2], :vid_frames.shape[3]].copy_(vid_frames) + vid_pad_m[:vid_frames.shape[0], :vid_frames.shape[2], :vid_frames. + shape[3]] = False + # transpose the temporal and batch dims and create a NestedTensor: + return NestedTensor( + padded_videos.transpose(0, 1), videos_pad_masks.transpose(0, 1)) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def interpolate(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ + Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. 
+ """ + if float(torchvision.__version__.split('.')[1]) < 7.0: + if input.numel() > 0: + return torch.nn.functional.interpolate(input, size, scale_factor, + mode, align_corners) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, + mode, align_corners) diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py b/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py new file mode 100644 index 00000000..e603df6c --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/mttr.py @@ -0,0 +1,128 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import torch +import torch.nn.functional as F +from einops import rearrange +from torch import nn + +from .backbone import init_backbone +from .misc import NestedTensor +from .multimodal_transformer import MultimodalTransformer +from .segmentation import FPNSpatialDecoder + + +class MTTR(nn.Module): + """ The main module of the Multimodal Tracking Transformer """ + + def __init__(self, + num_queries, + mask_kernels_dim=8, + aux_loss=False, + **kwargs): + """ + Parameters: + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + MTTR can detect in a single image. In our paper we use 50 in all settings. + mask_kernels_dim: dim of the segmentation kernels and of the feature maps outputted by the spatial decoder. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.backbone = init_backbone(**kwargs) + self.transformer = MultimodalTransformer(**kwargs) + d_model = self.transformer.d_model + self.is_referred_head = nn.Linear( + d_model, + 2) # binary 'is referred?' prediction head for object queries + self.instance_kernels_head = MLP( + d_model, d_model, output_dim=mask_kernels_dim, num_layers=2) + self.obj_queries = nn.Embedding( + num_queries, d_model) # pos embeddings for the object queries + self.vid_embed_proj = nn.Conv2d( + self.backbone.layer_output_channels[-1], d_model, kernel_size=1) + self.spatial_decoder = FPNSpatialDecoder( + d_model, self.backbone.layer_output_channels[:-1][::-1], + mask_kernels_dim) + self.aux_loss = aux_loss + + def forward(self, samples: NestedTensor, valid_indices, text_queries): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: Batched frames of shape [time x batch_size x 3 x H x W] + - samples.mask: A binary mask of shape [time x batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_is_referred": The reference prediction logits for all queries. + Shape: [time x batch_size x num_queries x 2] + - "pred_masks": The mask logits for all queries. + Shape: [time x batch_size x num_queries x H_mask x W_mask] + - "aux_outputs": Optional, only returned when auxiliary losses are activated. It is a list of + dictionaries containing the two above keys for each decoder layer. + """ + backbone_out = self.backbone(samples) + # keep only the valid frames (frames which are annotated): + # (for example, in a2d-sentences only the center frame in each window is annotated). 
+ for layer_out in backbone_out: + layer_out.tensors = layer_out.tensors.index_select( + 0, valid_indices) + layer_out.mask = layer_out.mask.index_select(0, valid_indices) + bbone_final_layer_output = backbone_out[-1] + vid_embeds, vid_pad_mask = bbone_final_layer_output.decompose() + + T, B, _, _, _ = vid_embeds.shape + vid_embeds = rearrange(vid_embeds, 't b c h w -> (t b) c h w') + vid_embeds = self.vid_embed_proj(vid_embeds) + vid_embeds = rearrange( + vid_embeds, '(t b) c h w -> t b c h w', t=T, b=B) + + transformer_out = self.transformer(vid_embeds, vid_pad_mask, + text_queries, + self.obj_queries.weight) + # hs is: [L, T, B, N, D] where L is number of decoder layers + # vid_memory is: [T, B, D, H, W] + # txt_memory is a list of length T*B of [S, C] where S might be different for each sentence + # encoder_middle_layer_outputs is a list of [T, B, H, W, D] + hs, vid_memory, txt_memory = transformer_out + + vid_memory = rearrange(vid_memory, 't b d h w -> (t b) d h w') + bbone_middle_layer_outputs = [ + rearrange(o.tensors, 't b d h w -> (t b) d h w') + for o in backbone_out[:-1][::-1] + ] + decoded_frame_features = self.spatial_decoder( + vid_memory, bbone_middle_layer_outputs) + decoded_frame_features = rearrange( + decoded_frame_features, '(t b) d h w -> t b d h w', t=T, b=B) + instance_kernels = self.instance_kernels_head(hs) # [L, T, B, N, C] + # output masks is: [L, T, B, N, H_mask, W_mask] + output_masks = torch.einsum('ltbnc,tbchw->ltbnhw', instance_kernels, + decoded_frame_features) + outputs_is_referred = self.is_referred_head(hs) # [L, T, B, N, 2] + + layer_outputs = [] + for pm, pir in zip(output_masks, outputs_is_referred): + layer_out = {'pred_masks': pm, 'pred_is_referred': pir} + layer_outputs.append(layer_out) + out = layer_outputs[ + -1] # the output for the last decoder layer is used by default + if self.aux_loss: + out['aux_outputs'] = layer_outputs[:-1] + return out + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py b/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py new file mode 100644 index 00000000..8c24e397 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/multimodal_transformer.py @@ -0,0 +1,440 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# MTTR Multimodal Transformer class. 
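+# It flattens the per-frame visual features, concatenates them with RoBERTa text embeddings
+# along the sequence dimension, runs a transformer encoder over the joint sequence, and
+# decodes a fixed set of object queries against the resulting memory.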
+# Modified from DETR https://github.com/facebookresearch/detr + +import copy +import os +from typing import Optional + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import Tensor, nn +from transformers import RobertaModel, RobertaTokenizerFast + +from .position_encoding_2d import PositionEmbeddingSine2D + +os.environ[ + 'TOKENIZERS_PARALLELISM'] = 'false' # this disables a huggingface tokenizer warning (printed every epoch) + + +class MultimodalTransformer(nn.Module): + + def __init__(self, + num_encoder_layers=3, + num_decoder_layers=3, + text_encoder_type='roberta-base', + freeze_text_encoder=True, + **kwargs): + super().__init__() + self.d_model = kwargs['d_model'] + encoder_layer = TransformerEncoderLayer(**kwargs) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers) + decoder_layer = TransformerDecoderLayer(**kwargs) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + norm=nn.LayerNorm(self.d_model), + return_intermediate=True) + self.pos_encoder_2d = PositionEmbeddingSine2D() + self._reset_parameters() + + self.text_encoder = RobertaModel.from_pretrained(text_encoder_type) + self.text_encoder.pooler = None # this pooler is never used, this is a hack to avoid DDP problems... + self.tokenizer = RobertaTokenizerFast.from_pretrained( + text_encoder_type) + self.freeze_text_encoder = freeze_text_encoder + if freeze_text_encoder: + for p in self.text_encoder.parameters(): + p.requires_grad_(False) + + self.txt_proj = FeatureResizer( + input_feat_size=self.text_encoder.config.hidden_size, + output_feat_size=self.d_model, + dropout=kwargs['dropout'], + ) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, vid_embeds, vid_pad_mask, text_queries, obj_queries): + device = vid_embeds.device + t, b, _, h, w = vid_embeds.shape + + txt_memory, txt_pad_mask = self.forward_text(text_queries, device) + # add temporal dim to txt memory & padding mask: + txt_memory = repeat(txt_memory, 's b c -> s (t b) c', t=t) + txt_pad_mask = repeat(txt_pad_mask, 'b s -> (t b) s', t=t) + + vid_embeds = rearrange(vid_embeds, 't b c h w -> (h w) (t b) c') + # Concat the image & text embeddings on the sequence dimension + encoder_src_seq = torch.cat((vid_embeds, txt_memory), dim=0) + seq_mask = torch.cat( + (rearrange(vid_pad_mask, 't b h w -> (t b) (h w)'), txt_pad_mask), + dim=1) + # vid_pos_embed is: [T*B, H, W, d_model] + vid_pos_embed = self.pos_encoder_2d( + rearrange(vid_pad_mask, 't b h w -> (t b) h w'), self.d_model) + # use zeros in place of pos embeds for the text sequence: + pos_embed = torch.cat( + (rearrange(vid_pos_embed, 't_b h w c -> (h w) t_b c'), + torch.zeros_like(txt_memory)), + dim=0) + + memory = self.encoder( + encoder_src_seq, src_key_padding_mask=seq_mask, + pos=pos_embed) # [S, T*B, C] + vid_memory = rearrange( + memory[:h * w, :, :], + '(h w) (t b) c -> t b c h w', + h=h, + w=w, + t=t, + b=b) + txt_memory = memory[h * w:, :, :] + txt_memory = rearrange(txt_memory, 's t_b c -> t_b s c') + txt_memory = [ + t_mem[~pad_mask] + for t_mem, pad_mask in zip(txt_memory, txt_pad_mask) + ] # remove padding + + # add T*B dims to query embeds (was: [N, C], where N is the number of object queries): + obj_queries = repeat(obj_queries, 'n c -> n (t b) c', t=t, b=b) + tgt = torch.zeros_like(obj_queries) # [N, T*B, C] + + # hs is [L, N, T*B, C] where L is number of layers in the decoder + hs = self.decoder( + tgt, + memory, + 
memory_key_padding_mask=seq_mask, + pos=pos_embed, + query_pos=obj_queries) + hs = rearrange(hs, 'l n (t b) c -> l t b n c', t=t, b=b) + return hs, vid_memory, txt_memory + + def forward_text(self, text_queries, device): + tokenized_queries = self.tokenizer.batch_encode_plus( + text_queries, padding='longest', return_tensors='pt') + tokenized_queries = tokenized_queries.to(device) + with torch.inference_mode(mode=self.freeze_text_encoder): + encoded_text = self.text_encoder(**tokenized_queries) + # Transpose memory because pytorch's attention expects sequence first + txt_memory = rearrange(encoded_text.last_hidden_state, + 'b s c -> s b c') + txt_memory = self.txt_proj( + txt_memory) # change text embeddings dim to model dim + # Invert attention mask that we get from huggingface because its the opposite in pytorch transformer + txt_pad_mask = tokenized_queries.attention_mask.ne(1).bool() # [B, S] + return txt_memory, txt_pad_mask + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, + query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, + d_model, + nheads, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False, + **kwargs): + super().__init__() + self.self_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = 
normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn( + q, + k, + value=src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn( + q, + k, + value=src2, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, + d_model, + nheads, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + normalize_before=False, + **kwargs): + super().__init__() + self.self_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention( + d_model, nheads, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + 
memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, + k, + value=tgt2, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, + pos, query_pos) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). + """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == 'relu': + return F.relu + if activation == 'gelu': + return F.gelu + if activation == 'glu': + return F.glu + raise RuntimeError(F'activation should be relu/gelu, not {activation}.') diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py b/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py new file mode 100644 index 00000000..f9ef05a1 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/position_encoding_2d.py @@ -0,0 +1,57 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from DETR https://github.com/facebookresearch/detr +# 2D sine positional encodings for the visual features in the multimodal transformer. + +import math + +import torch +from torch import Tensor, nn + + +class PositionEmbeddingSine2D(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
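+    Each spatial axis gets hidden_dim // 2 channels: the (optionally normalized) cumulative
+    row/column index is divided by temperature**(2i / (hidden_dim // 2)) and passed through
+    interleaved sin/cos, and the y and x encodings are concatenated along the channel dim.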
+ """ + + def __init__(self, temperature=10000, normalize=True, scale=None): + super().__init__() + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError('normalize should be True if scale is passed') + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, mask: Tensor, hidden_dim: int): + """ + @param mask: a tensor of shape [B, H, W] + @param hidden_dim: int + @return: + """ + num_pos_feats = hidden_dim // 2 + + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3) + return pos diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py new file mode 100644 index 00000000..64582140 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py @@ -0,0 +1,119 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR + +import numpy as np +import pycocotools.mask as mask_util +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +class A2DSentencesPostProcess(nn.Module): + """ + This module converts the model's output into the format expected by the coco api for the given task + """ + + def __init__(self): + super(A2DSentencesPostProcess, self).__init__() + + @torch.inference_mode() + def forward(self, outputs, resized_padded_sample_size, + resized_sample_sizes, orig_sample_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + resized_padded_sample_size: size of samples (input to model) after size augmentation + padding. + resized_sample_sizes: size of samples after size augmentation but without padding. 
+ orig_sample_sizes: original size of the samples (no augmentations or padding) + """ + pred_is_referred = outputs['pred_is_referred'] + prob = F.softmax(pred_is_referred, dim=-1) + scores = prob[..., 0] + pred_masks = outputs['pred_masks'] + pred_masks = F.interpolate( + pred_masks, + size=resized_padded_sample_size, + mode='bilinear', + align_corners=False) + pred_masks = (pred_masks.sigmoid() > 0.5) + processed_pred_masks, rle_masks = [], [] + for f_pred_masks, resized_size, orig_size in zip( + pred_masks, resized_sample_sizes, orig_sample_sizes): + f_mask_h, f_mask_w = resized_size # resized shape without padding + # remove the samples' padding + f_pred_masks_no_pad = f_pred_masks[:, :f_mask_h, : + f_mask_w].unsqueeze(1) + # resize the samples back to their original dataset (target) size for evaluation + f_pred_masks_processed = F.interpolate( + f_pred_masks_no_pad.float(), size=orig_size, mode='nearest') + f_pred_rle_masks = [ + mask_util.encode( + np.array( + mask[0, :, :, np.newaxis], dtype=np.uint8, + order='F'))[0] + for mask in f_pred_masks_processed.cpu() + ] + processed_pred_masks.append(f_pred_masks_processed) + rle_masks.append(f_pred_rle_masks) + predictions = [{ + 'scores': s, + 'masks': m, + 'rle_masks': rle + } for s, m, rle in zip(scores, processed_pred_masks, rle_masks)] + return predictions + + +class ReferYoutubeVOSPostProcess(nn.Module): + """ + This module converts the model's output into the format expected by the coco api for the given task + """ + + def __init__(self): + super(ReferYoutubeVOSPostProcess, self).__init__() + + @torch.inference_mode() + def forward(self, outputs, videos_metadata, samples_shape_with_padding): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + videos_metadata: a dictionary with each video's metadata. + samples_shape_with_padding: size of the batch frames with padding. 
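+        Returns:
+            a list with one dict per video, merging that video's metadata with 'pred_masks',
+            a uint8 CPU tensor of binary masks resized back to the original frame size.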
+ """ + pred_is_referred = outputs['pred_is_referred'] + prob_is_referred = F.softmax(pred_is_referred, dim=-1) + # note we average on the temporal dim to compute score per trajectory: + trajectory_scores = prob_is_referred[..., 0].mean(dim=0) + pred_trajectory_indices = torch.argmax(trajectory_scores, dim=-1) + pred_masks = rearrange(outputs['pred_masks'], + 't b nq h w -> b t nq h w') + # keep only the masks of the chosen trajectories: + b = pred_masks.shape[0] + pred_masks = pred_masks[torch.arange(b), :, pred_trajectory_indices] + # resize the predicted masks to the size of the model input (which might include padding) + pred_masks = F.interpolate( + pred_masks, + size=samples_shape_with_padding, + mode='bilinear', + align_corners=False) + # apply a threshold to create binary masks: + pred_masks = (pred_masks.sigmoid() > 0.5) + # remove the padding per video (as videos might have different resolutions and thus different padding): + preds_by_video = [] + for video_pred_masks, video_metadata in zip(pred_masks, + videos_metadata): + # size of the model input batch frames without padding: + resized_h, resized_w = video_metadata['resized_frame_size'] + video_pred_masks = video_pred_masks[:, :resized_h, : + resized_w].unsqueeze( + 1) # remove the padding + # resize the masks back to their original frames dataset size for evaluation: + original_frames_size = video_metadata['original_frame_size'] + tuple_size = tuple(original_frames_size.cpu().numpy()) + video_pred_masks = F.interpolate( + video_pred_masks.float(), size=tuple_size, mode='nearest') + video_pred_masks = video_pred_masks.to(torch.uint8).cpu() + # combine the predicted masks and the video metadata to create a final predictions dict: + video_pred = {**video_metadata, **{'pred_masks': video_pred_masks}} + preds_by_video.append(video_pred) + return preds_by_video diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py b/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py new file mode 100644 index 00000000..b3228820 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/segmentation.py @@ -0,0 +1,137 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from DETR https://github.com/facebookresearch/detr + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class FPNSpatialDecoder(nn.Module): + """ + An FPN-like spatial decoder. Generates high-res, semantically rich features which serve as the base for creating + instance segmentation masks. 
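+    It fuses the transformer's spatial memory (context_dim channels) with the backbone's
+    intermediate feature maps (fpn_dims, ordered from deeper to shallower layers) and outputs
+    mask_kernels_dim-channel maps at the resolution of the shallowest fused feature map.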
+ """ + + def __init__(self, context_dim, fpn_dims, mask_kernels_dim=8): + super().__init__() + + inter_dims = [ + context_dim, context_dim // 2, context_dim // 4, context_dim // 8, + context_dim // 16 + ] + self.lay1 = torch.nn.Conv2d(context_dim, inter_dims[0], 3, padding=1) + self.gn1 = torch.nn.GroupNorm(8, inter_dims[0]) + self.lay2 = torch.nn.Conv2d(inter_dims[0], inter_dims[1], 3, padding=1) + self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) + self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) + self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) + self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.context_dim = context_dim + + self.add_extra_layer = len(fpn_dims) == 3 + if self.add_extra_layer: + self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + self.lay5 = torch.nn.Conv2d( + inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) + self.out_lay = torch.nn.Conv2d( + inter_dims[4], mask_kernels_dim, 3, padding=1) + else: + self.out_lay = torch.nn.Conv2d( + inter_dims[3], mask_kernels_dim, 3, padding=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, layer_features: List[Tensor]): + x = self.lay1(x) + x = self.gn1(x) + x = F.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = F.relu(x) + + cur_fpn = self.adapter1(layer_features[0]) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay3(x) + x = self.gn3(x) + x = F.relu(x) + + cur_fpn = self.adapter2(layer_features[1]) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay4(x) + x = self.gn4(x) + x = F.relu(x) + + if self.add_extra_layer: + cur_fpn = self.adapter3(layer_features[2]) + x = cur_fpn + F.interpolate( + x, size=cur_fpn.shape[-2:], mode='nearest') + x = self.lay5(x) + x = self.gn5(x) + x = F.relu(x) + + x = self.out_lay(x) + return x + + def num_parameters(self): + return sum(p.numel() for p in self.parameters() if p.requires_grad) + + +def dice_loss(inputs, targets, num_masks): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_masks + + +def sigmoid_focal_loss(inputs, + targets, + num_masks, + alpha: float = 0.25, + gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). 
+ gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits( + inputs, targets, reduction='none') + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t)**gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_masks diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py b/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py new file mode 100644 index 00000000..9a08ef48 --- /dev/null +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/swin_transformer.py @@ -0,0 +1,731 @@ +# The implementation is adopted from MTTR, +# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR +# Modified from Video-Swin-Transformer https://github.com/SwinTransformer/Video-Swin-Transformer + +from functools import lru_cache, reduce +from operator import mul + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from einops import rearrange +from timm.models.layers import DropPath, trunc_normal_ + + +class Mlp(nn.Module): + """ Multilayer perceptron.""" + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, D, H, W, C) + window_size (tuple[int]): window size + + Returns: + windows: (B*num_windows, window_size*window_size, C) + """ + B, D, H, W, C = x.shape + x = x.view(B, D // window_size[0], window_size[0], H // window_size[1], + window_size[1], W // window_size[2], window_size[2], C) + windows = x.permute(0, 1, 3, 5, 2, 4, 6, + 7).contiguous().view(-1, reduce(mul, window_size), C) + return windows + + +def window_reverse(windows, window_size, B, D, H, W): + """ + Args: + windows: (B*num_windows, window_size, window_size, C) + window_size (tuple[int]): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, D, H, W, C) + """ + x = windows.view(B, D // window_size[0], H // window_size[1], + W // window_size[2], window_size[0], window_size[1], + window_size[2], -1) + x = x.permute(0, 1, 4, 2, 5, 3, 6, 7).contiguous().view(B, D, H, W, -1) + return x + + +def get_window_size(x_size, window_size, shift_size=None): + use_window_size = list(window_size) + if shift_size is not None: + use_shift_size = list(shift_size) + for i in range(len(x_size)): + if x_size[i] <= window_size[i]: + use_window_size[i] = x_size[i] + if shift_size is not None: + use_shift_size[i] = 0 + + if shift_size is None: + return tuple(use_window_size) + else: + return tuple(use_window_size), tuple(use_shift_size) + + +class WindowAttention3D(nn.Module): + """ Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. 
+ window_size (tuple[int]): The temporal length, height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wd, Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + wd, wh, ww = window_size + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * wd - 1) * (2 * wh - 1) * (2 * ww - 1), num_heads)) + + # get pair-wise relative position index for each token inside the window + coords_d = torch.arange(self.window_size[0]) + coords_h = torch.arange(self.window_size[1]) + coords_w = torch.arange(self.window_size[2]) + coords = torch.stack(torch.meshgrid(coords_d, coords_h, + coords_w)) # 3, Wd, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 3, Wd*Wh*Ww, Wd*Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0).contiguous() # Wd*Wh*Ww, Wd*Wh*Ww, 3 + relative_coords[:, :, + 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 2] += self.window_size[2] - 1 + + relative_coords[:, :, 0] *= (2 * self.window_size[1] + - 1) * (2 * self.window_size[2] - 1) + relative_coords[:, :, 1] *= (2 * self.window_size[2] - 1) + relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww + self.register_buffer('relative_position_index', + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, N, N) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # B_, nH, N, C + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index[:N, :N].reshape(-1)].reshape( + N, N, -1) # Wd*Wh*Ww,Wd*Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wd*Wh*Ww, Wd*Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock3D(nn.Module): + """ Swin Transformer Block. 
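+    Each block applies 3D (shifted-)window multi-head self-attention followed by an MLP,
+    both with pre-LayerNorm, stochastic depth and residual connections.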
+ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): Window size. + shift_size (tuple[int]): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + num_heads, + window_size=(2, 7, 7), + shift_size=(0, 0, 0), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_checkpoint=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.use_checkpoint = use_checkpoint + + assert 0 <= self.shift_size[0] < self.window_size[ + 0], 'shift_size must in 0-window_size' + assert 0 <= self.shift_size[1] < self.window_size[ + 1], 'shift_size must in 0-window_size' + assert 0 <= self.shift_size[2] < self.window_size[ + 2], 'shift_size must in 0-window_size' + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention3D( + dim, + window_size=self.window_size, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward_part1(self, x, mask_matrix): + B, D, H, W, C = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + + x = self.norm1(x) + # pad feature maps to multiples of window size + pad_l = pad_t = pad_d0 = 0 + pad_d1 = (window_size[0] - D % window_size[0]) % window_size[0] + pad_b = (window_size[1] - H % window_size[1]) % window_size[1] + pad_r = (window_size[2] - W % window_size[2]) % window_size[2] + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b, pad_d0, pad_d1)) + _, Dp, Hp, Wp, _ = x.shape + # cyclic shift + if any(i > 0 for i in shift_size): + shifted_x = torch.roll( + x, + shifts=(-shift_size[0], -shift_size[1], -shift_size[2]), + dims=(1, 2, 3)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + # partition windows + x_windows = window_partition(shifted_x, + window_size) # B*nW, Wd*Wh*Ww, C + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask) # B*nW, Wd*Wh*Ww, C + # merge windows + attn_windows = attn_windows.view(-1, *(window_size + (C, ))) + shifted_x = window_reverse(attn_windows, window_size, B, Dp, Hp, + Wp) # B D' H' W' C + # reverse cyclic shift + if any(i > 0 for i in shift_size): + x = torch.roll( + shifted_x, + shifts=(shift_size[0], shift_size[1], shift_size[2]), + dims=(1, 2, 3)) + else: + x = shifted_x + + if pad_d1 > 0 or pad_r > 0 or pad_b > 0: + x = x[:, :D, :H, :W, :].contiguous() + return x + + def forward_part2(self, x): + return self.drop_path(self.mlp(self.norm2(x))) + + def forward(self, x, mask_matrix): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). + mask_matrix: Attention mask for cyclic shift. + """ + + shortcut = x + if self.use_checkpoint: + x = checkpoint.checkpoint(self.forward_part1, x, mask_matrix) + else: + x = self.forward_part1(x, mask_matrix) + x = shortcut + self.drop_path(x) + + if self.use_checkpoint: + x = x + checkpoint.checkpoint(self.forward_part2, x) + else: + x = x + self.forward_part2(x) + + return x + + +class PatchMerging(nn.Module): + """ Patch Merging Layer + + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, D, H, W, C). 
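forward_part1 above pads the feature map up to window multiples and applies a cyclic shift before windowed attention, then undoes both afterwards. A minimal sketch of those two steps, with toy tensor sizes that are assumptions rather than values from the patch:

    import torch
    import torch.nn.functional as F

    x = torch.arange(2 * 8 * 8, dtype=torch.float).view(1, 2, 8, 8, 1)  # B, D, H, W, C
    window_size, shift_size = (2, 7, 7), (1, 3, 3)

    # pad H and W up to multiples of the window size (D is already a multiple)
    pad_b = (window_size[1] - 8 % window_size[1]) % window_size[1]
    pad_r = (window_size[2] - 8 % window_size[2]) % window_size[2]
    xp = F.pad(x, (0, 0, 0, pad_r, 0, pad_b, 0, 0))
    print(xp.shape)  # torch.Size([1, 2, 14, 14, 1])

    # the cyclic shift is exactly undone by the reverse roll after attention
    shifted = torch.roll(xp, shifts=(-1, -3, -3), dims=(1, 2, 3))
    restored = torch.roll(shifted, shifts=(1, 3, 3), dims=(1, 2, 3))
    print(torch.equal(xp, restored))  # True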
+ """ + B, D, H, W, C = x.shape + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, :, 0::2, 0::2, :] # B D H/2 W/2 C + x1 = x[:, :, 1::2, 0::2, :] # B D H/2 W/2 C + x2 = x[:, :, 0::2, 1::2, :] # B D H/2 W/2 C + x3 = x[:, :, 1::2, 1::2, :] # B D H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B D H/2 W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +# cache each stage results +@lru_cache() +def compute_mask(D, H, W, window_size, shift_size, device): + img_mask = torch.zeros((1, D, H, W, 1), device=device) # 1 Dp Hp Wp 1 + cnt = 0 + for d in slice(-window_size[0]), slice(-window_size[0], + -shift_size[0]), slice( + -shift_size[0], None): + for h in slice(-window_size[1]), slice(-window_size[1], + -shift_size[1]), slice( + -shift_size[1], None): + for w in slice(-window_size[2]), slice(-window_size[2], + -shift_size[2]), slice( + -shift_size[2], None): + img_mask[:, d, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, + window_size) # nW, ws[0]*ws[1]*ws[2], 1 + mask_windows = mask_windows.squeeze(-1) # nW, ws[0]*ws[1]*ws[2] + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + return attn_mask + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (tuple[int]): Local window size. Default: (1,7,7). + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__(self, + dim, + depth, + num_heads, + window_size=(1, 7, 7), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + super().__init__() + self.window_size = window_size + self.shift_size = tuple(i // 2 for i in window_size) + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock3D( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=(0, 0, 0) if (i % 2 == 0) else self.shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_checkpoint=use_checkpoint, + ) for i in range(depth) + ]) + + self.downsample = downsample + if self.downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + + def forward(self, x): + """ Forward function. + + Args: + x: Input feature, tensor size (B, C, D, H, W). 
+ """ + # calculate attention mask for SW-MSA + B, C, D, H, W = x.shape + window_size, shift_size = get_window_size((D, H, W), self.window_size, + self.shift_size) + x = rearrange(x, 'b c d h w -> b d h w c') + Dp = int(np.ceil(D / window_size[0])) * window_size[0] + Hp = int(np.ceil(H / window_size[1])) * window_size[1] + Wp = int(np.ceil(W / window_size[2])) * window_size[2] + attn_mask = compute_mask(Dp, Hp, Wp, window_size, shift_size, x.device) + for blk in self.blocks: + x = blk(x, attn_mask) + x = x.view(B, D, H, W, -1) + + if self.downsample is not None: + x = self.downsample(x) + x = rearrange(x, 'b d h w c -> b c d h w') + return x + + +class PatchEmbed3D(nn.Module): + """ Video to Patch Embedding. + + Args: + patch_size (int): Patch token size. Default: (2,4,4). + in_chans (int): Number of input video channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, + patch_size=(2, 4, 4), + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv3d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, D, H, W = x.size() + if W % self.patch_size[2] != 0: + x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2])) + if H % self.patch_size[1] != 0: + x = F.pad(x, + (0, 0, 0, self.patch_size[1] - H % self.patch_size[1])) + if D % self.patch_size[0] != 0: + x = F.pad( + x, + (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0])) + + x = self.proj(x) # B C D Wh Ww + if self.norm is not None: + D, Wh, Ww = x.size(2), x.size(3), x.size(4) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww) + + return x + + +class SwinTransformer3D(nn.Module): + """ Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + patch_size (int | tuple(int)): Patch size. Default: (4,4,4). + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: Truee + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer: Normalization layer. Default: nn.LayerNorm. + patch_norm (bool): If True, add normalization after patch embedding. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. 
+ """ + + def __init__(self, + pretrained=None, + pretrained2d=True, + patch_size=(4, 4, 4), + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=(2, 7, 7), + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + patch_norm=False, + frozen_stages=-1, + use_checkpoint=False): + super().__init__() + + self.pretrained = pretrained + self.pretrained2d = pretrained2d + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.frozen_stages = frozen_stages + self.window_size = window_size + self.patch_size = patch_size + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed3D( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if i_layer < self.num_layers - 1 else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + + # add a norm layer for each output + self.norm = norm_layer(self.num_features) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1: + self.pos_drop.eval() + for i in range(0, self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def inflate_weights(self, logger): + """Inflate the swin2d parameters to swin3d. + + The differences between swin3d and swin2d mainly lie in an extra + axis. To utilize the pretrained parameters in 2d model, + the weight of swin2d models should be inflated to fit in the shapes of + the 3d counterpart. + + Args: + logger (logging.Logger): The logger used to print + debugging infomation. 
+ """ + checkpoint = torch.load(self.pretrained, map_location='cpu') + state_dict = checkpoint['model'] + + # delete relative_position_index since we always re-init it + relative_position_index_keys = [ + k for k in state_dict.keys() if 'relative_position_index' in k + ] + for k in relative_position_index_keys: + del state_dict[k] + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in state_dict.keys() if 'attn_mask' in k] + for k in attn_mask_keys: + del state_dict[k] + + state_dict['patch_embed.proj.weight'] = state_dict[ + 'patch_embed.proj.weight'].unsqueeze(2).repeat( + 1, 1, self.patch_size[0], 1, 1) / self.patch_size[0] + + # bicubic interpolate relative_position_bias_table if not match + relative_position_bias_table_keys = [ + k for k in state_dict.keys() if 'relative_position_bias_table' in k + ] + for k in relative_position_bias_table_keys: + relative_position_bias_table_pretrained = state_dict[k] + relative_position_bias_table_current = self.state_dict()[k] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + L2 = (2 * self.window_size[1] - 1) * (2 * self.window_size[2] - 1) + wd = self.window_size[0] + if nH1 != nH2: + logger.warning(f'Error in loading {k}, passing') + else: + if L1 != L2: + S1 = int(L1**0.5) + relative_position_bias_table_pretrained_resized = torch.nn.functional.interpolate( + relative_position_bias_table_pretrained.permute( + 1, 0).view(1, nH1, S1, S1), + size=(2 * self.window_size[1] - 1, + 2 * self.window_size[2] - 1), + mode='bicubic') + relative_position_bias_table_pretrained = relative_position_bias_table_pretrained_resized.view( + nH2, L2).permute(1, 0) + state_dict[k] = relative_position_bias_table_pretrained.repeat( + 2 * wd - 1, 1) + + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + logger.info(f"=> loaded successfully '{self.pretrained}'") + del checkpoint + torch.cuda.empty_cache() + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x.contiguous()) + + x = rearrange(x, 'n c d h w -> n d h w c') + x = self.norm(x) + x = rearrange(x, 'n d h w c -> n c d h w') + + return x + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer3D, self).train(mode) + self._freeze_stages() diff --git a/modelscope/outputs.py b/modelscope/outputs.py index a49ddacf..fbe15646 100644 --- a/modelscope/outputs.py +++ b/modelscope/outputs.py @@ -417,6 +417,12 @@ TASK_OUTPUTS = { # } Tasks.video_summarization: [OutputKeys.OUTPUT], + # referring video object segmentation result for a single video + # { + # "masks": [np.array # 2D array with shape [height, width]] + # } + Tasks.referring_video_object_segmentation: [OutputKeys.MASKS], + # ============ nlp tasks =================== # text classification result for single sample diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index 174d10b1..8098bdec 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -202,6 +202,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.face_emotion: (Pipelines.face_emotion, 'damo/cv_face-emotion'), Tasks.product_segmentation: (Pipelines.product_segmentation, 'damo/cv_F3Net_product-segmentation'), + Tasks.referring_video_object_segmentation: + (Pipelines.referring_video_object_segmentation, + 'damo/cv_swin-t_referring_video-object-segmentation'), } diff --git 
a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py
index f84f5fe5..97cd8761 100644
--- a/modelscope/pipelines/cv/__init__.py
+++ b/modelscope/pipelines/cv/__init__.py
@@ -58,6 +58,7 @@ if TYPE_CHECKING:
     from .facial_expression_recognition_pipeline import FacialExpressionRecognitionPipeline
     from .mtcnn_face_detection_pipeline import MtcnnFaceDetectionPipelin
     from .hand_static_pipeline import HandStaticPipeline
+    from .referring_video_object_segmentation_pipeline import ReferringVideoObjectSegmentationPipeline
 
 else:
     _import_structure = {
@@ -128,6 +129,9 @@ else:
         ['FacialExpressionRecognitionPipeline'],
         'mtcnn_face_detection_pipeline': ['MtcnnFaceDetectionPipeline'],
         'hand_static_pipeline': ['HandStaticPipeline'],
+        'referring_video_object_segmentation_pipeline': [
+            'ReferringVideoObjectSegmentationPipeline'
+        ],
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
new file mode 100644
index 00000000..d264b386
--- /dev/null
+++ b/modelscope/pipelines/cv/referring_video_object_segmentation_pipeline.py
@@ -0,0 +1,193 @@
+# The implementation here is modified based on MTTR,
+# originally Apache 2.0 License and publicly available at https://github.com/mttr2021/MTTR
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from typing import Any, Dict
+
+import numpy as np
+import torch
+import torchvision
+import torchvision.transforms.functional as F
+from einops import rearrange
+from moviepy.editor import AudioFileClip, ImageSequenceClip, VideoFileClip
+from PIL import Image, ImageDraw, ImageFont, ImageOps
+from tqdm import tqdm
+
+from modelscope.metainfo import Pipelines
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines.base import Input, Pipeline
+from modelscope.pipelines.builder import PIPELINES
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@PIPELINES.register_module(
+    Tasks.referring_video_object_segmentation,
+    module_name=Pipelines.referring_video_object_segmentation)
+class ReferringVideoObjectSegmentationPipeline(Pipeline):
+
+    def __init__(self, model: str, **kwargs):
+        """Use `model` to create a referring video object segmentation pipeline for prediction.
+
+        Args:
+            model: model id on modelscope hub
+        """
+        _device = kwargs.pop('device', 'gpu')
+        if torch.cuda.is_available() and _device == 'gpu':
+            self.device = 'gpu'
+        else:
+            self.device = 'cpu'
+        super().__init__(model=model, device=self.device, **kwargs)
+
+        logger.info('Load model done!')
+
+    def preprocess(self, input: Input) -> Dict[str, Any]:
+        """Extract the requested subclip and split it into overlapping frame windows.
+
+        Args:
+            input: a tuple of (video path, text queries, start time, end time)
+
+        """
+        assert isinstance(input, tuple) and len(
+            input
+        ) == 4, 'error - input type must be tuple and input length must be 4'
+        self.input_video_pth, text_queries, start_pt, end_pt = input
+
+        assert 0 < end_pt - start_pt <= 10, 'error - the subclip length must be 0-10 seconds long'
+        assert 1 <= len(
+            text_queries) <= 2, 'error - 1-2 input text queries are expected'
+
+        # extract the relevant subclip:
+        self.input_clip_pth = 'input_clip.mp4'
+        with VideoFileClip(self.input_video_pth) as video:
+            subclip = video.subclip(start_pt, end_pt)
+            subclip.write_videofile(self.input_clip_pth)
+
+        self.window_length = 24  # length of window during inference
+        self.window_overlap = 6  # overlap (in frames) between consecutive windows
+
+        self.video, audio, self.meta = torchvision.io.read_video(
filename=self.input_clip_pth) + self.video = rearrange(self.video, 't h w c -> t c h w') + + input_video = F.resize(self.video, size=360, max_size=640) + if self.device_name == 'gpu': + input_video = input_video.cuda() + + input_video = input_video.to(torch.float).div_(255) + input_video = F.normalize( + input_video, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + video_metadata = { + 'resized_frame_size': input_video.shape[-2:], + 'original_frame_size': self.video.shape[-2:] + } + + # partition the clip into overlapping windows of frames: + windows = [ + input_video[i:i + self.window_length] + for i in range(0, len(input_video), self.window_length + - self.window_overlap) + ] + # clean up the text queries: + self.text_queries = [' '.join(q.lower().split()) for q in text_queries] + + result = { + 'text_queries': self.text_queries, + 'windows': windows, + 'video_metadata': video_metadata + } + + return result + + def forward(self, input: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + with torch.no_grad(): + pred_masks_per_query = [] + t, _, h, w = self.video.shape + for text_query in tqdm(input['text_queries'], desc='text queries'): + pred_masks = torch.zeros(size=(t, 1, h, w)) + for i, window in enumerate( + tqdm(input['windows'], desc='windows')): + + window_masks = self.model.inference( + window=window, + text_query=text_query, + metadata=input['video_metadata']) + + win_start_idx = i * ( + self.window_length - self.window_overlap) + pred_masks[win_start_idx:win_start_idx + + self.window_length] = window_masks + pred_masks_per_query.append(pred_masks) + return pred_masks_per_query + + def postprocess(self, inputs) -> Dict[str, Any]: + if self.model.cfg.pipeline.save_masked_video: + # RGB colors for instance masks: + light_blue = (41, 171, 226) + purple = (237, 30, 121) + dark_green = (35, 161, 90) + orange = (255, 148, 59) + colors = np.array([light_blue, purple, dark_green, orange]) + + # width (in pixels) of the black strip above the video on which the text queries will be displayed: + text_border_height_per_query = 36 + + video_np = rearrange(self.video, + 't c h w -> t h w c').numpy() / 255.0 + + # del video + pred_masks_per_frame = rearrange( + torch.stack(inputs), 'q t 1 h w -> t q h w').numpy() + masked_video = [] + for vid_frame, frame_masks in tqdm( + zip(video_np, pred_masks_per_frame), + total=len(video_np), + desc='applying masks...'): + # apply the masks: + for inst_mask, color in zip(frame_masks, colors): + vid_frame = apply_mask(vid_frame, inst_mask, color / 255.0) + vid_frame = Image.fromarray((vid_frame * 255).astype(np.uint8)) + # visualize the text queries: + vid_frame = ImageOps.expand( + vid_frame, + border=(0, len(self.text_queries) + * text_border_height_per_query, 0, 0)) + W, H = vid_frame.size + draw = ImageDraw.Draw(vid_frame) + font = ImageFont.truetype(font='DejaVuSansMono.ttf', size=30) + for i, (text_query, color) in enumerate( + zip(self.text_queries, colors), start=1): + w, h = draw.textsize(text_query, font=font) + draw.text(((W - w) / 2, + (text_border_height_per_query * i) - h - 3), + text_query, + fill=tuple(color) + (255, ), + font=font) + masked_video.append(np.array(vid_frame)) + print(type(vid_frame)) + print(type(masked_video[0])) + print(masked_video[0].shape) + # generate and save the output clip: + + assert self.model.cfg.pipeline.output_path + output_clip_path = self.model.cfg.pipeline.output_path + clip = ImageSequenceClip( + sequence=masked_video, fps=self.meta['video_fps']) + clip = 
clip.set_audio(AudioFileClip(self.input_clip_pth)) + clip.write_videofile( + output_clip_path, fps=self.meta['video_fps'], audio=True) + del masked_video + + result = {OutputKeys.MASKS: inputs} + return result + + +def apply_mask(image, mask, color, transparency=0.7): + mask = mask[..., np.newaxis].repeat(repeats=3, axis=2) + mask = mask * transparency + color_matrix = np.ones(image.shape, dtype=np.float) * color + out_image = color_matrix * mask + image * (1.0 - mask) + return out_image diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index 0eb369da..6ba58c19 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -80,6 +80,9 @@ class CVTasks(object): virtual_try_on = 'virtual-try-on' movie_scene_segmentation = 'movie-scene-segmentation' + # video segmentation + referring_video_object_segmentation = 'referring-video-object-segmentation' + # video editing video_inpainting = 'video-inpainting' diff --git a/requirements/cv.txt b/requirements/cv.txt index eb38beb1..d23fab3a 100644 --- a/requirements/cv.txt +++ b/requirements/cv.txt @@ -1,4 +1,5 @@ albumentations>=1.0.3 +av>=9.2.0 easydict fairscale>=0.4.1 fastai>=1.0.51 @@ -14,6 +15,7 @@ lpips ml_collections mmcls>=0.21.0 mmdet>=2.25.0 +moviepy>=1.0.3 networkx>=2.5 numba onnxruntime>=1.10 diff --git a/tests/pipelines/test_referring_video_object_segmentation.py b/tests/pipelines/test_referring_video_object_segmentation.py new file mode 100644 index 00000000..3e81d9c3 --- /dev/null +++ b/tests/pipelines/test_referring_video_object_segmentation.py @@ -0,0 +1,56 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ReferringVideoObjectSegmentationTest(unittest.TestCase, + DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.referring_video_object_segmentation + self.model_id = 'damo/cv_swin-t_referring_video-object-segmentation' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_referring_video_object_segmentation(self): + input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' + text_queries = [ + 'guy in black performing tricks on a bike', + 'a black bike used to perform tricks' + ] + start_pt, end_pt = 4, 14 + input_tuple = (input_location, text_queries, start_pt, end_pt) + pp = pipeline( + Tasks.referring_video_object_segmentation, model=self.model_id) + result = pp(input_tuple) + if result: + print(result) + else: + raise ValueError('process error') + + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_referring_video_object_segmentation_with_default_task(self): + input_location = 'data/test/videos/referring_video_object_segmentation_test_video.mp4' + text_queries = [ + 'guy in black performing tricks on a bike', + 'a black bike used to perform tricks' + ] + start_pt, end_pt = 4, 14 + input_tuple = (input_location, text_queries, start_pt, end_pt) + pp = pipeline(Tasks.referring_video_object_segmentation) + result = pp(input_tuple) + if result: + print(result) + else: + raise ValueError('process error') + + @unittest.skip('demo compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main()
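For reference, the pipeline's forward pass above stitches per-window predictions back into a full-length mask tensor: window i starts at i * (window_length - window_overlap), so later windows overwrite the overlapping tail of earlier ones and every frame still receives a prediction. A small sketch of that covering, where the 100-frame clip length is a hypothetical value:

    window_length, window_overlap = 24, 6
    t = 100  # hypothetical number of frames in the subclip

    starts = range(0, t, window_length - window_overlap)
    windows = [(s, min(s + window_length, t)) for s in starts]
    print(windows)
    # [(0, 24), (18, 42), (36, 60), (54, 78), (72, 96), (90, 100)]

    covered = set()
    for s, e in windows:
        covered.update(range(s, e))
    assert covered == set(range(t))  # every frame gets a prediction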