diff --git a/data/test b/data/test index 77a9ad7f..860764da 160000 --- a/data/test +++ b/data/test @@ -1 +1 @@ -Subproject commit 77a9ad7fb3cc4bcc99f4a33822c813e7ab473ba0 +Subproject commit 860764da23420f08fa551eccc053719b8f1a4b42 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index d7487f84..2eed9e2b 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -52,6 +52,7 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' newcrfs_depth_estimation = 'newcrfs-depth-estimation' + omnidata_normal_estimation = 'omnidata-normal-estimation' panovit_layout_estimation = 'panovit-layout-estimation' unifuse_depth_estimation = 'unifuse-depth-estimation' s2net_depth_estimation = 's2net-depth-estimation' @@ -388,6 +389,7 @@ class Pipelines(object): language_guided_video_summarization = 'clip-it-video-summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_depth_estimation = 'image-depth-estimation' + image_normal_estimation = 'image-normal-estimation' indoor_layout_estimation = 'indoor-layout-estimation' video_depth_estimation = 'video-depth-estimation' panorama_depth_estimation = 'panorama-depth-estimation' @@ -783,6 +785,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_depth_estimation: (Pipelines.image_depth_estimation, 'damo/cv_newcrfs_image-depth-estimation_indoor'), + Tasks.image_normal_estimation: + (Pipelines.image_normal_estimation, + 'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'), Tasks.indoor_layout_estimation: (Pipelines.indoor_layout_estimation, 'damo/cv_panovit_indoor-layout-estimation'), @@ -820,9 +825,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { 'damo/cv_convnextTiny_ocr-recognition-general_damo'), Tasks.skin_retouching: (Pipelines.skin_retouching, 'damo/cv_unet_skin-retouching'), - Tasks.faq_question_answering: - (Pipelines.faq_question_answering, - 'damo/nlp_structbert_faq-question-answering_chinese-base'), + 
Tasks.faq_question_answering: ( + Pipelines.faq_question_answering, + 'damo/nlp_structbert_faq-question-answering_chinese-base'), Tasks.crowd_counting: (Pipelines.crowd_counting, 'damo/cv_hrnet_crowd-counting_dcanet'), Tasks.video_single_object_tracking: ( diff --git a/modelscope/models/cv/image_normal_estimation/__init__.py b/modelscope/models/cv/image_normal_estimation/__init__.py new file mode 100644 index 00000000..9551a384 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .omnidata_model import OmnidataNormalEstimation + +else: + _import_structure = { + 'omnidata_model': ['OmnidataNormalEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_normal_estimation/modules/__init__.py b/modelscope/models/cv/image_normal_estimation/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/__init__.py b/modelscope/models/cv/image_normal_estimation/modules/midas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py b/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py new file mode 100644 index 00000000..41564c78 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py @@ -0,0 +1,20 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch + + +class BaseModel(torch.nn.Module): + + def load(self, path): + """Load model from file. 
+ + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if 'optimizer' in parameters: + parameters = parameters['model'] + + self.load_state_dict(parameters) diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py b/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py new file mode 100644 index 00000000..e0a30733 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py @@ -0,0 +1,395 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch +import torch.nn as nn + +from .vit import (_make_pretrained_vitb16_384, _make_pretrained_vitb_rn50_384, + _make_pretrained_vitl16_384, forward_vit) + + +def _make_encoder( + backbone, + features, + use_pretrained, + groups=1, + expand=False, + exportable=True, + hooks=None, + use_vit_only=False, + use_readout='ignore', +): + if backbone == 'vitl16_384': + pretrained = _make_pretrained_vitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, + expand=expand) # ViT-L/16 - 85.0% Top1 (backbone) + elif backbone == 'vitb_rn50_384': + pretrained = _make_pretrained_vitb_rn50_384( + use_pretrained, + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) + scratch = _make_scratch( + [256, 512, 768, 768], features, groups=groups, + expand=expand) # ViT-H/16 - 85.0% Top1 (backbone) + elif backbone == 'vitb16_384': + pretrained = _make_pretrained_vitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, + expand=expand) # ViT-B/16 - 84.6% Top1 (backbone) + elif backbone == 'resnext101_wsl': + pretrained = _make_pretrained_resnext101_wsl(use_pretrained) + scratch = _make_scratch([256, 512, 1024, 2048], + features, + groups=groups, + 
expand=expand) # efficientnet_lite3 + elif backbone == 'efficientnet_lite3': + pretrained = _make_pretrained_efficientnet_lite3( + use_pretrained, exportable=exportable) + scratch = _make_scratch([32, 48, 136, 384], + features, + groups=groups, + expand=expand) # efficientnet_lite3 + else: + print(f"Backbone '{backbone}' not implemented") + assert False + + return pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand is True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + + return scratch + + +def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): + efficientnet = torch.hub.load( + 'rwightman/gen-efficientnet-pytorch', + 'tf_efficientnet_lite3', + pretrained=use_pretrained, + exportable=exportable) + return _make_efficientnet_backbone(efficientnet) + + +def _make_efficientnet_backbone(effnet): + pretrained = nn.Module() + + pretrained.layer1 = nn.Sequential(effnet.conv_stem, effnet.bn1, + effnet.act1, *effnet.blocks[0:2]) + pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) + pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) + pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) + + return pretrained + + +def _make_resnet_backbone(resnet): + pretrained = nn.Module() + 
pretrained.layer1 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, + resnet.maxpool, resnet.layer1) + + pretrained.layer2 = resnet.layer2 + pretrained.layer3 = resnet.layer3 + pretrained.layer4 = resnet.layer4 + + return pretrained + + +def _make_pretrained_resnext101_wsl(use_pretrained): + resnet = torch.hub.load('facebookresearch/WSL-Images', + 'resnext101_32x8d_wsl') + return _make_resnet_backbone(resnet) + + +class Interpolate(nn.Module): + """Interpolation module. + """ + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features): + """Init. 
+ + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode='bilinear', align_corners=True) + + return output + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=self.groups) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=self.groups) + + if self.bn is True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn is True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn is True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + # return out + x + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block. + """ + + def __init__(self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True): + """Init. 
+ + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand is True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners) + + output = self.out_conv(output) + + return output diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py b/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py new file mode 100644 index 00000000..af799327 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py @@ -0,0 +1,108 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .base_model import BaseModel +from .blocks import (FeatureFusionBlock, FeatureFusionBlock_custom, + Interpolate, _make_encoder, forward_vit) + + +def _make_fusion_block(features, use_bn): + return FeatureFusionBlock_custom( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + ) + + +class DPT(BaseModel): + + def __init__( + self, + head, + features=256, + backbone='vitb_rn50_384', 
+ readout='project', + channels_last=False, + use_bn=False, + ): + + super(DPT, self).__init__() + + self.channels_last = channels_last + + hooks = { + 'vitb_rn50_384': [0, 1, 8, 11], + 'vitb16_384': [2, 5, 8, 11], + 'vitl16_384': [5, 11, 17, 23], + } + + # Instantiate backbone and reassemble blocks + self.pretrained, self.scratch = _make_encoder( + backbone, + features, + False, # Set to true of you want to train from scratch, uses ImageNet weights + groups=1, + expand=False, + exportable=False, + hooks=hooks[backbone], + use_readout=readout, + ) + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.scratch.output_conv = head + + def forward(self, x): + if self.channels_last is True: + x.contiguous(memory_format=torch.channels_last) + + layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv(path_1) + + return out + + +class DPTDepthModel(DPT): + + def __init__(self, path=None, non_negative=True, num_channels=1, **kwargs): + features = kwargs['features'] if 'features' in kwargs else 256 + + head = nn.Sequential( + nn.Conv2d( + features, features // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode='bilinear', align_corners=True), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, num_channels, kernel_size=1, stride=1, padding=0), + 
nn.ReLU(True) if non_negative else nn.Identity(), + nn.Identity(), + ) + + super().__init__(head, **kwargs) + + if path is not None: + self.load(path) + + def forward(self, x): + return super().forward(x).squeeze(dim=1) diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py b/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py new file mode 100644 index 00000000..bb8ba9f3 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py @@ -0,0 +1,517 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import math +import types + +import timm +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Slice(nn.Module): + + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(nn.Module): + + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index:] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential( + nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) + features = torch.cat((x[:, self.start_index:], readout), -1) + + return self.project(features) + + +class Transpose(nn.Module): + + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +def forward_vit(pretrained, x): + b, c, h, w = x.shape + + _ = 
pretrained.model.forward_flex(x) + + layer_1 = pretrained.activations['1'] + layer_2 = pretrained.activations['2'] + layer_3 = pretrained.activations['3'] + layer_4 = pretrained.activations['4'] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size([ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ]), + )) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)]( + layer_1) + layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)]( + layer_2) + layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)]( + layer_3) + layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)]( + layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, :self.start_index], + posemb[0, self.start_index:], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, + -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=(gs_h, gs_w), mode='bilinear') + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1], + w // self.patch_size[0]) + + B = x.shape[0] + + if hasattr(self.patch_embed, 'backbone'): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, 
tuple)): + x = x[ + -1] # last feature if backbone outputs list/tuple of features + + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, 'dist_token', None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + return x + + +activations = {} + + +def get_activation(name): + + def hook(model, input, output): + activations[name] = output + + return hook + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == 'ignore': + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == 'add': + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == 'project': + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout='ignore', + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, + 
start_index) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. 
+ pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained + + +def _make_pretrained_vitl16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model('vit_large_patch16_384', pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_vitb16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model('vit_base_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout) + + +def _make_pretrained_deitb16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model( + 'vit_deit_base_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout) + + +def _make_pretrained_deitb16_distil_384(pretrained, + use_readout='ignore', + hooks=None): + model = timm.create_model( + 'vit_deit_base_distilled_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout, + start_index=2, + ) + + +def _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=[0, 1, 8, 11], + vit_features=768, + use_vit_only=False, + use_readout='ignore', + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + + if use_vit_only: + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + 
pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + else: + pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( + get_activation('1')) + pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( + get_activation('2')) + + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, + start_index) + + if use_vit_only: + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + else: + pretrained.act_postprocess1 = nn.Sequential(nn.Identity(), + nn.Identity(), + nn.Identity()) + pretrained.act_postprocess2 = nn.Sequential(nn.Identity(), + nn.Identity(), + nn.Identity()) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + 
Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained + + +def _make_pretrained_vitb_rn50_384(pretrained, + use_readout='ignore', + hooks=None, + use_vit_only=False): + model = timm.create_model('vit_base_resnet50_384', pretrained=pretrained) + + hooks = [0, 1, 8, 11] if hooks is None else hooks + return _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) diff --git a/modelscope/models/cv/image_normal_estimation/omnidata_model.py b/modelscope/models/cv/image_normal_estimation/omnidata_model.py new file mode 100644 index 00000000..35e89c1c --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/omnidata_model.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+# Model: Omnidata: A Scalable Pipeline for Making Multi-Task Mid-Level Vision Datasets from 3D Scans +# Paper link: https://arxiv.org/pdf/2110.04994.pdf +import os.path as osp + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.image_normal_estimation.modules.midas.dpt_depth import \ + DPTDepthModel +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.image_normal_estimation, + module_name=Models.omnidata_normal_estimation) +class OmnidataNormalEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + self.model = DPTDepthModel( + backbone='vitb_rn50_384', num_channels=3) # DPT Hybrid + # checkpoint = torch.load(pretrained_weights_path, map_location=map_location) + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoint = torch.load(model_path, map_location='cpu') + if 'state_dict' in checkpoint: + state_dict = {} + for k, v in checkpoint['state_dict'].items(): + state_dict[k[6:]] = v + else: + state_dict = checkpoint + self.model.load_state_dict(state_dict) + self.model.eval() + + def forward(self, inputs): + return self.model(inputs['imgs']).clamp(min=0, max=1) + + def postprocess(self, inputs): + normal_result = inputs.flip(1) + results = {OutputKeys.NORMALS: normal_result} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index 0b01e69e..1f9abc37 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -25,6 +25,8 @@ class OutputKeys(object): MASKS = 'masks' DEPTHS = 'depths' DEPTHS_COLOR = 'depths_color' + NORMALS = 'normals' + NORMALS_COLOR = 
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Union

import cv2
import numpy as np
import PIL
import torch

from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
    Tasks.image_normal_estimation,
    module_name=Pipelines.image_normal_estimation)
class ImageNormalEstimationPipeline(Pipeline):
    r"""Image Normal Estimation Pipeline.

    Predicts a per-pixel surface-normal map for a single RGB image.

    Examples:

    >>> from modelscope.pipelines import pipeline
    >>> estimator = pipeline(
    >>>     Tasks.image_normal_estimation,
    >>>     model='Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal')
    >>> result = estimator(
    >>>     'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_normal_estimation.jpg')
    >>> result[OutputKeys.NORMALS]        # float32 array, shape (3, H, W), values in [0, 1]
    >>> result[OutputKeys.NORMALS_COLOR]  # uint8 array, shape (H, W, 3), color visualization
    """

    # The network expects a fixed input resolution.
    _INPUT_H = 384
    _INPUT_W = 384

    def __init__(self, model: str, **kwargs):
        """
        use `model` to create a image normal estimation pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)

        logger.info('normal estimation model, pipeline init')

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Load the image, resize to the model resolution and normalize.

        Args:
            input: Image path/URL/ndarray accepted by ``LoadImage``.

        Returns:
            Dict with key 'imgs': float array of shape (1, 3, H, W) in [0, 1].
        """
        img = LoadImage.convert_to_ndarray(input).astype(np.float32)
        # cv2.resize takes dsize as a (width, height) tuple.
        img = cv2.resize(img, (self._INPUT_W, self._INPUT_H))
        # HWC -> CHW, scale pixel values to [0, 1], then add a batch axis.
        img = img.transpose(2, 0, 1) / 255.0
        imgs = img[None, ...]
        data = {'imgs': imgs}

        return data

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Run model inference on the preprocessed batch."""
        results = self.model.inference(input)
        return results

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the model output into numpy arrays plus a visualization.

        Returns:
            Dict with OutputKeys.NORMALS (float normals, CHW) and
            OutputKeys.NORMALS_COLOR (uint8 HWC color image).
        """
        results = self.model.postprocess(inputs)
        normals = results[OutputKeys.NORMALS]
        if isinstance(normals, torch.Tensor):
            # Drop the batch axis and move to host memory.
            normals = normals.detach().cpu().squeeze().numpy()
        # CHW float in [0, 1] -> HWC uint8 image for easy saving/display.
        normals_color = (np.transpose(normals,
                                      (1, 2, 0)) * 255).astype(np.uint8)
        outputs = {
            OutputKeys.NORMALS: normals,
            OutputKeys.NORMALS_COLOR: normals_color
        }

        return outputs

import unittest

import cv2
import numpy as np

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level


class ImageNormalEstimationTest(unittest.TestCase):
    """Smoke test for the image normal estimation pipeline."""

    def setUp(self) -> None:
        # Use the shared Tasks constant instead of duplicating the literal,
        # so a rename in constant.py cannot silently desync this test.
        self.task = Tasks.image_normal_estimation
        self.model_id = 'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_image_normal_estimation(self):
        """End-to-end run on a sample image; writes the color visualization."""
        input_location = 'data/test/images/image_normal_estimation.jpg'
        estimator = pipeline(
            Tasks.image_normal_estimation, model=self.model_id)
        result = estimator(input_location)
        normals_vis = result[OutputKeys.NORMALS_COLOR]
        cv2.imwrite('result.jpg', normals_vis)

        print('test_image_normal_estimation DONE')


if __name__ == '__main__':
    unittest.main()