Feature/image normal estimation (#683)

* image_normal_estimation

* image_normal_estimation

* update according to pr review

* update submodule data test

---------

Co-authored-by: Weihao Yuan <qianmu.ywh@alibaba-inc.com>
This commit is contained in:
Weihao Yuan
2024-01-09 11:53:02 +08:00
committed by GitHub
parent 2d528ed482
commit 105247140c
15 changed files with 1322 additions and 4 deletions

View File

@@ -52,6 +52,7 @@ class Models(object):
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
text_driven_segmentation = 'text-driven-segmentation'
newcrfs_depth_estimation = 'newcrfs-depth-estimation'
omnidata_normal_estimation = 'omnidata-normal-estimation'
panovit_layout_estimation = 'panovit-layout-estimation'
unifuse_depth_estimation = 'unifuse-depth-estimation'
s2net_depth_estimation = 's2net-depth-estimation'
@@ -388,6 +389,7 @@ class Pipelines(object):
language_guided_video_summarization = 'clip-it-video-summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_depth_estimation = 'image-depth-estimation'
image_normal_estimation = 'image-normal-estimation'
indoor_layout_estimation = 'indoor-layout-estimation'
video_depth_estimation = 'video-depth-estimation'
panorama_depth_estimation = 'panorama-depth-estimation'
@@ -783,6 +785,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
Tasks.image_depth_estimation:
(Pipelines.image_depth_estimation,
'damo/cv_newcrfs_image-depth-estimation_indoor'),
Tasks.image_normal_estimation:
(Pipelines.image_normal_estimation,
'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'),
Tasks.indoor_layout_estimation:
(Pipelines.indoor_layout_estimation,
'damo/cv_panovit_indoor-layout-estimation'),
@@ -820,9 +825,9 @@ DEFAULT_MODEL_FOR_PIPELINE = {
'damo/cv_convnextTiny_ocr-recognition-general_damo'),
Tasks.skin_retouching: (Pipelines.skin_retouching,
'damo/cv_unet_skin-retouching'),
Tasks.faq_question_answering:
(Pipelines.faq_question_answering,
'damo/nlp_structbert_faq-question-answering_chinese-base'),
Tasks.faq_question_answering: (
Pipelines.faq_question_answering,
'damo/nlp_structbert_faq-question-answering_chinese-base'),
Tasks.crowd_counting: (Pipelines.crowd_counting,
'damo/cv_hrnet_crowd-counting_dcanet'),
Tasks.video_single_object_tracking: (

View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
# Static type checkers see the real import below; at runtime this module is
# replaced in sys.modules by a LazyImportModule so that heavy dependencies
# (torch, timm) are only imported on first attribute access.
if TYPE_CHECKING:
    from .omnidata_model import OmnidataNormalEstimation
else:
    # Maps submodule name -> public names it provides, for lazy resolution.
    _import_structure = {
        'omnidata_model': ['OmnidataNormalEstimation'],
    }
    import sys
    # NOTE: replacing sys.modules[__name__] must happen at import time,
    # before any caller touches attributes of this package.
    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

View File

@@ -0,0 +1,20 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import torch
class BaseModel(torch.nn.Module):
    """Minimal base class adding checkpoint loading to ``nn.Module``."""

    def load(self, path):
        """Load model weights from a checkpoint file.

        Args:
            path (str): file path
        """
        checkpoint = torch.load(path, map_location=torch.device('cpu'))
        # Full training checkpoints carry an optimizer state and nest the
        # weights under 'model'; bare checkpoints are the state dict itself.
        state = checkpoint['model'] if 'optimizer' in checkpoint else checkpoint
        self.load_state_dict(state)

View File

@@ -0,0 +1,395 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import torch
import torch.nn as nn
from .vit import (_make_pretrained_vitb16_384, _make_pretrained_vitb_rn50_384,
_make_pretrained_vitl16_384, forward_vit)
def _make_encoder(
backbone,
features,
use_pretrained,
groups=1,
expand=False,
exportable=True,
hooks=None,
use_vit_only=False,
use_readout='ignore',
):
if backbone == 'vitl16_384':
pretrained = _make_pretrained_vitl16_384(
use_pretrained, hooks=hooks, use_readout=use_readout)
scratch = _make_scratch(
[256, 512, 1024, 1024], features, groups=groups,
expand=expand) # ViT-L/16 - 85.0% Top1 (backbone)
elif backbone == 'vitb_rn50_384':
pretrained = _make_pretrained_vitb_rn50_384(
use_pretrained,
hooks=hooks,
use_vit_only=use_vit_only,
use_readout=use_readout,
)
scratch = _make_scratch(
[256, 512, 768, 768], features, groups=groups,
expand=expand) # ViT-H/16 - 85.0% Top1 (backbone)
elif backbone == 'vitb16_384':
pretrained = _make_pretrained_vitb16_384(
use_pretrained, hooks=hooks, use_readout=use_readout)
scratch = _make_scratch(
[96, 192, 384, 768], features, groups=groups,
expand=expand) # ViT-B/16 - 84.6% Top1 (backbone)
elif backbone == 'resnext101_wsl':
pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
scratch = _make_scratch([256, 512, 1024, 2048],
features,
groups=groups,
expand=expand) # efficientnet_lite3
elif backbone == 'efficientnet_lite3':
pretrained = _make_pretrained_efficientnet_lite3(
use_pretrained, exportable=exportable)
scratch = _make_scratch([32, 48, 136, 384],
features,
groups=groups,
expand=expand) # efficientnet_lite3
else:
print(f"Backbone '{backbone}' not implemented")
assert False
return pretrained, scratch
def _make_scratch(in_shape, out_shape, groups=1, expand=False):
scratch = nn.Module()
out_shape1 = out_shape
out_shape2 = out_shape
out_shape3 = out_shape
out_shape4 = out_shape
if expand is True:
out_shape1 = out_shape
out_shape2 = out_shape * 2
out_shape3 = out_shape * 4
out_shape4 = out_shape * 8
scratch.layer1_rn = nn.Conv2d(
in_shape[0],
out_shape1,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
scratch.layer2_rn = nn.Conv2d(
in_shape[1],
out_shape2,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
scratch.layer3_rn = nn.Conv2d(
in_shape[2],
out_shape3,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
scratch.layer4_rn = nn.Conv2d(
in_shape[3],
out_shape4,
kernel_size=3,
stride=1,
padding=1,
bias=False,
groups=groups)
return scratch
def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
    """Fetch an EfficientNet-Lite3 via torch.hub and regroup it into stages."""
    effnet = torch.hub.load(
        'rwightman/gen-efficientnet-pytorch',
        'tf_efficientnet_lite3',
        pretrained=use_pretrained,
        exportable=exportable)
    return _make_efficientnet_backbone(effnet)
def _make_efficientnet_backbone(effnet):
pretrained = nn.Module()
pretrained.layer1 = nn.Sequential(effnet.conv_stem, effnet.bn1,
effnet.act1, *effnet.blocks[0:2])
pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
return pretrained
def _make_resnet_backbone(resnet):
pretrained = nn.Module()
pretrained.layer1 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu,
resnet.maxpool, resnet.layer1)
pretrained.layer2 = resnet.layer2
pretrained.layer3 = resnet.layer3
pretrained.layer4 = resnet.layer4
return pretrained
def _make_pretrained_resnext101_wsl(use_pretrained):
    """Fetch ResNeXt101-WSL via torch.hub and regroup it into stages."""
    # NOTE: torch.hub always downloads weights here; `use_pretrained` is
    # accepted for API symmetry but not forwarded by the hub entry point.
    backbone = torch.hub.load('facebookresearch/WSL-Images',
                              'resnext101_32x8d_wsl')
    return _make_resnet_backbone(backbone)
class Interpolate(nn.Module):
    """Resample a tensor by a fixed scale factor (module wrapper around
    ``nn.functional.interpolate`` so it can live inside a Sequential).
    """

    def __init__(self, scale_factor, mode, align_corners=False):
        """Init.

        Args:
            scale_factor (float): scaling
            mode (str): interpolation mode
            align_corners (bool): forwarded to ``interpolate``.
        """
        super(Interpolate, self).__init__()
        self.interp = nn.functional.interpolate
        self.scale_factor = scale_factor
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Return ``x`` resampled by ``scale_factor``."""
        return self.interp(
            x,
            scale_factor=self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners)
class ResidualConvUnit(nn.Module):
    """Residual block: two 3x3 convs with ReLU pre-activations and a skip.

    NOTE: the ReLU is in-place, so the skip connection adds relu(x),
    matching the original MiDaS behavior.
    """

    def __init__(self, features):
        """Init.

        Args:
            features (int): channel count, preserved throughout.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True)
        self.conv2 = nn.Conv2d(
            features, features, kernel_size=3, stride=1, padding=1, bias=True)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply relu-conv-relu-conv and add the (relu'd) input back."""
        residual = self.conv1(self.relu(x))
        residual = self.conv2(self.relu(residual))
        return residual + x
class FeatureFusionBlock(nn.Module):
    """Fuse one or two feature maps and upsample the result by 2x."""

    def __init__(self, features):
        """Init.

        Args:
            features (int): number of features
        """
        super(FeatureFusionBlock, self).__init__()
        self.resConfUnit1 = ResidualConvUnit(features)
        self.resConfUnit2 = ResidualConvUnit(features)

    def forward(self, *xs):
        """Fuse the inputs; returns a bilinearly upsampled tensor.

        Returns:
            tensor: output
        """
        fused = xs[0]
        if len(xs) == 2:
            # In-place accumulation, mirroring the original `output += ...`.
            fused += self.resConfUnit1(xs[1])
        fused = self.resConfUnit2(fused)
        return nn.functional.interpolate(
            fused, scale_factor=2, mode='bilinear', align_corners=True)
class ResidualConvUnit_custom(nn.Module):
    """Residual conv unit with optional BatchNorm and a quantization-friendly
    skip connection (``nn.quantized.FloatFunctional``).
    """

    def __init__(self, features, activation, bn):
        """Init.

        Args:
            features (int): number of channels.
            activation (nn.Module): pre-activation applied before each conv.
            bn (bool): insert BatchNorm after each conv when True.
        """
        super().__init__()
        self.bn = bn
        self.groups = 1
        conv_kwargs = dict(
            kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
        self.conv1 = nn.Conv2d(features, features, **conv_kwargs)
        self.conv2 = nn.Conv2d(features, features, **conv_kwargs)
        if self.bn is True:
            self.bn1 = nn.BatchNorm2d(features)
            self.bn2 = nn.BatchNorm2d(features)
        self.activation = activation
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Apply act-conv(-bn)-act-conv(-bn) and add the input back.

        Returns:
            tensor: output
        """
        out = self.conv1(self.activation(x))
        if self.bn is True:
            out = self.bn1(out)
        out = self.conv2(self.activation(out))
        if self.bn is True:
            out = self.bn2(out)
        if self.groups > 1:
            # NOTE(review): `conv_merge` is never defined and `groups` is
            # fixed to 1 above, so this branch is unreachable as written.
            out = self.conv_merge(out)
        return self.skip_add.add(out, x)
class FeatureFusionBlock_custom(nn.Module):
    """Feature fusion block with quantization-friendly add, optional channel
    expansion, and a 1x1 output projection after 2x upsampling.
    """

    def __init__(self,
                 features,
                 activation,
                 deconv=False,
                 bn=False,
                 expand=False,
                 align_corners=True):
        """Init.

        Args:
            features (int): number of input channels.
            activation (nn.Module): activation for the residual units.
            deconv (bool): stored only; no deconvolution is applied here.
            bn (bool): enable BatchNorm inside the residual units.
            expand (bool): halve the channel count in the output projection.
            align_corners (bool): passed to bilinear upsampling.
        """
        super(FeatureFusionBlock_custom, self).__init__()
        self.deconv = deconv
        self.align_corners = align_corners
        self.groups = 1
        self.expand = expand
        out_features = features // 2 if self.expand is True else features
        self.out_conv = nn.Conv2d(
            features,
            out_features,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=True,
            groups=1)
        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        """Fuse one or two inputs, upsample 2x, and project channels.

        Returns:
            tensor: output
        """
        fused = xs[0]
        if len(xs) == 2:
            fused = self.skip_add.add(fused, self.resConfUnit1(xs[1]))
        fused = self.resConfUnit2(fused)
        fused = nn.functional.interpolate(
            fused,
            scale_factor=2,
            mode='bilinear',
            align_corners=self.align_corners)
        return self.out_conv(fused)

View File

@@ -0,0 +1,108 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import torch
import torch.nn as nn
import torch.nn.functional as F
from .base_model import BaseModel
from .blocks import (FeatureFusionBlock, FeatureFusionBlock_custom,
Interpolate, _make_encoder, forward_vit)
def _make_fusion_block(features, use_bn):
    """Build one decoder fusion block with the DPT default settings."""
    return FeatureFusionBlock_custom(
        features,
        nn.ReLU(False),  # non-inplace ReLU inside the residual units
        deconv=False,
        bn=use_bn,
        expand=False,
        align_corners=True,
    )
class DPT(BaseModel):
    """Dense Prediction Transformer: hooked ViT encoder + RefineNet-style
    fusion decoder + a task-specific output head.
    """

    def __init__(
        self,
        head,
        features=256,
        backbone='vitb_rn50_384',
        readout='project',
        channels_last=False,
        use_bn=False,
    ):
        """Init.

        Args:
            head (nn.Module): prediction head applied after fusion.
            features (int): channel width of the fusion decoder.
            backbone (str): one of the variants listed in ``hooks`` below.
            readout (str): readout-token handling ('ignore'/'add'/'project').
            channels_last (bool): run the forward pass in channels-last layout.
            use_bn (bool): enable BatchNorm in the fusion blocks.
        """
        super(DPT, self).__init__()
        self.channels_last = channels_last
        # Transformer block indices to hook for multi-scale features.
        hooks = {
            'vitb_rn50_384': [0, 1, 8, 11],
            'vitb16_384': [2, 5, 8, 11],
            'vitl16_384': [5, 11, 17, 23],
        }
        # Instantiate backbone and reassemble blocks
        self.pretrained, self.scratch = _make_encoder(
            backbone,
            features,
            False,  # Set to True if you want to train from scratch, uses ImageNet weights
            groups=1,
            expand=False,
            exportable=False,
            hooks=hooks[backbone],
            use_readout=readout,
        )
        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
        self.scratch.output_conv = head

    def forward(self, x):
        """Predict a dense map for the input batch ``x``."""
        if self.channels_last is True:
            # BUGFIX: Tensor.contiguous is not in-place — the original code
            # discarded the result, making this branch a no-op.
            x = x.contiguous(memory_format=torch.channels_last)
        layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
        # Project each hooked level to the decoder width.
        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)
        # Coarse-to-fine fusion; each refinenet upsamples by 2x.
        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
        out = self.scratch.output_conv(path_1)
        return out
class DPTDepthModel(DPT):
    """DPT with a regression head producing ``num_channels`` output maps
    (1 for depth, 3 for surface normals).
    """

    def __init__(self, path=None, non_negative=True, num_channels=1, **kwargs):
        """Init.

        Args:
            path (str | None): optional checkpoint to load after construction.
            non_negative (bool): clamp outputs through a final ReLU.
            num_channels (int): number of predicted channels.
            **kwargs: forwarded to ``DPT`` (may include ``features``).
        """
        features = kwargs.get('features', 256)
        head = nn.Sequential(
            nn.Conv2d(
                features, features // 2, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode='bilinear', align_corners=True),
            nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(32, num_channels, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
            nn.Identity(),
        )
        super().__init__(head, **kwargs)
        if path is not None:
            self.load(path)

    def forward(self, x):
        """Run DPT and drop the singleton channel dim for 1-channel heads."""
        return super().forward(x).squeeze(dim=1)

View File

@@ -0,0 +1,517 @@
# This implementation is adopted from MiDaS
# made publicly available under the MIT license
# https://github.com/isl-org/MiDaS
import math
import types
import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
class Slice(nn.Module):
    """Drop the leading readout token(s) from a token sequence."""

    def __init__(self, start_index=1):
        """Args:
            start_index (int): index of the first token to keep.
        """
        super(Slice, self).__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return the tokens from ``start_index`` onward."""
        return x[:, self.start_index:]
class AddReadout(nn.Module):
    """Fold the readout token(s) into the patch tokens by addition."""

    def __init__(self, start_index=1):
        """Args:
            start_index (int): first patch-token index (2 = distilled ViT).
        """
        super(AddReadout, self).__init__()
        self.start_index = start_index

    def forward(self, x):
        """Add the (averaged) readout token onto every patch token."""
        patch_tokens = x[:, self.start_index:]
        if self.start_index == 2:
            # Distilled model: average the class and distillation tokens.
            readout = (x[:, 0] + x[:, 1]) / 2
        else:
            readout = x[:, 0]
        return patch_tokens + readout.unsqueeze(1)
class ProjectReadout(nn.Module):
    """Concatenate the readout token to every patch token and project back
    to the original width with a Linear + GELU.
    """

    def __init__(self, in_features, start_index=1):
        """Args:
            in_features (int): token embedding width.
            start_index (int): first patch-token index.
        """
        super(ProjectReadout, self).__init__()
        self.start_index = start_index
        self.project = nn.Sequential(
            nn.Linear(2 * in_features, in_features), nn.GELU())

    def forward(self, x):
        """Return projected patch tokens of shape (B, tokens, in_features)."""
        patch_tokens = x[:, self.start_index:]
        readout = x[:, 0].unsqueeze(1).expand_as(patch_tokens)
        return self.project(torch.cat((patch_tokens, readout), -1))
class Transpose(nn.Module):
    """Swap two tensor dimensions (module form, usable inside a Sequential)."""

    def __init__(self, dim0, dim1):
        """Args:
            dim0 (int): first dimension.
            dim1 (int): second dimension.
        """
        super(Transpose, self).__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        """Return ``x`` with ``dim0`` and ``dim1`` swapped."""
        return x.transpose(self.dim0, self.dim1)
def forward_vit(pretrained, x):
    """Run the hooked ViT encoder and return four multi-scale feature maps.

    Relies on the forward hooks installed by the backbone builders (see
    ``get_activation``) filling ``pretrained.activations`` as a side effect
    of the forward pass.
    """
    b, c, h, w = x.shape
    # Forward pass is run only for its side effect of filling `activations`.
    _ = pretrained.model.forward_flex(x)
    layer_1 = pretrained.activations['1']
    layer_2 = pretrained.activations['2']
    layer_3 = pretrained.activations['3']
    layer_4 = pretrained.activations['4']
    # Stages [0:2] of each post-process head: readout op + transpose.
    layer_1 = pretrained.act_postprocess1[0:2](layer_1)
    layer_2 = pretrained.act_postprocess2[0:2](layer_2)
    layer_3 = pretrained.act_postprocess3[0:2](layer_3)
    layer_4 = pretrained.act_postprocess4[0:2](layer_4)
    # Unflatten with the *runtime* grid size (h, w dependent) instead of the
    # static Unflatten stage at index 2, so non-default input sizes work.
    unflatten = nn.Sequential(
        nn.Unflatten(
            2,
            torch.Size([
                h // pretrained.model.patch_size[1],
                w // pretrained.model.patch_size[0],
            ]),
        ))
    # ndim == 3 means a (B, C, tokens) sequence; 4-D activations come from
    # the CNN stem of the hybrid backbone and are already spatial.
    if layer_1.ndim == 3:
        layer_1 = unflatten(layer_1)
    if layer_2.ndim == 3:
        layer_2 = unflatten(layer_2)
    if layer_3.ndim == 3:
        layer_3 = unflatten(layer_3)
    if layer_4.ndim == 3:
        layer_4 = unflatten(layer_4)
    # Remaining stages [3:]: 1x1 projection (+ up/downsampling convs where
    # present); index 2 (the static Unflatten) is deliberately skipped.
    layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)](
        layer_1)
    layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)](
        layer_2)
    layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)](
        layer_3)
    layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)](
        layer_4)
    return layer_1, layer_2, layer_3, layer_4
def _resize_pos_embed(self, posemb, gs_h, gs_w):
posemb_tok, posemb_grid = (
posemb[:, :self.start_index],
posemb[0, self.start_index:],
)
gs_old = int(math.sqrt(len(posemb_grid)))
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old,
-1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(
posemb_grid, size=(gs_h, gs_w), mode='bilinear')
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
return posemb
def forward_flex(self, x):
    """ViT forward pass that tolerates arbitrary input resolutions.

    Injected as a method onto timm VisionTransformer instances; resizes the
    position embedding to the runtime patch grid before running the blocks.
    """
    b, c, h, w = x.shape
    # Interpolate the stored position embedding to the current grid size.
    pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size[1],
                                       w // self.patch_size[0])
    B = x.shape[0]
    if hasattr(self.patch_embed, 'backbone'):
        # Hybrid model: run the CNN stem before patch projection.
        x = self.patch_embed.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[
                -1]  # last feature if backbone outputs list/tuple of features
    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
    if getattr(self, 'dist_token', None) is not None:
        # Distilled ViT: prepend both the class and the distillation token.
        cls_tokens = self.cls_token.expand(
            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        dist_token = self.dist_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, dist_token, x), dim=1)
    else:
        cls_tokens = self.cls_token.expand(
            B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)
    x = x + pos_embed
    x = self.pos_drop(x)
    for blk in self.blocks:
        x = blk(x)
    x = self.norm(x)
    return x
# Shared buffer filled by the forward hooks registered on the backbone;
# consumed by `forward_vit` after each forward pass.
activations = {}


def get_activation(name):
    """Return a forward hook that records its module's output under ``name``."""

    def hook(model, input, output):
        activations[name] = output

    return hook
def get_readout_oper(vit_features, features, use_readout, start_index=1):
    """Build one readout-token operation per hooked feature level.

    Args:
        vit_features (int): transformer embedding width (used by 'project').
        features (list[int]): one entry per level; only its length is used.
        use_readout (str): 'ignore', 'add' or 'project'.
        start_index (int): index of the first patch token.

    Returns:
        list[nn.Module]: ``len(features)`` readout modules.

    Raises:
        ValueError: if ``use_readout`` is not a supported mode.
    """
    if use_readout == 'ignore':
        readout_oper = [Slice(start_index)] * len(features)
    elif use_readout == 'add':
        readout_oper = [AddReadout(start_index)] * len(features)
    elif use_readout == 'project':
        readout_oper = [
            ProjectReadout(vit_features, start_index) for out_feat in features
        ]
    else:
        # Raise instead of `assert False`: asserts vanish under `python -O`,
        # which would fall through and return an unbound name.
        raise ValueError(
            "wrong operation for readout token, use_readout can be "
            "'ignore', 'add', or 'project'")
    return readout_oper
def _make_vit_b16_backbone(
    model,
    features=[96, 192, 384, 768],
    size=[384, 384],
    hooks=[2, 5, 8, 11],
    vit_features=768,
    use_readout='ignore',
    start_index=1,
):
    """Wrap a timm patch-16 ViT for DPT multi-scale feature extraction.

    Hooks four transformer blocks and builds one ``act_postprocess*`` head
    per level that turns (B, tokens, C) activations into spatial maps.
    NOTE: ``forward_vit`` slices these Sequentials positionally ([0:2] and
    [3:], skipping the static Unflatten at index 2), so the stage order
    below is a load-bearing contract.

    Args:
        model: timm VisionTransformer instance.
        features (list[int]): output channels per hooked level.
        size (list[int]): nominal input size for the static Unflatten stage.
        hooks (list[int]): transformer block indices to hook.
        vit_features (int): transformer embedding width.
        use_readout (str): readout-token handling ('ignore'/'add'/'project').
        start_index (int): first patch-token index (2 for distilled models).

    Returns:
        nn.Module: module exposing ``model``, ``activations`` and the four
        ``act_postprocess*`` heads consumed by ``forward_vit``.
    """
    pretrained = nn.Module()
    pretrained.model = model
    # Side-effect hooks: block outputs land in the shared `activations` dict.
    pretrained.model.blocks[hooks[0]].register_forward_hook(
        get_activation('1'))
    pretrained.model.blocks[hooks[1]].register_forward_hook(
        get_activation('2'))
    pretrained.model.blocks[hooks[2]].register_forward_hook(
        get_activation('3'))
    pretrained.model.blocks[hooks[3]].register_forward_hook(
        get_activation('4'))
    pretrained.activations = activations
    readout_oper = get_readout_oper(vit_features, features, use_readout,
                                    start_index)
    # Level 1: readout, transpose, unflatten, 1x1 proj, then 4x upsample.
    pretrained.act_postprocess1 = nn.Sequential(
        readout_oper[0],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[0],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.ConvTranspose2d(
            in_channels=features[0],
            out_channels=features[0],
            kernel_size=4,
            stride=4,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )
    # Level 2: same, with a 2x upsample.
    pretrained.act_postprocess2 = nn.Sequential(
        readout_oper[1],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[1],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.ConvTranspose2d(
            in_channels=features[1],
            out_channels=features[1],
            kernel_size=2,
            stride=2,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )
    # Level 3: 1x1 projection only (kept at the native 1/16 resolution).
    pretrained.act_postprocess3 = nn.Sequential(
        readout_oper[2],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[2],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )
    # Level 4: 1x1 projection followed by a stride-2 downsampling conv.
    pretrained.act_postprocess4 = nn.Sequential(
        readout_oper[3],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[3],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )
    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]
    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex,
                                                     pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model)
    return pretrained
def _make_pretrained_vitl16_384(pretrained, use_readout='ignore', hooks=None):
    """ViT-Large/16 @384 encoder wrapped for DPT."""
    vit = timm.create_model('vit_large_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [5, 11, 17, 23]
    return _make_vit_b16_backbone(
        vit,
        features=[256, 512, 1024, 1024],
        hooks=hooks,
        vit_features=1024,
        use_readout=use_readout,
    )
def _make_pretrained_vitb16_384(pretrained, use_readout='ignore', hooks=None):
    """ViT-Base/16 @384 encoder wrapped for DPT."""
    vit = timm.create_model('vit_base_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [2, 5, 8, 11]
    return _make_vit_b16_backbone(
        vit,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout)
def _make_pretrained_deitb16_384(pretrained, use_readout='ignore', hooks=None):
    """DeiT-Base/16 @384 encoder wrapped for DPT."""
    deit = timm.create_model(
        'vit_deit_base_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [2, 5, 8, 11]
    return _make_vit_b16_backbone(
        deit,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout)
def _make_pretrained_deitb16_distil_384(pretrained,
                                        use_readout='ignore',
                                        hooks=None):
    """Distilled DeiT-Base/16 @384 encoder wrapped for DPT."""
    deit = timm.create_model(
        'vit_deit_base_distilled_patch16_384', pretrained=pretrained)
    if hooks is None:
        hooks = [2, 5, 8, 11]
    # start_index=2 skips both the class token and the distillation token.
    return _make_vit_b16_backbone(
        deit,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout,
        start_index=2,
    )
def _make_vit_b_rn50_backbone(
    model,
    features=[256, 512, 768, 768],
    size=[384, 384],
    hooks=[0, 1, 8, 11],
    vit_features=768,
    use_vit_only=False,
    use_readout='ignore',
    start_index=1,
):
    """Wrap a timm hybrid ResNet50+ViT-B for DPT multi-scale extraction.

    The two finest levels come either from two transformer blocks
    (``use_vit_only``) or from the first two ResNet stem stages; the two
    coarsest always come from transformer blocks.
    NOTE: ``forward_vit`` slices each ``act_postprocess*`` positionally
    ([0:2] and [3:]), which is why the non-ViT branch pads with three
    Identity stages — the stage layout is a contract; do not reorder.

    Args:
        model: timm hybrid VisionTransformer instance.
        features (list[int]): output channels per level.
        size (list[int]): nominal input size for the static Unflatten stage.
        hooks (list[int]): transformer block indices to hook.
        vit_features (int): transformer embedding width.
        use_vit_only (bool): take all four levels from transformer blocks.
        use_readout (str): readout-token handling ('ignore'/'add'/'project').
        start_index (int): first patch-token index.

    Returns:
        nn.Module: module exposing ``model``, ``activations`` and the four
        ``act_postprocess*`` heads consumed by ``forward_vit``.
    """
    pretrained = nn.Module()
    pretrained.model = model
    if use_vit_only:
        pretrained.model.blocks[hooks[0]].register_forward_hook(
            get_activation('1'))
        pretrained.model.blocks[hooks[1]].register_forward_hook(
            get_activation('2'))
    else:
        # Fine levels tapped from the CNN stem (already spatial 4-D maps).
        pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
            get_activation('1'))
        pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
            get_activation('2'))
    pretrained.model.blocks[hooks[2]].register_forward_hook(
        get_activation('3'))
    pretrained.model.blocks[hooks[3]].register_forward_hook(
        get_activation('4'))
    pretrained.activations = activations
    readout_oper = get_readout_oper(vit_features, features, use_readout,
                                    start_index)
    if use_vit_only:
        # Token-based levels: readout, transpose, unflatten, 1x1 proj, upsample.
        pretrained.act_postprocess1 = nn.Sequential(
            readout_oper[0],
            Transpose(1, 2),
            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[0],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.ConvTranspose2d(
                in_channels=features[0],
                out_channels=features[0],
                kernel_size=4,
                stride=4,
                padding=0,
                bias=True,
                dilation=1,
                groups=1,
            ),
        )
        pretrained.act_postprocess2 = nn.Sequential(
            readout_oper[1],
            Transpose(1, 2),
            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[1],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
            nn.ConvTranspose2d(
                in_channels=features[1],
                out_channels=features[1],
                kernel_size=2,
                stride=2,
                padding=0,
                bias=True,
                dilation=1,
                groups=1,
            ),
        )
    else:
        # CNN-stem levels are passed through; three Identities keep the
        # positional slicing in `forward_vit` valid.
        pretrained.act_postprocess1 = nn.Sequential(nn.Identity(),
                                                    nn.Identity(),
                                                    nn.Identity())
        pretrained.act_postprocess2 = nn.Sequential(nn.Identity(),
                                                    nn.Identity(),
                                                    nn.Identity())
    # Level 3: 1x1 projection at the native 1/16 resolution.
    pretrained.act_postprocess3 = nn.Sequential(
        readout_oper[2],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[2],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )
    # Level 4: 1x1 projection followed by a stride-2 downsampling conv.
    pretrained.act_postprocess4 = nn.Sequential(
        readout_oper[3],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[3],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )
    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]
    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex,
                                                     pretrained.model)
    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model)
    return pretrained
def _make_pretrained_vitb_rn50_384(pretrained,
                                   use_readout='ignore',
                                   hooks=None,
                                   use_vit_only=False):
    """ResNet50 + ViT-Base hybrid @384 encoder wrapped for DPT."""
    hybrid = timm.create_model('vit_base_resnet50_384', pretrained=pretrained)
    if hooks is None:
        hooks = [0, 1, 8, 11]
    return _make_vit_b_rn50_backbone(
        hybrid,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=hooks,
        use_vit_only=use_vit_only,
        use_readout=use_readout,
    )

View File

@@ -0,0 +1,54 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Model: Omnidata: A Scalable Pipeline for Making Multi-Task Mid-Level Vision Datasets from 3D Scans
# Paper link: https://arxiv.org/pdf/2110.04994.pdf
import os.path as osp
import torch
from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.image_normal_estimation.modules.midas.dpt_depth import \
DPTDepthModel
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
@MODELS.register_module(
    Tasks.image_normal_estimation,
    module_name=Models.omnidata_normal_estimation)
class OmnidataNormalEstimation(TorchModel):
    """Omnidata surface-normal estimation model (DPT-Hybrid backbone).

    Paper: https://arxiv.org/pdf/2110.04994.pdf
    """

    def __init__(self, model_dir: str, **kwargs):
        """Build the network and load weights from ``model_dir``.

        Args:
            model_dir (str): model file root.
        """
        super().__init__(model_dir, **kwargs)
        # DPT-Hybrid with a 3-channel head (one per normal component).
        self.model = DPTDepthModel(
            backbone='vitb_rn50_384', num_channels=3)
        weights_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE)
        checkpoint = torch.load(weights_path, map_location='cpu')
        if 'state_dict' in checkpoint:
            # Training checkpoints prefix every key; drop the first 6 chars
            # (presumably a 'model.' prefix — TODO(review): confirm against
            # the released checkpoint).
            state_dict = {
                k[6:]: v
                for k, v in checkpoint['state_dict'].items()
            }
        else:
            state_dict = checkpoint
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def forward(self, inputs):
        """Predict normals for inputs['imgs'], clamped to [0, 1]."""
        return self.model(inputs['imgs']).clamp(min=0, max=1)

    def postprocess(self, inputs):
        """Flip the prediction along dim 1 and wrap it for the pipeline."""
        return {OutputKeys.NORMALS: inputs.flip(1)}

    def inference(self, data):
        """Run a forward pass on already-preprocessed data."""
        return self.forward(data)

View File

@@ -25,6 +25,8 @@ class OutputKeys(object):
MASKS = 'masks'
DEPTHS = 'depths'
DEPTHS_COLOR = 'depths_color'
NORMALS = 'normals'
NORMALS_COLOR = 'normals_color'
LAYOUT = 'layout'
TEXT = 'text'
POLYGONS = 'polygons'

View File

@@ -0,0 +1,154 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Any, Dict, Union
import cv2
import numpy as np
import PIL
import torch
from modelscope.metainfo import Pipelines
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Model, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
@PIPELINES.register_module(
    Tasks.image_normal_estimation,
    module_name=Pipelines.image_normal_estimation)
class ImageNormalEstimationPipeline(Pipeline):
    r""" Image Normal Estimation Pipeline.
    Examples:
    >>> from modelscope.pipelines import pipeline
    >>> estimator = pipeline(
    >>>        Tasks.image_normal_estimation, model='Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal')
    >>> estimator("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_normal_estimation.jpg")
    >>>   {
    >>>    "normals": array([[[0.09233217, 0.07563387, 0.08025375, ..., 0.06992684,
    >>>         0.07490329, 0.14308228],
    >>>        [0.07833742, 0.06736029, 0.07296766, ..., 0.09184352,
    >>>         0.0800755 , 0.09726034],
    >>>        [0.07676302, 0.06631223, 0.07067154, ..., 0.09527256,
    >>>         0.09292313, 0.08056315],
    >>>        ...,
    >>>        [0.26432115, 0.29100573, 0.2956126 , ..., 0.2913087 ,
    >>>         0.29201347, 0.29539976],
    >>>        [0.24557455, 0.26430887, 0.28548756, ..., 0.2877307 ,
    >>>         0.28856137, 0.2937242 ],
    >>>        [0.26316068, 0.2718169 , 0.28436714, ..., 0.29435217,
    >>>         0.29842147, 0.2943223 ]],
    >>>       [[0.59257126, 0.6459297 , 0.66572756, ..., 0.68350476,
    >>>         0.6882835 , 0.66579086],
    >>>        [0.7054596 , 0.6592535 , 0.6728153 , ..., 0.6589912 ,
    >>>         0.64541686, 0.63954735],
    >>>        [0.6912665 , 0.6638877 , 0.67816293, ..., 0.6607329 ,
    >>>         0.6472897 , 0.64633334],
    >>>        ...,
    >>>        [0.04231769, 0.04427819, 0.04816979, ..., 0.04485315,
    >>>         0.04652229, 0.04869233],
    >>>        [0.04601872, 0.03706329, 0.04397734, ..., 0.04522909,
    >>>         0.04745695, 0.04823782],
    >>>        [0.06671816, 0.0520605 , 0.0563788 , ..., 0.04913886,
    >>>         0.04974678, 0.04954173]],
    >>>       [[0.4338835 , 0.43240184, 0.43519282, ..., 0.36894026,
    >>>         0.35207224, 0.33153164],
    >>>        [0.4786287 , 0.4399531 , 0.4350407 , ..., 0.34690523,
    >>>         0.3179497 , 0.26544768],
    >>>        [0.47692937, 0.4416514 , 0.437603  , ..., 0.34660107,
    >>>         0.3102659 , 0.27787644],
    >>>        ...,
    >>>        [0.49566334, 0.48355937, 0.48710674, ..., 0.4964854 ,
    >>>         0.48945957, 0.49413157],
    >>>        [0.490632  , 0.4706958 , 0.48100013, ..., 0.48724395,
    >>>         0.4799561 , 0.48129278],
    >>>        [0.49428058, 0.47433382, 0.4823783 , ..., 0.48930234,
    >>>         0.48616886, 0.47176325]]], dtype=float32),
    >>>    'normals_color': array([[[ 23, 151, 110],
    >>>        [ 19, 164, 110],
    >>>        [ 20, 169, 110],
    >>>        ...,
    >>>        [ 17, 174,  94],
    >>>        [ 19, 175,  89],
    >>>        [ 36, 169,  84]],
    >>>       [[ 19, 179, 122],
    >>>        [ 17, 168, 112],
    >>>        [ 18, 171, 110],
    >>>        ...,
    >>>        [ 23, 168,  88],
    >>>        [ 20, 164,  81],
    >>>        [ 24, 163,  67]],
    >>>       [[ 19, 176, 121],
    >>>        [ 16, 169, 112],
    >>>        [ 18, 172, 111],
    >>>        ...,
    >>>        [ 24, 168,  88],
    >>>        [ 23, 165,  79],
    >>>        [ 20, 164,  70]],
    >>>       ...,
    >>>       [[ 67,  10, 126],
    >>>        [ 74,  11, 123],
    >>>        [ 75,  12, 124],
    >>>        ...,
    >>>        [ 74,  11, 126],
    >>>        [ 74,  11, 124],
    >>>        [ 75,  12, 126]],
    >>>       [[ 62,  11, 125],
    >>>        [ 67,   9, 120],
    >>>        [ 72,  11, 122],
    >>>        ...,
    >>>        [ 73,  11, 124],
    >>>        [ 73,  12, 122],
    >>>        [ 74,  12, 122]],
    >>>       [[ 67,  17, 126],
    >>>        [ 69,  13, 120],
    >>>        [ 72,  14, 123],
    >>>        ...,
    >>>        [ 75,  12, 124],
    >>>        [ 76,  12, 123],
    >>>        [ 75,  12, 120]]], dtype=uint8)}
    """

    def __init__(self, model: str, **kwargs):
        """
        use `model` to create an image normal estimation pipeline for prediction
        Args:
            model: model id on modelscope hub.
        """
        super().__init__(model=model, **kwargs)
        logger.info('normal estimation model, pipeline init')

    def preprocess(self, input: Input) -> Dict[str, Any]:
        """Decode the input and resize it to the fixed 384x384 network size.

        NOTE(review): the aspect ratio is not preserved by the resize.
        """
        img = LoadImage.convert_to_ndarray(input).astype(np.float32)
        H, W = 384, 384
        img = cv2.resize(img, [W, H])
        # HWC -> CHW, scale to [0, 1], then add a batch dimension.
        img = img.transpose(2, 0, 1) / 255.0
        imgs = img[None, ...]
        data = {'imgs': imgs}
        return data

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        """Delegate inference to the underlying model."""
        results = self.model.inference(input)
        return results

    def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Run model postprocess, then build numpy + uint8 color outputs."""
        results = self.model.postprocess(inputs)
        normals = results[OutputKeys.NORMALS]
        if isinstance(normals, torch.Tensor):
            normals = normals.detach().cpu().squeeze().numpy()
        # CHW float in [0, 1] -> HWC uint8 visualization.
        normals_color = (np.transpose(normals,
                                      (1, 2, 0)) * 255).astype(np.uint8)
        outputs = {
            OutputKeys.NORMALS: normals,
            OutputKeys.NORMALS_COLOR: normals_color
        }
        return outputs

View File

@@ -57,6 +57,7 @@ class CVTasks(object):
semantic_segmentation = 'semantic-segmentation'
image_driving_perception = 'image-driving-perception'
image_depth_estimation = 'image-depth-estimation'
image_normal_estimation = 'image-normal-estimation'
indoor_layout_estimation = 'indoor-layout-estimation'
video_depth_estimation = 'video-depth-estimation'
panorama_depth_estimation = 'panorama-depth-estimation'

View File

@@ -1144,6 +1144,13 @@
"type": "object"
}
},
"image-normal-estimation": {
"input": {},
"parameters": {},
"output": {
"type": "object"
}
},
"image-driving-perception": {
"input": {
"type": "object",

View File

@@ -0,0 +1,33 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import unittest
import cv2
import numpy as np
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.test_utils import test_level
class ImageNormalEstimationTest(unittest.TestCase):
    """End-to-end smoke test for the image normal estimation pipeline."""

    def setUp(self) -> None:
        # Task name and default model shared by the tests in this class.
        self.task = 'image-normal-estimation'
        self.model_id = 'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'

    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
    def test_image_normal_estimation(self):
        # Build the pipeline, run one image, and persist the color
        # visualization so the artifact can be inspected manually.
        input_location = 'data/test/images/image_normal_estimation.jpg'
        estimator = pipeline(
            Tasks.image_normal_estimation, model=self.model_id)
        result = estimator(input_location)
        normals_vis = result[OutputKeys.NORMALS_COLOR]
        cv2.imwrite('result.jpg', normals_vis)
        print('test_image_normal_estimation DONE')


if __name__ == '__main__':
    unittest.main()