From 6fc15926a3ba262ab0be0a13a2a1b05ed4d14a05 Mon Sep 17 00:00:00 2001 From: "hejunjie.hjj" Date: Fri, 10 Feb 2023 08:01:23 +0000 Subject: [PATCH] [to #42322933] add single and multiple human parsing models Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11508413 --- .../images/image_multiple_human_parsing.jpg | 3 + .../images/image_single_human_parsing.jpg | 3 + modelscope/metainfo.py | 2 + .../models/cv/image_human_parsing/__init__.py | 23 ++ .../image_human_parsing/backbone/__init__.py | 22 + .../backbone/deeplab_resnet.py | 377 ++++++++++++++++++ .../cv/image_human_parsing/m2fp/__init__.py | 24 ++ .../image_human_parsing/m2fp/m2fp_decoder.py | 221 ++++++++++ .../image_human_parsing/m2fp/m2fp_encoder.py | 215 ++++++++++ .../models/cv/image_human_parsing/m2fp_net.py | 363 +++++++++++++++++ .../cv/image_human_parsing/parsing_utils.py | 156 ++++++++ .../cascade_mask_rcnn_swin.py | 2 +- .../maskdino_swin.py | 2 +- modelscope/pipelines/cv/__init__.py | 2 + .../cv/image_human_parsing_pipeline.py | 126 ++++++ tests/pipelines/test_image_human_parsing.py | 48 +++ 16 files changed, 1587 insertions(+), 2 deletions(-) create mode 100644 data/test/images/image_multiple_human_parsing.jpg create mode 100644 data/test/images/image_single_human_parsing.jpg create mode 100644 modelscope/models/cv/image_human_parsing/__init__.py create mode 100644 modelscope/models/cv/image_human_parsing/backbone/__init__.py create mode 100644 modelscope/models/cv/image_human_parsing/backbone/deeplab_resnet.py create mode 100644 modelscope/models/cv/image_human_parsing/m2fp/__init__.py create mode 100644 modelscope/models/cv/image_human_parsing/m2fp/m2fp_decoder.py create mode 100644 modelscope/models/cv/image_human_parsing/m2fp/m2fp_encoder.py create mode 100644 modelscope/models/cv/image_human_parsing/m2fp_net.py create mode 100644 modelscope/models/cv/image_human_parsing/parsing_utils.py create mode 100644 modelscope/pipelines/cv/image_human_parsing_pipeline.py create mode 
100644 tests/pipelines/test_image_human_parsing.py diff --git a/data/test/images/image_multiple_human_parsing.jpg b/data/test/images/image_multiple_human_parsing.jpg new file mode 100644 index 00000000..c95881fe --- /dev/null +++ b/data/test/images/image_multiple_human_parsing.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:176c824d99af119b36f743d3d90b44529167b0e4fc6db276da60fa140ee3f4a9 +size 87228 diff --git a/data/test/images/image_single_human_parsing.jpg b/data/test/images/image_single_human_parsing.jpg new file mode 100644 index 00000000..981efe4e --- /dev/null +++ b/data/test/images/image_single_human_parsing.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1976ea249b4ad5409cdae403dcd154fac3c628909b6b1874cc968960e2c62d +size 8259 diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 40cfaf76..11fcd4c7 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -99,6 +99,7 @@ class Models(object): ddpm = 'ddpm' ocr_recognition = 'OCRRecognition' image_quality_assessment_mos = 'image-quality-assessment-mos' + m2fp = 'm2fp' nerf_recon_acc = 'nerf-recon-acc' bts_depth_estimation = 'bts-depth-estimation' vision_efficient_tuning = 'vision-efficient-tuning' @@ -363,6 +364,7 @@ class Pipelines(object): video_colorization = 'video-colorization' motion_generattion = 'mdm-motion-generation' mobile_image_super_resolution = 'mobile-image-super-resolution' + image_human_parsing = 'm2fp-image-human-parsing' object_detection_3d_depe = 'object-detection-3d-depe' bad_image_detecting = 'bad-image-detecting' nerf_recon_acc = 'nerf-recon-acc' diff --git a/modelscope/models/cv/image_human_parsing/__init__.py b/modelscope/models/cv/image_human_parsing/__init__.py new file mode 100644 index 00000000..787c0353 --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .m2fp_net import M2FP + from parsing_utils import center_to_target_size_test +else: + _import_structure = { + 'm2fp_net': ['M2FP'], + 'parsing_utils': ['center_to_target_size_test'] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_human_parsing/backbone/__init__.py b/modelscope/models/cv/image_human_parsing/backbone/__init__.py new file mode 100644 index 00000000..47bf7f72 --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/backbone/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .deeplab_resnet import build_resnet_deeplab_backbone + +else: + _import_structure = { + 'deeplab_resnet': ['build_resnet_deeplab_backbone'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_human_parsing/backbone/deeplab_resnet.py b/modelscope/models/cv/image_human_parsing/backbone/deeplab_resnet.py new file mode 100644 index 00000000..d8f890b8 --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/backbone/deeplab_resnet.py @@ -0,0 +1,377 @@ +# Part of the implementation is borrowed and modified from Detectron2, publicly available at +# https://github.com/facebookresearch/detectron2/blob/main/projects/DeepLab/deeplab/resnet.py + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \ + Conv2d + + +def get_norm(norm, out_channels): + if norm is None: + return None + if 
isinstance(norm, str): + if len(norm) == 0: + return None + norm = { + 'BN': torch.nn.BatchNorm2d, + 'GN': lambda channels: nn.GroupNorm(32, channels), + 'nnSyncBN': nn.SyncBatchNorm, + }[norm] + return norm(out_channels) + + +class BasicBlock(nn.Module): + + def __init__(self, in_channels, out_channels, *, stride=1, norm='BN'): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels)) + else: + self.shortcut = None + + self.conv1 = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + norm=get_norm(norm, out_channels)) + + self.conv2 = Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + norm=get_norm(norm, out_channels)) + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + out = self.conv2(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class BottleneckBlock(nn.Module): + + def __init__(self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm='BN', + stride_in_1x1=False, + dilation=1): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + norm=get_norm(norm, bottleneck_channels)) + self.conv2 = Conv2d( + bottleneck_channels, + 
bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + bias=False, + groups=num_groups, + dilation=dilation, + norm=get_norm(norm, bottleneck_channels)) + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=1, + bias=False, + norm=get_norm(norm, out_channels)) + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + + out = self.conv2(out) + out = F.relu_(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class DeepLabStem(nn.Module): + + def __init__(self, in_channels=3, out_channels=128, norm='BN'): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = 4 + self.conv1 = Conv2d( + in_channels, + out_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False, + norm=get_norm(norm, out_channels // 2)) + self.conv2 = Conv2d( + out_channels // 2, + out_channels // 2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + norm=get_norm(norm, out_channels // 2)) + self.conv3 = Conv2d( + out_channels // 2, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + norm=get_norm(norm, out_channels)) + + def forward(self, x): + x = self.conv1(x) + x = F.relu_(x) + x = self.conv2(x) + x = F.relu_(x) + x = self.conv3(x) + x = F.relu_(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + + +class DeeplabResNet(nn.Module): + + def __init__(self, stem, stages, num_classes=None, out_features=None): + super().__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {'stem': current_stride} + self._out_feature_channels = {'stem': self.stem.out_channels} + + self.stage_names, self.stages = [], [] + + if out_features is not None: + num_stages = max([{ + 'res2': 1, + 'res3': 2, + 'res4': 3, + 'res5': 4 + }.get(f, 0) for f in 
out_features]) + stages = stages[:num_stages] + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, nn.Module), block + + name = 'res' + str(i + 2) + stage = nn.Sequential(*blocks) + + self.add_module(name, stage) + self.stage_names.append(name) + self.stages.append(stage) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks])) + self._out_feature_channels[name] = curr_channels = blocks[ + -1].out_channels + self.stage_names = tuple( + self.stage_names) # Make it static for scripting + + if num_classes is not None: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.linear = nn.Linear(curr_channels, num_classes) + nn.init.normal_(self.linear.weight, std=0.01) + name = 'linear' + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, 'Available children: {}'.format( + ', '.join(children)) + + def forward(self, x): + assert x.dim( + ) == 4, f'ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!' 
+ outputs = {} + x = self.stem(x) + if 'stem' in self._out_features: + outputs['stem'] = x + for name, stage in zip(self.stage_names, self.stages): + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.linear(x) + if 'linear' in self._out_features: + outputs['linear'] = x + return outputs + + def output_shape(self): + return { + name: dict( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name]) + for name in self._out_features + } + + @property + def size_divisibility(self) -> int: + return 0 + + @staticmethod + def make_stage(block_class, num_blocks, *, in_channels, out_channels, + **kwargs): + blocks = [] + for i in range(num_blocks): + curr_kwargs = {} + for k, v in kwargs.items(): + if k.endswith('_per_block'): + assert len(v) == num_blocks, ( + f"Argument '{k}' of make_stage should have the " + f'same length as num_blocks={num_blocks}.') + newk = k[:-len('_per_block')] + assert newk not in kwargs, f'Cannot call make_stage with both {k} and {newk}!' 
+ curr_kwargs[newk] = v[i] + else: + curr_kwargs[k] = v + + blocks.append( + block_class( + in_channels=in_channels, + out_channels=out_channels, + **curr_kwargs)) + in_channels = out_channels + return blocks + + +def build_resnet_deeplab_backbone(out_features, depth, num_groups, + width_per_group, norm, stem_out_channels, + res2_out_channels, stride_in_1x1, + res4_dilation, res5_dilation, + res5_multi_grid, input_shape): + stem = DeepLabStem( + in_channels=input_shape['channels'], + out_channels=stem_out_channels, + norm=norm) + bottleneck_channels = num_groups * width_per_group + in_channels = stem_out_channels + out_channels = res2_out_channels + + assert res4_dilation in { + 1, 2 + }, 'res4_dilation cannot be {}.'.format(res4_dilation) + assert res5_dilation in { + 1, 2, 4 + }, 'res5_dilation cannot be {}.'.format(res5_dilation) + if res4_dilation == 2: + # Always dilate res5 if res4 is dilated. + assert res5_dilation == 4 + + num_blocks_per_stage = { + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3] + }[depth] + + stages = [] + out_stage_idx = [{ + 'res2': 2, + 'res3': 3, + 'res4': 4, + 'res5': 5 + }[f] for f in out_features] + max_stage_idx = max(out_stage_idx) + for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): + if stage_idx == 4: + dilation = res4_dilation + elif stage_idx == 5: + dilation = res5_dilation + else: + dilation = 1 + first_stride = 1 if idx == 0 or dilation > 1 else 2 + stride_per_block = [first_stride] + stride_per_block += [1] * (num_blocks_per_stage[idx] - 1) + stage_kargs = { + 'num_blocks': num_blocks_per_stage[idx], + 'stride_per_block': stride_per_block, + 'in_channels': in_channels, + 'out_channels': out_channels, + 'norm': norm, + 'bottleneck_channels': bottleneck_channels, + 'stride_in_1x1': stride_in_1x1, + 'dilation': dilation, + 'num_groups': num_groups, + 'block_class': BottleneckBlock + } + if stage_idx == 5: + stage_kargs.pop('dilation') + stage_kargs['dilation_per_block'] = [ + dilation * mg for mg in 
res5_multi_grid + ] + blocks = DeeplabResNet.make_stage(**stage_kargs) + in_channels = out_channels + out_channels *= 2 + bottleneck_channels *= 2 + stages.append(blocks) + return DeeplabResNet(stem, stages, out_features=out_features) diff --git a/modelscope/models/cv/image_human_parsing/m2fp/__init__.py b/modelscope/models/cv/image_human_parsing/m2fp/__init__.py new file mode 100644 index 00000000..d90618d3 --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/m2fp/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .m2fp_encoder import MSDeformAttnPixelDecoder + from .m2fp_decoder import MultiScaleMaskedTransformerDecoder + +else: + _import_structure = { + 'm2fp_encoder': ['MSDeformAttnPixelDecoder'], + 'm2fp_decoder': ['MultiScaleMaskedTransformerDecoder'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_human_parsing/m2fp/m2fp_decoder.py b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_decoder.py new file mode 100644 index 00000000..f21eae46 --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_decoder.py @@ -0,0 +1,221 @@ +# The implementation is adopted from Mask2Former, made publicly available under the MIT License at +# https://github.com/facebookresearch/Mask2Former + +import torch +from torch import nn +from torch.nn import functional as F + +from modelscope.models.cv.image_colorization.ddcolor.utils.transformer_utils import ( + MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer) +from modelscope.models.cv.image_instance_segmentation.maskdino.position_encoding import \ + PositionEmbeddingSine +from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \ + Conv2d + + +class 
MultiScaleMaskedTransformerDecoder(nn.Module): + + def __init__( + self, + in_channels, + mask_classification=True, + *, + num_classes: int, + hidden_dim: int, + num_queries: int, + nheads: int, + dim_feedforward: int, + dec_layers: int, + pre_norm: bool, + mask_dim: int, + enforce_input_project: bool, + ): + """ + NOTE: this interface is experimental. + Args: + in_channels: channels of the input features + mask_classification: whether to add mask classifier or not + num_classes: number of classes + hidden_dim: Transformer feature dimension + num_queries: number of queries + nheads: number of heads + dim_feedforward: feature dimension in feedforward network + dec_layers: number of Transformer decoder layers + pre_norm: whether to use pre-LayerNorm or not + mask_dim: mask feature dimension + enforce_input_project: add input project 1x1 conv even if input + channels and hidden dim is identical + """ + super().__init__() + + assert mask_classification, 'Only support mask classification model' + self.mask_classification = mask_classification + + # positional encoding + N_steps = hidden_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + # define Transformer decoder here + self.num_heads = nheads + self.num_layers = dec_layers + self.num_classes = num_classes + self.transformer_self_attention_layers = nn.ModuleList() + self.transformer_cross_attention_layers = nn.ModuleList() + self.transformer_ffn_layers = nn.ModuleList() + + for _ in range(self.num_layers): + self.transformer_self_attention_layers.append( + SelfAttentionLayer( + d_model=hidden_dim, + nhead=nheads, + dropout=0.0, + normalize_before=pre_norm, + )) + + self.transformer_cross_attention_layers.append( + CrossAttentionLayer( + d_model=hidden_dim, + nhead=nheads, + dropout=0.0, + normalize_before=pre_norm, + )) + + self.transformer_ffn_layers.append( + FFNLayer( + d_model=hidden_dim, + dim_feedforward=dim_feedforward, + dropout=0.0, + normalize_before=pre_norm, + )) + + 
self.decoder_norm = nn.LayerNorm(hidden_dim) + + self.num_queries = num_queries + # learnable query features + self.query_feat = nn.Embedding(num_queries, hidden_dim) + # learnable query p.e. + self.query_embed = nn.Embedding(num_queries, hidden_dim) + + # level embedding (we always use 3 scales) + self.num_feature_levels = 3 + self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) + self.input_proj = nn.ModuleList() + for _ in range(self.num_feature_levels): + if in_channels != hidden_dim or enforce_input_project: + self.input_proj.append( + Conv2d(in_channels, hidden_dim, kernel_size=1)) + else: + self.input_proj.append(nn.Sequential()) + + # output FFNs + if self.mask_classification: + self.class_embed = nn.Linear(hidden_dim, num_classes + 1) + self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) + + def forward(self, x, mask_features, mask=None): + # x is a list of multi-scale feature + assert len(x) == self.num_feature_levels + src = [] + pos = [] + size_list = [] + + # disable mask, it does not affect performance + del mask + + for i in range(self.num_feature_levels): + size_list.append(x[i].shape[-2:]) + pos.append(self.pe_layer(x[i], None).flatten(2)) + src.append(self.input_proj[i](x[i]).flatten(2) + + self.level_embed.weight[i][None, :, None]) + + # flatten NxCxHxW to HWxNxC + pos[-1] = pos[-1].permute(2, 0, 1) + src[-1] = src[-1].permute(2, 0, 1) + + _, bs, _ = src[0].shape + + # QxNxC + query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) + output = self.query_feat.weight.unsqueeze(1).repeat(1, bs, 1) + + predictions_class = [] + predictions_mask = [] + + # prediction heads on learnable query features + outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads( + output, mask_features, attn_mask_target_size=size_list[0]) + predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + + for i in range(self.num_layers): + level_index = i % self.num_feature_levels + attn_mask[torch.where( 
+ attn_mask.sum(-1) == attn_mask.shape[-1])] = False + # attention: cross-attention first + output = self.transformer_cross_attention_layers[i]( + output, + src[level_index], + memory_mask=attn_mask, + memory_key_padding_mask= + None, # here we do not apply masking on padded region + pos=pos[level_index], + query_pos=query_embed) + + output = self.transformer_self_attention_layers[i]( + output, + tgt_mask=None, + tgt_key_padding_mask=None, + query_pos=query_embed) + + # FFN + output = self.transformer_ffn_layers[i](output) + + outputs_class, outputs_mask, attn_mask = \ + self.forward_prediction_heads( + output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels]) + predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + + assert len(predictions_class) == self.num_layers + 1 + + out = { + 'pred_logits': + predictions_class[-1], + 'pred_masks': + predictions_mask[-1], + 'aux_outputs': + self._set_aux_loss( + predictions_class if self.mask_classification else None, + predictions_mask) + } + return out + + def forward_prediction_heads(self, output, mask_features, + attn_mask_target_size): + decoder_output = self.decoder_norm(output) + decoder_output = decoder_output.transpose(0, 1) + outputs_class = self.class_embed(decoder_output) + mask_embed = self.mask_embed(decoder_output) + outputs_mask = torch.einsum('bqc,bchw->bqhw', mask_embed, + mask_features) + + attn_mask = F.interpolate( + outputs_mask, + size=attn_mask_target_size, + mode='bilinear', + align_corners=False) + attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat( + 1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() + attn_mask = attn_mask.detach() + + return outputs_class, outputs_mask, attn_mask + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_seg_masks): + if self.mask_classification: + return [{ + 'pred_logits': a, + 'pred_masks': b + } for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])] + else: + return 
[{'pred_masks': b} for b in outputs_seg_masks[:-1]] diff --git a/modelscope/models/cv/image_human_parsing/m2fp/m2fp_encoder.py b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_encoder.py new file mode 100644 index 00000000..7b9cf78d --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/m2fp/m2fp_encoder.py @@ -0,0 +1,215 @@ +# The implementation is adopted from Mask2Former, made publicly available under the MIT License at +# https://github.com/facebookresearch/Mask2Former + +from typing import Any, Dict, List + +import numpy as np +import torch +from torch import nn +from torch.cuda.amp import autocast +from torch.nn import functional as F + +from modelscope.models.cv.image_instance_segmentation.maskdino.maskdino_encoder import \ + MSDeformAttnTransformerEncoderOnly +from modelscope.models.cv.image_instance_segmentation.maskdino.position_encoding import \ + PositionEmbeddingSine +from modelscope.models.cv.image_instance_segmentation.maskdino.utils import \ + Conv2d + + +class MSDeformAttnPixelDecoder(nn.Module): + + def __init__( + self, + input_shape: Dict[str, Any], + *, + transformer_dropout: float, + transformer_nheads: int, + transformer_dim_feedforward: int, + transformer_enc_layers: int, + conv_dim: int, + mask_dim: int, + # deformable transformer encoder args + transformer_in_features: List[str], + common_stride: int, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + transformer_dropout: dropout probability in transformer + transformer_nheads: number of heads in transformer + transformer_dim_feedforward: dimension of feedforward network + transformer_enc_layers: number of transformer encoder layers + conv_dim: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. 
+ """ + super().__init__() + self.conv_dim = conv_dim + + transformer_input_shape = { + k: v + for k, v in input_shape.items() if k in transformer_in_features + } + + # this is the input shape of pixel decoder + input_shape = sorted(input_shape.items(), key=lambda x: x[1]['stride']) + self.in_features = [k for k, v in input_shape + ] # starting from "res2" to "res5" + self.feature_strides = [v['stride'] for k, v in input_shape] + self.feature_channels = [v['channels'] for k, v in input_shape] + + # this is the input shape of transformer encoder (could use less features than pixel decoder + transformer_input_shape = sorted( + transformer_input_shape.items(), key=lambda x: x[1]['stride']) + self.transformer_in_features = [k for k, v in transformer_input_shape + ] # starting from "res2" to "res5" + transformer_in_channels = [ + v['channels'] for k, v in transformer_input_shape + ] + self.transformer_feature_strides = [ + v['stride'] for k, v in transformer_input_shape + ] # to decide extra FPN layers + + self.transformer_num_feature_levels = len(self.transformer_in_features) + if self.transformer_num_feature_levels > 1: + input_proj_list = [] + # from low resolution to high resolution (res5 -> res2) + for in_channels in transformer_in_channels[::-1]: + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + )) + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList([ + nn.Sequential( + nn.Conv2d( + transformer_in_channels[-1], conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + ) + ]) + + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + self.transformer = MSDeformAttnTransformerEncoderOnly( + d_model=conv_dim, + dropout=transformer_dropout, + nhead=transformer_nheads, + dim_feedforward=transformer_dim_feedforward, + num_encoder_layers=transformer_enc_layers, + 
num_feature_levels=self.transformer_num_feature_levels, + ) + N_steps = conv_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + self.mask_dim = mask_dim + # use 1x1 conv instead + self.mask_features = Conv2d( + conv_dim, + mask_dim, + kernel_size=1, + stride=1, + padding=0, + ) + + self.maskformer_num_feature_levels = 3 # always use 3 scales + self.common_stride = common_stride + + # extra fpn levels + stride = min(self.transformer_feature_strides) + self.num_fpn_levels = int( + np.log2(stride) - np.log2(self.common_stride)) + + lateral_convs = [] + output_convs = [] + + use_bias = False + for idx, in_channels in enumerate( + self.feature_channels[:self.num_fpn_levels]): + lateral_norm = nn.GroupNorm(32, conv_dim) + output_norm = nn.GroupNorm(32, conv_dim) + + lateral_conv = Conv2d( + in_channels, + conv_dim, + kernel_size=1, + bias=use_bias, + norm=lateral_norm) + output_conv = Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + self.add_module('adapter_{}'.format(idx + 1), lateral_conv) + self.add_module('layer_{}'.format(idx + 1), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. 
+ self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + + @autocast(enabled=False) + def forward_features(self, features): + srcs = [] + pos = [] + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.transformer_in_features[::-1]): + x = features[f].float( + ) # deformable detr does not support half precision + srcs.append(self.input_proj[idx](x)) + pos.append(self.pe_layer(x)) + + y, spatial_shapes, level_start_index = self.transformer( + srcs, None, pos) + bs = y.shape[0] + + split_size_or_sections = [None] * self.transformer_num_feature_levels + for i in range(self.transformer_num_feature_levels): + if i < self.transformer_num_feature_levels - 1: + split_size_or_sections[i] = level_start_index[ + i + 1] - level_start_index[i] + else: + split_size_or_sections[i] = y.shape[1] - level_start_index[i] + y = torch.split(y, split_size_or_sections, dim=1) + + out = [] + multi_scale_features = [] + num_cur_levels = 0 + for i, z in enumerate(y): + out.append( + z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], + spatial_shapes[i][1])) + + # append `out` with extra FPN levels + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): + x = features[f].float() + lateral_conv = self.lateral_convs[idx] + output_conv = self.output_convs[idx] + cur_fpn = lateral_conv(x) + # Following FPN implementation, we use nearest upsampling here + y = cur_fpn + F.interpolate( + out[-1], + size=cur_fpn.shape[-2:], + mode='bilinear', + align_corners=False) + y = output_conv(y) + out.append(y) + + for o in out: + if num_cur_levels < self.maskformer_num_feature_levels: + multi_scale_features.append(o) + num_cur_levels += 1 + + return self.mask_features(out[-1]), out[0], multi_scale_features diff --git a/modelscope/models/cv/image_human_parsing/m2fp_net.py b/modelscope/models/cv/image_human_parsing/m2fp_net.py new file mode 
100644 index 00000000..3f771663 --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/m2fp_net.py @@ -0,0 +1,363 @@
# Part of the implementation is borrowed and modified from M2FP, made publicly available
# under the CC BY-NC 4.0 License at https://github.com/soeaver/M2FP
import os
from typing import Any, Dict

import torch
import torch.nn as nn
import torch.nn.functional as F

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.cv.image_instance_segmentation.maskdino_swin import \
    ImageList
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .backbone import build_resnet_deeplab_backbone
from .m2fp.m2fp_decoder import MultiScaleMaskedTransformerDecoder
from .m2fp.m2fp_encoder import MSDeformAttnPixelDecoder

logger = get_logger()


@MODELS.register_module(Tasks.image_segmentation, module_name=Models.m2fp)
class M2FP(TorchModel):
    # Mask2Former-style human parser: ResNet-DeepLab backbone ->
    # MSDeformAttn pixel decoder -> multi-scale masked transformer decoder.

    def __init__(self,
                 model_dir,
                 backbone=None,
                 encoder=None,
                 decoder=None,
                 pretrained=None,
                 input_single_human=None,
                 classes=None,
                 num_parsing=None,
                 single_human=True,
                 parsing_ins_score_thr=0.5,
                 parsing_on=False,
                 semantic_on=True,
                 sem_seg_postprocess_before_inference=True,
                 **kwargs):
        """
        Deep Learning Technique for Human Parsing: A Survey and Outlook. See https://arxiv.org/abs/2301.00394
        Args:
            backbone (dict): backbone config.
            encoder (dict): encoder config.
            decoder (dict): decoder config.
            pretrained (bool): whether to use pretrained model
            input_single_human (dict): input size config for single human parsing
            classes (list): class names
            num_parsing (int): total number of parsing instances, only for multiple human parsing
            single_human (bool): whether the task is single human parsing
            parsing_ins_score_thr: instance score threshold for multiple human parsing
            parsing_on (bool): whether to parse results, only for multiple human parsing
            semantic_on (bool): whether to output semantic map
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
        """
        super(M2FP, self).__init__(model_dir, **kwargs)

        # ImageNet mean/std, registered as non-persistent buffers
        # (persistent=False): they follow .to(device) but are not saved
        # into checkpoints.
        self.register_buffer(
            'pixel_mean',
            torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1), False)
        self.register_buffer(
            'pixel_std',
            torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1), False)
        # Batched images are padded so H and W are multiples of this value.
        self.size_divisibility = 32

        self.backbone = build_resnet_deeplab_backbone(
            **backbone, input_shape={'channels': 3})
        # Only feed the encoder the backbone stages it asks for.
        in_features = encoder.pop('in_features')
        input_shape = {
            k: v
            for k, v in self.backbone.output_shape().items()
            if k in in_features
        }
        encoder = MSDeformAttnPixelDecoder(input_shape=input_shape, **encoder)
        decoder = MultiScaleMaskedTransformerDecoder(
            in_channels=encoder.conv_dim, **decoder)
        self.sem_seg_head = M2FPHead(
            pixel_decoder=encoder, transformer_predictor=decoder)
        self.num_classes = decoder.num_classes
        self.num_queries = decoder.num_queries
        # Number of (query, class) candidates kept in parsing inference.
        self.test_topk_per_image = 100

        self.input_single_human = input_single_human
        self.classes = classes
        self.num_parsing = num_parsing
        self.single_human = single_human
        self.parsing_ins_score_thr = parsing_ins_score_thr
        self.parsing_on = parsing_on
        self.semantic_on = semantic_on
        # Parsing inference operates on full-resolution masks, so it forces
        # the "resize before inference" path.
        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference or parsing_on

        if not self.semantic_on:
            assert self.sem_seg_postprocess_before_inference

        if pretrained:
            # Load the checkpoint on CPU and drop any tensor whose shape does
            # not match the current model, so load_state_dict(strict=False)
            # cannot fail on size mismatches.
            model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
            logger.info(f'loading model from {model_path}')
            weight = torch.load(model_path, map_location='cpu')['model']
            tgt_weight = self.state_dict()
            for name in list(weight.keys()):
                if name in tgt_weight:
                    load_size = weight[name].size()
                    tgt_size = tgt_weight[name].size()
                    mis_match = False
                    if len(load_size) != len(tgt_size):
                        mis_match = True
                    else:
                        for n1, n2 in zip(load_size, tgt_size):
                            if n1 != n2:
                                mis_match = True
                                break
                    if mis_match:
                        logger.info(
                            f'size mismatch for {name} '
                            f'({load_size} -> {tgt_size}), skip loading.')
                        del weight[name]
                else:
                    logger.info(
                        f'{name} doesn\'t exist in current model, skip loading.'
                    )

            self.load_state_dict(weight, strict=False)
            logger.info('load model done')

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        # Normalize per image, pad into a divisible batch, run
        # backbone + segmentation head. Raw outputs are returned together
        # with the inputs so `postprocess` can undo resizing/padding.
        batched_inputs = input['batched_inputs']
        images = [x['image'].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)

        return dict(
            outputs=outputs, batched_inputs=batched_inputs, images=images)

    def postprocess(self, input: Dict[str, Any]) -> Dict[str, Any]:
        # Turn raw head outputs into per-image results: upsample masks to the
        # padded input size, undo padding/crop back to the original image
        # size, then run semantic and/or parsing inference.
        outputs = input['outputs']
        batched_inputs = input['batched_inputs']
        images = input['images']
        if self.training:
            raise NotImplementedError
        else:
            mask_cls_results = outputs['pred_logits']  # (B, Q, C+1)
            mask_pred_results = outputs['pred_masks']  # (B, Q, H, W)
            # upsample masks
            mask_pred_results = F.interpolate(
                mask_pred_results,
                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
                mode='bilinear',
                align_corners=False,
            )

            del outputs

            processed_results = []
            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
                    mask_cls_results, mask_pred_results, batched_inputs,
                    images.image_sizes):
                # Fall back to the padded size when the caller did not record
                # the original image size.
                height = input_per_image.get('height', image_size[0])
                width = input_per_image.get('width', image_size[1])
                processed_results.append({})  # for each image

                if self.sem_seg_postprocess_before_inference:
                    if not self.single_human:
                        mask_pred_result = self.sem_seg_postprocess(
                            mask_pred_result, image_size, height, width)
                    else:
                        # Single-human inputs were letterboxed; crop_box
                        # locates the content inside the padding.
                        mask_pred_result = self.single_human_sem_seg_postprocess(
                            mask_pred_result, image_size,
                            input_per_image['crop_box'], height, width)
                    mask_cls_result = mask_cls_result.to(mask_pred_result)

                # semantic segmentation inference
                if self.semantic_on:
                    r = self.semantic_inference(mask_cls_result,
                                                mask_pred_result)
                    if not self.sem_seg_postprocess_before_inference:
                        if not self.single_human:
                            r = self.sem_seg_postprocess(
                                r, image_size, height, width)
                        else:
                            r = self.single_human_sem_seg_postprocess(
                                r, image_size, input_per_image['crop_box'],
                                height, width)
                    processed_results[-1]['sem_seg'] = r

                # parsing inference
                if self.parsing_on:
                    parsing_r = self.instance_parsing_inference(
                        mask_cls_result, mask_pred_result)
                    processed_results[-1]['parsing'] = parsing_r

        return dict(eval_result=processed_results)

    @property
    def device(self):
        # Buffers move with the module, so this tracks .to(device) calls.
        return self.pixel_mean.device

    def single_human_sem_seg_postprocess(self, result, img_size, crop_box,
                                         output_height, output_width):
        # Strip batch padding, crop the letterbox padding via crop_box, then
        # resize back to the original image size. expand(1, ...) adds the
        # batch dim required by F.interpolate.
        result = result[:, :img_size[0], :img_size[1]]
        result = result[:, crop_box[1]:crop_box[3],
                        crop_box[0]:crop_box[2]].expand(1, -1, -1, -1)
        result = F.interpolate(
            result,
            size=(output_height, output_width),
            mode='bilinear',
            align_corners=False)[0]
        return result

    def sem_seg_postprocess(self, result, img_size, output_height,
                            output_width):
        # Strip batch padding and resize back to the original image size.
        result = result[:, :img_size[0], :img_size[1]].expand(1, -1, -1, -1)
        result = F.interpolate(
            result,
            size=(output_height, output_width),
            mode='bilinear',
            align_corners=False)[0]
        return result

    def semantic_inference(self, mask_cls, mask_pred):
        # Per-pixel class probabilities: weight each query's mask by its
        # class scores and sum over queries -> (C, H, W).
        mask_cls = F.softmax(
            mask_cls, dim=-1)[..., :-1]  # discard non-sense category
        mask_pred = mask_pred.sigmoid()
        semseg = torch.einsum('qc,qhw->chw', mask_cls, mask_pred)
        return semseg

    def instance_parsing_inference(self, mask_cls, mask_pred):
        # Class probabilities without the trailing "no object" column.
        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
        labels = torch.arange(
            self.num_classes,
            device=self.device).unsqueeze(0).repeat(self.num_queries,
                                                    1).flatten(0, 1)

        # Top-k (query, class) pairs over all queries and classes.
        scores_per_image, topk_indices = scores.flatten(0, 1).topk(
            self.test_topk_per_image, sorted=False)
        labels_per_image = labels[topk_indices]

        # Map flattened (query, class) indices back to query indices.
        topk_indices = topk_indices // self.num_classes
        mask_pred = mask_pred[topk_indices]

        # Rescore each candidate by its mean foreground mask probability.
        binary_pred_masks = (mask_pred > 0).float()
        mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * binary_pred_masks.flatten(1)).sum(1) / \
            (binary_pred_masks.flatten(1).sum(1) + 1e-6)

        pred_scores = scores_per_image * mask_scores_per_image
        pred_labels = labels_per_image
        pred_masks = mask_pred

        # prepare outputs
        part_instance_res = []
        human_instance_res = []

        # bkg and part instances: label == num_parsing denotes a whole-human
        # instance, everything else is background (0) or a body part.
        bkg_part_index = torch.where(pred_labels != self.num_parsing)[0]
        bkg_part_labels = pred_labels[bkg_part_index]
        bkg_part_scores = pred_scores[bkg_part_index]
        bkg_part_masks = pred_masks[bkg_part_index, :, :]

        # human instances
        human_index = torch.where(pred_labels == self.num_parsing)[0]
        human_labels = pred_labels[human_index]
        human_scores = pred_scores[human_index]
        human_masks = pred_masks[human_index, :, :]

        semantic_res = self.paste_instance_to_semseg_probs(
            bkg_part_labels, bkg_part_scores, bkg_part_masks)

        # part instances (label 0 is background)
        part_index = torch.where(bkg_part_labels != 0)[0]
        part_labels = bkg_part_labels[part_index]
        part_scores = bkg_part_scores[part_index]
        part_masks = bkg_part_masks[part_index, :, :]

        # part instance results; 0.1 is a hard-coded minimum-score cutoff
        for idx in range(part_labels.shape[0]):
            if part_scores[idx] < 0.1:
                continue
            part_instance_res.append({
                'category_id':
                part_labels[idx].cpu().tolist(),
                'score':
                part_scores[idx].cpu().tolist(),
                'mask':
                part_masks[idx],
            })

        # human instance results (same 0.1 cutoff, phrased positively)
        for human_idx in range(human_scores.shape[0]):
            if human_scores[human_idx] > 0.1:
                human_instance_res.append({
                    'category_id':
                    human_labels[human_idx].cpu().tolist(),
                    'score':
                    human_scores[human_idx].cpu().tolist(),
                    'mask':
                    human_masks[human_idx],
                })

        return {
            'semantic_outputs': semantic_res,
            'part_outputs': part_instance_res,
            'human_outputs': human_instance_res,
        }

    def paste_instance_to_semseg_probs(self, labels, scores, mask_probs):
        # Build a (num_parsing, H, W) semantic probability map by pasting the
        # instance masks of each category onto a shared canvas.
        im_h, im_w = mask_probs.shape[-2:]
        semseg_im = []
        for cls_ind in range(self.num_parsing):
            cate_inds = torch.where(labels == cls_ind)[0]
            cate_scores = scores[cate_inds]
            cate_mask_probs = mask_probs[cate_inds, :, :].sigmoid()
            semseg_im.append(
                self.paste_category_probs(cate_scores, cate_mask_probs, im_h,
                                          im_w))

        return torch.stack(semseg_im, dim=0)

    def paste_category_probs(self, scores, mask_probs, h, w):
        # Sum score-weighted instance probabilities where they exceed 0.5,
        # counting how many instances touched each pixel, then average so
        # overlapping instances do not inflate the probability.
        category_probs = torch.zeros((h, w),
                                     dtype=torch.float32,
                                     device=mask_probs.device)
        paste_times = torch.zeros((h, w),
                                  dtype=torch.float32,
                                  device=mask_probs.device)

        # Ascending score order: higher-scoring instances are pasted last.
        index = scores.argsort()
        for k in range(len(index)):
            if scores[index[k]] < self.parsing_ins_score_thr:
                continue
            ins_mask_probs = mask_probs[index[k], :, :] * scores[index[k]]
            category_probs = torch.where(ins_mask_probs > 0.5,
                                         ins_mask_probs + category_probs,
                                         category_probs)
            paste_times += torch.where(ins_mask_probs > 0.5, 1, 0)

        # Avoid division by zero on pixels no instance covered.
        paste_times = torch.where(paste_times == 0, paste_times + 1,
                                  paste_times)
        category_probs /= paste_times

        return category_probs


class M2FPHead(nn.Module):
    # Thin wrapper chaining the pixel decoder and the transformer predictor.

    def __init__(self, pixel_decoder: nn.Module,
                 transformer_predictor: nn.Module):
        super().__init__()
        self.pixel_decoder = pixel_decoder
        self.predictor = transformer_predictor

    def forward(self, features, mask=None):
        return self.layers(features, mask)

    def layers(self, features, mask=None):
        mask_features, transformer_encoder_features, multi_scale_features = self.pixel_decoder.forward_features(
            features)
        predictions = self.predictor(multi_scale_features, mask_features, mask)
        return predictions
diff --git a/modelscope/models/cv/image_human_parsing/parsing_utils.py b/modelscope/models/cv/image_human_parsing/parsing_utils.py new file mode 100644 index 00000000..a1c20072 --- /dev/null +++ b/modelscope/models/cv/image_human_parsing/parsing_utils.py @@ -0,0 +1,156 @@
# Part of the implementation is borrowed and modified from M2FP, made publicly available
# under the CC BY-NC 4.0 License at https://github.com/soeaver/M2FP
# Part of the implementation is borrowed and modified from Detectron2, made publicly available
# under the Apache-2.0 License at https://github.com/facebookresearch/detectron2

import copy

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image


def center_to_target_size_test(img, target_size):
    # Resize (aspect-ratio preserving) so the image fits inside target_size,
    # then center-pad to exactly target_size. Returns the transformed image
    # and the box [x0, y0, x1, y1] the resized content occupies.
    # NOTE: target_size is (width, height) while img is an HWC array.
    src_h, src_w = img.shape[0], img.shape[1]
    trg_h, trg_w = target_size[1], target_size[0]

    new_h, new_w = 0, 0
    tfm_list = []
    if src_h > trg_h and src_w > trg_w:
        # Both sides too large: shrink by the dominant side, then re-check
        # that the other side still fits.
        # NOTE(review): a square oversized image (src_h == src_w) matches
        # neither sub-branch, leaving new_h == new_w == 0 — looks like a
        # bug; confirm intended inputs are never square.
        if src_h > src_w:
            new_h = trg_h
            new_w = int(new_h * src_w / src_h)
            if new_w > trg_w:
                new_w = trg_w
                new_h = int(new_w * src_h / src_w)
        elif src_w > src_h:
            new_w = trg_w
            new_h = int(new_w * src_h / src_w)
            if new_h > trg_h:
                new_h = trg_h
                new_w = int(new_h * src_w / src_h)
        tfm_list.append(ResizeTransform(src_h, src_w, new_h, new_w))
        tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w))

    elif src_h > trg_h and src_w <= trg_w:
        # Only the height exceeds the target: shrink to target height.
        new_h = trg_h
        new_w = int(new_h * src_w / src_h)
        tfm_list.append(ResizeTransform(src_h, src_w, new_h, new_w))
        tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w))

    elif src_h <= trg_h and src_w > trg_w:
        # Only the width exceeds the target: shrink to target width.
        new_w = trg_w
        new_h = int(new_w * src_h / src_w)
tfm_list.append(ResizeTransform(src_h, src_w, new_h, new_w)) + tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w)) + + else: + new_h, new_w = src_h, src_w + tfm_list.append(PadTransform(new_h, new_w, trg_h, trg_w)) + + box = get_box(new_h, new_w, trg_h, trg_w) + + new_img = copy.deepcopy(img) + for tfm in tfm_list: + new_img = tfm.apply_image(new_img) + + return new_img, box + + +def get_box(src_h, src_w, trg_h, trg_w): + assert src_h <= trg_h, 'expect src_h <= trg_h' + assert src_w <= trg_w, 'expect src_w <= trg_w' + + x0 = int((trg_w - src_w) / 2) + x1 = src_w + x0 + y0 = int((trg_h - src_h) / 2) + y1 = src_h + y0 + + box = [x0, y0, x1, y1] + return box + + +class PadTransform: + + def __init__(self, src_h, src_w, trg_h, trg_w): + super().__init__() + assert src_h <= trg_h, 'expect src_h <= trg_h' + assert src_w <= trg_w, 'expect src_w <= trg_w' + + self.src_h, self.src_w = src_h, src_w + self.trg_h, self.trg_w = trg_h, trg_w + self.pad_left = int((trg_w - src_w) / 2) + self.pad_right = trg_w - src_w - self.pad_left + self.pad_top = int((trg_h - src_h) / 2) + self.pad_bottom = trg_h - src_h - self.pad_top + + def apply_image(self, img, pad_value=128): + if self.pad_left == 0 and self.pad_top == 0: + return img + + if len(img.shape) == 2: + return np.pad( + img, ((self.pad_top, self.pad_bottom), + (self.pad_left, self.pad_right)), + 'constant', + constant_values=((pad_value, pad_value), (pad_value, + pad_value))) + elif len(img.shape) == 3: + return np.pad( + img, ((self.pad_top, self.pad_bottom), + (self.pad_left, self.pad_right), (0, 0)), + 'constant', + constant_values=((pad_value, pad_value), + (pad_value, pad_value), (pad_value, + pad_value))) + + +class ResizeTransform: + + def __init__(self, h, w, new_h, new_w, interp=None): + super().__init__() + if interp is None: + interp = Image.BILINEAR + self.h, self.w = h, w + self.new_h, self.new_w = new_h, new_w + self.interp = interp + + def apply_image(self, img, interp=None): + assert img.shape[:2] == 
(self.h, self.w) + assert len(img.shape) <= 4 + interp_method = interp if interp is not None else self.interp + + if img.dtype == np.uint8: + if len(img.shape) > 2 and img.shape[2] == 1: + pil_image = Image.fromarray(img[:, :, 0], mode='L') + else: + pil_image = Image.fromarray(img) + pil_image = pil_image.resize((self.new_w, self.new_h), + interp_method) + ret = np.asarray(pil_image) + if len(img.shape) > 2 and img.shape[2] == 1: + ret = np.expand_dims(ret, -1) + else: + # PIL only supports uint8 + if any(x < 0 for x in img.strides): + img = np.ascontiguousarray(img) + img = torch.from_numpy(img) + shape = list(img.shape) + shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:] + img = img.view(shape_4d).permute(2, 3, 0, 1) # hw(c) -> nchw + _PIL_RESIZE_TO_INTERPOLATE_MODE = { + Image.NEAREST: 'nearest', + Image.BILINEAR: 'bilinear', + Image.BICUBIC: 'bicubic', + } + mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[interp_method] + align_corners = None if mode == 'nearest' else False + img = F.interpolate( + img, (self.new_h, self.new_w), + mode=mode, + align_corners=align_corners) + shape[:2] = (self.new_h, self.new_w) + ret = img.permute(2, 3, 0, 1).view(shape).numpy() # nchw -> hw(c) + + return ret diff --git a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py index ff83271e..375a5e45 100644 --- a/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py +++ b/modelscope/models/cv/image_instance_segmentation/cascade_mask_rcnn_swin.py @@ -89,7 +89,7 @@ class CascadeMaskRCNNSwin(nn.Module): model_path = os.path.join(kwargs['model_dir'], ModelFile.TORCH_MODEL_FILE) logger.info(f'loading model from {model_path}') - weight = torch.load(model_path)['state_dict'] + weight = torch.load(model_path, map_location='cpu')['state_dict'] tgt_weight = self.state_dict() for name in list(weight.keys()): if name in tgt_weight: diff --git 
a/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py b/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py index 5b60eb40..8c2aa7d2 100644 --- a/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py +++ b/modelscope/models/cv/image_instance_segmentation/maskdino_swin.py @@ -61,7 +61,7 @@ class MaskDINOSwin(nn.Module): model_path = os.path.join(kwargs['model_dir'], ModelFile.TORCH_MODEL_FILE) logger.info(f'loading model from {model_path}') - weight = torch.load(model_path)['model'] + weight = torch.load(model_path, map_location='cpu')['model'] tgt_weight = self.state_dict() for name in list(weight.keys()): if name in tgt_weight: diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 6530b5be..c94f6527 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -105,6 +105,7 @@ if TYPE_CHECKING: from .image_quality_assessment_mos_pipeline import ImageQualityAssessmentMosPipeline from .bad_image_detecting_pipeline import BadImageDetecingPipeline from .mobile_image_super_resolution_pipeline import MobileImageSuperResolutionPipeline + from .image_human_parsing_pipeline import ImageHumanParsingPipeline from .nerf_recon_acc_pipeline import NeRFReconAccPipeline else: @@ -257,6 +258,7 @@ else: 'MobileImageSuperResolutionPipeline' ], 'bad_image_detecting_pipeline': ['BadImageDetecingPipeline'], + 'image_human_parsing_pipeline': ['ImageHumanParsingPipeline'], 'nerf_recon_acc_pipeline': ['NeRFReconAccPipeline'], } diff --git a/modelscope/pipelines/cv/image_human_parsing_pipeline.py b/modelscope/pipelines/cv/image_human_parsing_pipeline.py new file mode 100644 index 00000000..01b29d81 --- /dev/null +++ b/modelscope/pipelines/cv/image_human_parsing_pipeline.py @@ -0,0 +1,126 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import Any, Dict, Optional, Union

import numpy as np
import torch
import torchvision.transforms as T

from modelscope.metainfo import Pipelines
from modelscope.models.cv.image_human_parsing import (
    M2FP, center_to_target_size_test)
from modelscope.outputs import OutputKeys
from modelscope.pipelines.base import Input, Pipeline
from modelscope.pipelines.builder import PIPELINES
from modelscope.preprocessors import LoadImage
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()


@PIPELINES.register_module(
    Tasks.image_segmentation, module_name=Pipelines.image_human_parsing)
class ImageHumanParsingPipeline(Pipeline):

    def __init__(self,
                 model: Union[M2FP, str],
                 preprocessor: Optional[Any] = None,
                 **kwargs):
        """use `model` and `preprocessor` to create an image human parsing
        pipeline for prediction

        Args:
            model (M2FPModel | str): a model instance
            preprocessor (None): a preprocessor instance
        """
        super().__init__(model=model, preprocessor=preprocessor, **kwargs)
        # Inference-only pipeline: put the model in eval mode.
        self.model.eval()

    def _get_preprocess_shape(self, oldh: int, oldw: int, short_edge_length,
                              max_size):
        # Shortest-edge resize: scale so the short side equals
        # short_edge_length, shrinking further if the long side would exceed
        # max_size. Returns (new_h, new_w) rounded half-up to ints.
        h, w = oldh, oldw
        size = short_edge_length * 1.0
        scale = size / min(h, w)
        if h < w:
            newh, neww = size, scale * w
        else:
            newh, neww = scale * h, size
        if max(newh, neww) > max_size:
            scale = max_size * 1.0 / max(newh, neww)
            newh = newh * scale
            neww = neww * scale
        neww = int(neww + 0.5)
        newh = int(newh + 0.5)
        return (newh, neww)

    def preprocess(self,
                   input: Input,
                   min_size=640,
                   max_size=1333) -> Dict[str, Any]:
        # Build the single-image batch the model expects: a dict carrying the
        # original 'width'/'height' and a CHW float tensor in 0-255 range.
        image = LoadImage.convert_to_img(input)
        w, h = image.size[:2]  # PIL size is (width, height)
        dataset_dict = {'width': w, 'height': h}
        if self.model.single_human:
            # Single-human model: letterbox to the fixed configured input
            # size; keep crop_box so postprocessing can undo the padding.
            image = np.asarray(image)
            image, crop_box = center_to_target_size_test(
                image, self.model.input_single_human['sizes'][0])
            dataset_dict['image'] = torch.as_tensor(
                np.ascontiguousarray(image.transpose(2, 0, 1)))  # HWC -> CHW
            dataset_dict['crop_box'] = crop_box
        else:
            # Multi-human model: shortest-edge resize.
            new_h, new_w = self._get_preprocess_shape(h, w, min_size, max_size)
            test_transforms = T.Compose([
                T.Resize((new_h, new_w)),
                T.ToTensor(),
            ])
            image = test_transforms(image)
            # ToTensor scales to [0, 1]; the model normalizes with 0-255
            # mean/std, so scale back up.
            dataset_dict['image'] = image * 255.
        result = {'batched_inputs': [dataset_dict]}
        return result

    def forward(self, input: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        # Inference only — no gradients needed.
        with torch.no_grad():
            output = self.model(input)
        return output

    def postprocess(self,
                    inputs: Dict[str, Any],
                    score_thr=0.0) -> Dict[str, Any]:
        # Convert the model's eval_result into the standard segmentation
        # output dict of parallel MASKS / LABELS / SCORES lists.
        predictions = inputs['eval_result'][0]
        class_names = self.model.classes
        results_dict = {
            OutputKeys.MASKS: [],
            OutputKeys.LABELS: [],
            OutputKeys.SCORES: []
        }
        if 'sem_seg' in predictions:
            # Semantic path: one binary mask per class present in the argmax
            # map; score is the mean probability over the mask (+1 in the
            # denominator guards against empty masks).
            semantic_pred = predictions['sem_seg']
            semantic_seg = semantic_pred.argmax(dim=0).detach().cpu().numpy()
            semantic_pred = semantic_pred.sigmoid().detach().cpu().numpy()
            class_ids = np.unique(semantic_seg)
            for class_id in class_ids:
                label = class_names[class_id]
                mask = np.array(semantic_seg == class_id, dtype=np.float64)
                score = (mask * semantic_pred[class_id]).sum() / (
                    mask.sum() + 1)
                results_dict[OutputKeys.SCORES].append(score)
                results_dict[OutputKeys.LABELS].append(label)
                results_dict[OutputKeys.MASKS].append(mask)
        elif 'parsing' in predictions:
            # Instance path: part and whole-human instances above score_thr.
            parsing_res = predictions['parsing']
            part_outputs = parsing_res['part_outputs']
            human_outputs = parsing_res['human_outputs']

            # process semantic_outputs
            for output in part_outputs + human_outputs:
                score = output['score']
                label = class_names[output['category_id']]
                mask = (output['mask'] > 0).float().detach().cpu().numpy()
                if score > score_thr:
                    results_dict[OutputKeys.SCORES].append(score)
                    results_dict[OutputKeys.LABELS].append(label)
                    results_dict[OutputKeys.MASKS].append(mask)
        else:
            raise NotImplementedError

        return results_dict
diff --git a/tests/pipelines/test_image_human_parsing.py
b/tests/pipelines/test_image_human_parsing.py new file mode 100644 index 00000000..77d75862 --- /dev/null +++ b/tests/pipelines/test_image_human_parsing.py @@ -0,0 +1,48 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import unittest + +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class ImageHumanParsingTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.task = Tasks.image_segmentation + self.model_id_single = 'damo/cv_resnet101_image-single-human-parsing' + self.model_id_multiple = 'damo/cv_resnet101_image-multiple-human-parsing' + + image_single = 'data/test/images/image_single_human_parsing.jpg' + image_multiple = 'data/test/images/image_multiple_human_parsing.jpg' + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_model_name(self): + pipeline_parsing = pipeline( + task=Tasks.image_segmentation, model=self.model_id_single) + print(pipeline_parsing(input=self.image_single)[OutputKeys.LABELS]) + pipeline_parsing = pipeline( + task=Tasks.image_segmentation, model=self.model_id_multiple) + print(pipeline_parsing(input=self.image_multiple)[OutputKeys.LABELS]) + + @unittest.skipUnless(test_level() >= 1, 'skip test in current test level') + def test_run_with_model_from_modelhub(self): + model = Model.from_pretrained(self.model_id_single) + pipeline_parsing = pipeline( + task=Tasks.image_segmentation, model=model, preprocessor=None) + print(pipeline_parsing(input=self.image_single)[OutputKeys.LABELS]) + model = Model.from_pretrained(self.model_id_multiple) + pipeline_parsing = pipeline( + task=Tasks.image_segmentation, model=model, preprocessor=None) + print(pipeline_parsing(input=self.image_multiple)[OutputKeys.LABELS]) + + @unittest.skip('demo 
compatibility test is only enabled on a needed-basis') + def test_demo_compatibility(self): + self.compatibility_check() + + +if __name__ == '__main__': + unittest.main()