From 588e41c7878df1fb7e6ec48bf667683f2afa0aed Mon Sep 17 00:00:00 2001 From: "mulin.lyh" Date: Mon, 22 Jan 2024 15:52:30 +0800 Subject: [PATCH 1/2] fix lint issue --- modelscope/metainfo.py | 9 +- modelscope/models/cv/__init__.py | 11 +- .../image_local_feature_matching/__init__.py | 2 +- .../loftr_model.py | 33 +- .../src/loftr/backbone/__init__.py | 3 +- .../src/loftr/backbone/resnet_fpn.py | 54 +++- .../src/loftr/loftr.py | 42 ++- .../src/loftr/loftr_module/__init__.py | 2 +- .../src/loftr/loftr_module/fine_preprocess.py | 44 ++- .../loftr/loftr_module/linear_attention.py | 19 +- .../src/loftr/loftr_module/transformer.py | 40 ++- .../src/loftr/utils/coarse_matching.py | 37 ++- .../src/loftr/utils/fine_matching.py | 46 +-- .../src/loftr/utils/geometry.py | 29 +- .../src/loftr/utils/position_encoding.py | 8 +- .../src/loftr/utils/supervision.py | 59 ++-- .../src/utils/plotting.py | 81 +++-- .../cv/image_matching_fast/config/__init__.py | 2 +- .../cv/image_matching_fast/config/default.py | 26 +- .../image_matching_fast/lightglue/aliked.py | 290 +++++++++--------- .../cv/image_matching_fast/lightglue/disk.py | 28 +- .../lightglue/lightglue.py | 283 +++++++++-------- .../cv/image_matching_fast/lightglue/sift.py | 135 ++++---- .../lightglue/superpoint.py | 74 ++--- .../cv/image_matching_fast/lightglue/utils.py | 53 ++-- .../cv/image_matching_fast/lightglue/viz2d.py | 61 ++-- .../cv/image_matching_fast/lightglue_model.py | 54 ++-- modelscope/pipelines/cv/__init__.py | 4 +- .../image_local_feature_matching_pipeline.py | 7 +- .../cv/image_matching_fast_pipeline.py | 5 +- .../test_image_local_feature_matching.py | 5 +- tests/pipelines/test_image_matching_fast.py | 2 +- 32 files changed, 868 insertions(+), 680 deletions(-) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index d3ccffd1..e723e990 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -808,7 +808,8 @@ DEFAULT_MODEL_FOR_PIPELINE = { (Pipelines.panorama_depth_estimation, 'damo/cv_unifuse_panorama-depth-estimation'), Tasks.image_local_feature_matching: - (Pipelines.image_local_feature_matching, 'Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data'), + (Pipelines.image_local_feature_matching, + 'Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data'), Tasks.image_style_transfer: (Pipelines.image_style_transfer, 'damo/cv_aams_style-transfer_damo'), Tasks.face_image_generation: (Pipelines.face_image_generation, @@ -832,9 +833,9 @@ DEFAULT_MODEL_FOR_PIPELINE = { Tasks.image_object_detection: (Pipelines.image_object_detection_auto, 'damo/cv_yolox_image-object-detection-auto'), - Tasks.ocr_recognition: - (Pipelines.ocr_recognition, - 'damo/cv_convnextTiny_ocr-recognition-general_damo'), + Tasks.ocr_recognition: ( + Pipelines.ocr_recognition, + 'damo/cv_convnextTiny_ocr-recognition-general_damo'), Tasks.skin_retouching: (Pipelines.skin_retouching, 'damo/cv_unet_skin-retouching'), Tasks.faq_question_answering: ( diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 39f46f5d..a271e37d 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -8,10 +8,11 @@ from . import (action_recognition, animal_recognition, bad_image_detecting, face_reconstruction, human3d_animation, human_reconstruction, image_classification, image_color_enhance, image_colorization, image_defrcn_fewshot, image_denoise, image_editing, - image_inpainting, image_instance_segmentation, image_matching, - image_mvs_depth_estimation, image_panoptic_segmentation, - image_portrait_enhancement, image_probing_model, - image_quality_assessment_degradation, + image_inpainting, image_instance_segmentation, + image_local_feature_matching, image_matching, + image_matching_fast, image_mvs_depth_estimation, + image_panoptic_segmentation, image_portrait_enhancement, + image_probing_model, image_quality_assessment_degradation, image_quality_assessment_man, image_quality_assessment_mos, image_reid_person, image_restoration, image_semantic_segmentation, image_super_resolution_pasd, @@ -29,6 +30,6 @@ from . import (action_recognition, animal_recognition, bad_image_detecting, video_panoptic_segmentation, video_single_object_tracking, video_stabilization, video_summarization, video_super_resolution, vidt, virual_tryon, vision_middleware, - vop_retrieval, image_local_feature_matching,image_matching_fast) + vop_retrieval) # yapf: enable diff --git a/modelscope/models/cv/image_local_feature_matching/__init__.py b/modelscope/models/cv/image_local_feature_matching/__init__.py index 256843b8..eecc611e 100644 --- a/modelscope/models/cv/image_local_feature_matching/__init__.py +++ b/modelscope/models/cv/image_local_feature_matching/__init__.py @@ -19,4 +19,4 @@ else: _import_structure, module_spec=__spec__, extra_objects={}, - ) \ No newline at end of file + ) diff --git a/modelscope/models/cv/image_local_feature_matching/loftr_model.py b/modelscope/models/cv/image_local_feature_matching/loftr_model.py index 157dfa28..d47b9da2 100644 --- a/modelscope/models/cv/image_local_feature_matching/loftr_model.py +++ b/modelscope/models/cv/image_local_feature_matching/loftr_model.py @@ -1,21 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -import os.path as osp - import io -import cv2 -import torch -import numpy as np +import os.path as osp from copy import deepcopy +import cv2 +import matplotlib.cm as cm +import numpy as np +import torch + from modelscope.metainfo import Models from modelscope.models.base.base_torch_model import TorchModel from modelscope.models.builder import MODELS -from modelscope.models.cv.image_local_feature_matching.src.loftr import \ - LoFTR, default_cfg -from modelscope.models.cv.image_local_feature_matching.src.utils.plotting import make_matching_figure +from modelscope.models.cv.image_local_feature_matching.src.loftr import ( + LoFTR, default_cfg) +from modelscope.models.cv.image_local_feature_matching.src.utils.plotting import \ + make_matching_figure from modelscope.outputs import OutputKeys from modelscope.utils.constant import ModelFile, Tasks -import matplotlib.cm as cm @MODELS.register_module( @@ -51,15 +52,19 @@ class LocalFeatureMatching(TorchModel): def postprocess(self, Inputs): # Draw color = cm.jet(Inputs['conf'].cpu().numpy()) - img0, img1, mkpts0, mkpts1 = Inputs["image0"].squeeze().cpu().numpy(), Inputs["image1"].squeeze().cpu().numpy(), Inputs["kpts0"].cpu().numpy(), Inputs["kpts1"].cpu().numpy() + img0, img1, mkpts0, mkpts1 = Inputs['image0'].squeeze().cpu().numpy( + ), Inputs['image1'].squeeze().cpu().numpy(), Inputs['kpts0'].cpu( + ).numpy(), Inputs['kpts1'].cpu().numpy() text = [ 'LoFTR', 'Matches: {}'.format(len(Inputs['kpts0'])), ] - img0, img1 = (img0 * 255).astype(np.uint8), (img1 * 255).astype(np.uint8) - fig = make_matching_figure(img0, img1, mkpts0, mkpts1, color, text=text) + img0, img1 = (img0 * 255).astype(np.uint8), (img1 * 255).astype( + np.uint8) + fig = make_matching_figure( + img0, img1, mkpts0, mkpts1, color, text=text) io_buf = io.BytesIO() - fig.savefig(io_buf, format="png", dpi=75) + fig.savefig(io_buf, format='png', dpi=75) io_buf.seek(0) buf_data = np.frombuffer(io_buf.getvalue(), dtype=np.uint8) io_buf.close() @@ -71,4 +76,4 @@ class LocalFeatureMatching(TorchModel): def inference(self, data): results = self.forward(data) - return results \ No newline at end of file + return results diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py index b6e731b3..af4f526d 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py @@ -8,4 +8,5 @@ def build_backbone(config): elif config['resolution'] == (16, 4): return ResNetFPN_16_4(config['resnetfpn']) else: - raise ValueError(f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.") + raise ValueError( + f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.") diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py index 985e5b3f..ea7583d1 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py @@ -4,15 +4,28 @@ import torch.nn.functional as F def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution without padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False) + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + padding=0, + bias=False) def conv3x3(in_planes, out_planes, stride=1): """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) class BasicBlock(nn.Module): + def __init__(self, in_planes, planes, stride=1): super().__init__() self.conv1 = conv3x3(in_planes, planes, stride) @@ -26,8 +39,7 @@ class BasicBlock(nn.Module): else: self.downsample = nn.Sequential( conv1x1(in_planes, planes, stride=stride), - nn.BatchNorm2d(planes) - ) + nn.BatchNorm2d(planes)) def forward(self, x): y = x @@ -37,7 +49,7 @@ class BasicBlock(nn.Module): if self.downsample is not None: x = self.downsample(x) - return self.relu(x+y) + return self.relu(x + y) class ResNetFPN_8_2(nn.Module): @@ -57,7 +69,8 @@ class ResNetFPN_8_2(nn.Module): self.in_planes = initial_dim # Networks - self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(initial_dim) self.relu = nn.ReLU(inplace=True) @@ -84,7 +97,8 @@ class ResNetFPN_8_2(nn.Module): for m in self.modules(): if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) @@ -107,13 +121,15 @@ class ResNetFPN_8_2(nn.Module): # FPN x3_out = self.layer3_outconv(x3) - x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) x2_out = self.layer2_outconv(x2) - x2_out = self.layer2_outconv2(x2_out+x3_out_2x) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) - x2_out_2x = F.interpolate(x2_out, scale_factor=2., mode='bilinear', align_corners=True) + x2_out_2x = F.interpolate( + x2_out, scale_factor=2., mode='bilinear', align_corners=True) x1_out = self.layer1_outconv(x1) - x1_out = self.layer1_outconv2(x1_out+x2_out_2x) + x1_out = self.layer1_outconv2(x1_out + x2_out_2x) return [x3_out, x1_out] @@ -135,7 +151,8 @@ class ResNetFPN_16_4(nn.Module): self.in_planes = initial_dim # Networks - self.conv1 = nn.Conv2d(1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(initial_dim) self.relu = nn.ReLU(inplace=True) @@ -164,7 +181,8 @@ class ResNetFPN_16_4(nn.Module): for m in self.modules(): if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) @@ -188,12 +206,14 @@ class ResNetFPN_16_4(nn.Module): # FPN x4_out = self.layer4_outconv(x4) - x4_out_2x = F.interpolate(x4_out, scale_factor=2., mode='bilinear', align_corners=True) + x4_out_2x = F.interpolate( + x4_out, scale_factor=2., mode='bilinear', align_corners=True) x3_out = self.layer3_outconv(x3) - x3_out = self.layer3_outconv2(x3_out+x4_out_2x) + x3_out = self.layer3_outconv2(x3_out + x4_out_2x) - x3_out_2x = F.interpolate(x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) x2_out = self.layer2_outconv(x2) - x2_out = self.layer2_outconv2(x2_out+x3_out_2x) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) return [x4_out, x2_out] diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py index 79c491ee..34cac887 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py @@ -3,13 +3,14 @@ import torch.nn as nn from einops.einops import rearrange from .backbone import build_backbone -from .utils.position_encoding import PositionEncodingSine -from .loftr_module import LocalFeatureTransformer, FinePreprocess +from .loftr_module import FinePreprocess, LocalFeatureTransformer from .utils.coarse_matching import CoarseMatching from .utils.fine_matching import FineMatching +from .utils.position_encoding import PositionEncodingSine class LoFTR(nn.Module): + def __init__(self, config): super().__init__() # Misc @@ -23,11 +24,11 @@ class LoFTR(nn.Module): self.loftr_coarse = LocalFeatureTransformer(config['coarse']) self.coarse_matching = CoarseMatching(config['match_coarse']) self.fine_preprocess = FinePreprocess(config) - self.loftr_fine = LocalFeatureTransformer(config["fine"]) + self.loftr_fine = LocalFeatureTransformer(config['fine']) self.fine_matching = FineMatching() def forward(self, data): - """ + """ Update: data (dict): { 'image0': (torch.Tensor): (N, 1, H, W) @@ -39,18 +40,24 @@ class LoFTR(nn.Module): # 1. Local Feature CNN data.update({ 'bs': data['image0'].size(0), - 'hw0_i': data['image0'].shape[2:], 'hw1_i': data['image1'].shape[2:] + 'hw0_i': data['image0'].shape[2:], + 'hw1_i': data['image1'].shape[2:] }) if data['hw0_i'] == data['hw1_i']: # faster & better BN convergence - feats_c, feats_f = self.backbone(torch.cat([data['image0'], data['image1']], dim=0)) - (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split(data['bs']), feats_f.split(data['bs']) + feats_c, feats_f = self.backbone( + torch.cat([data['image0'], data['image1']], dim=0)) + (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split( + data['bs']), feats_f.split(data['bs']) else: # handle different input shapes - (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone(data['image0']), self.backbone(data['image1']) + (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone( + data['image0']), self.backbone(data['image1']) data.update({ - 'hw0_c': feat_c0.shape[2:], 'hw1_c': feat_c1.shape[2:], - 'hw0_f': feat_f0.shape[2:], 'hw1_f': feat_f1.shape[2:] + 'hw0_c': feat_c0.shape[2:], + 'hw1_c': feat_c1.shape[2:], + 'hw0_f': feat_f0.shape[2:], + 'hw1_f': feat_f1.shape[2:] }) # 2. coarse-level loftr module @@ -60,16 +67,21 @@ class LoFTR(nn.Module): mask_c0 = mask_c1 = None # mask is useful in training if 'mask0' in data: - mask_c0, mask_c1 = data['mask0'].flatten(-2), data['mask1'].flatten(-2) - feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, mask_c1) + mask_c0, mask_c1 = data['mask0'].flatten( + -2), data['mask1'].flatten(-2) + feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, + mask_c1) # 3. match coarse-level - self.coarse_matching(feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1) + self.coarse_matching( + feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1) # 4. fine-level refinement - feat_f0_unfold, feat_f1_unfold = self.fine_preprocess(feat_f0, feat_f1, feat_c0, feat_c1, data) + feat_f0_unfold, feat_f1_unfold = self.fine_preprocess( + feat_f0, feat_f1, feat_c0, feat_c1, data) if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted - feat_f0_unfold, feat_f1_unfold = self.loftr_fine(feat_f0_unfold, feat_f1_unfold) + feat_f0_unfold, feat_f1_unfold = self.loftr_fine( + feat_f0_unfold, feat_f1_unfold) # 5. match fine-level self.fine_matching(feat_f0_unfold, feat_f1_unfold, data) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py index ca51db4f..8d83af7e 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py @@ -1,2 +1,2 @@ -from .transformer import LocalFeatureTransformer from .fine_preprocess import FinePreprocess +from .transformer import LocalFeatureTransformer diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py index 5bb8eefd..8624eab5 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py @@ -5,6 +5,7 @@ from einops.einops import rearrange, repeat class FinePreprocess(nn.Module): + def __init__(self, config): super().__init__() @@ -17,14 +18,14 @@ class FinePreprocess(nn.Module): self.d_model_f = d_model_f if self.cat_c_feat: self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True) - self.merge_feat = nn.Linear(2*d_model_f, d_model_f, bias=True) + self.merge_feat = nn.Linear(2 * d_model_f, d_model_f, bias=True) self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: - nn.init.kaiming_normal_(p, mode="fan_out", nonlinearity="relu") + nn.init.kaiming_normal_(p, mode='fan_out', nonlinearity='relu') def forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data): W = self.W @@ -32,28 +33,41 @@ class FinePreprocess(nn.Module): data.update({'W': W}) if data['b_ids'].shape[0] == 0: - feat0 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device) - feat1 = torch.empty(0, self.W**2, self.d_model_f, device=feat_f0.device) + feat0 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) + feat1 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) return feat0, feat1 # 1. unfold(crop) all local windows - feat_f0_unfold = F.unfold(feat_f0, kernel_size=(W, W), stride=stride, padding=W//2) - feat_f0_unfold = rearrange(feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2) - feat_f1_unfold = F.unfold(feat_f1, kernel_size=(W, W), stride=stride, padding=W//2) - feat_f1_unfold = rearrange(feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + feat_f0_unfold = F.unfold( + feat_f0, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f0_unfold = rearrange( + feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + feat_f1_unfold = F.unfold( + feat_f1, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f1_unfold = rearrange( + feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2) # 2. select only the predicted matches - feat_f0_unfold = feat_f0_unfold[data['b_ids'], data['i_ids']] # [n, ww, cf] + feat_f0_unfold = feat_f0_unfold[data['b_ids'], + data['i_ids']] # [n, ww, cf] feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']] # option: use coarse-level loftr feature as context: concat and linear if self.cat_c_feat: - feat_c_win = self.down_proj(torch.cat([feat_c0[data['b_ids'], data['i_ids']], - feat_c1[data['b_ids'], data['j_ids']]], 0)) # [2n, c] - feat_cf_win = self.merge_feat(torch.cat([ - torch.cat([feat_f0_unfold, feat_f1_unfold], 0), # [2n, ww, cf] - repeat(feat_c_win, 'n c -> n ww c', ww=W**2), # [2n, ww, cf] - ], -1)) + feat_c_win = self.down_proj( + torch.cat([ + feat_c0[data['b_ids'], data['i_ids']], + feat_c1[data['b_ids'], data['j_ids']] + ], 0)) # [2n, c] + feat_cf_win = self.merge_feat( + torch.cat( + [ + torch.cat([feat_f0_unfold, feat_f1_unfold], + 0), # [2n, ww, cf] + repeat(feat_c_win, 'n c -> n ww c', ww = W ** 2), # [2n, ww, cf] + ], -1)) # yapf: disable feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0) return feat_f0_unfold, feat_f1_unfold diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py index b73c5a6a..8e4f11d1 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py @@ -4,7 +4,7 @@ Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_trans """ import torch -from torch.nn import Module, Dropout +from torch.nn import Dropout, Module def elu_feature_map(x): @@ -12,6 +12,7 @@ def elu_feature_map(x): class LinearAttention(Module): + def __init__(self, eps=1e-6): super().__init__() self.feature_map = elu_feature_map @@ -40,14 +41,16 @@ class LinearAttention(Module): v_length = values.size(1) values = values / v_length # prevent fp16 overflow - KV = torch.einsum("nshd,nshv->nhdv", K, values) # (S,D)' @ S,V - Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + self.eps) - queried_values = torch.einsum("nlhd,nhdv,nlh->nlhv", Q, KV, Z) * v_length + KV = torch.einsum('nshd,nshv->nhdv', K, values) # (S,D)' @ S,V + Z = 1 / (torch.einsum('nlhd,nhd->nlh', Q, K.sum(dim=1)) + self.eps) + queried_values = torch.einsum('nlhd,nhdv,nlh->nlhv', Q, KV, + Z) * v_length return queried_values.contiguous() class FullAttention(Module): + def __init__(self, use_dropout=False, attention_dropout=0.1): super().__init__() self.use_dropout = use_dropout @@ -66,9 +69,11 @@ class FullAttention(Module): """ # Compute the unnormalized attention and apply the masks - QK = torch.einsum("nlhd,nshd->nlsh", queries, keys) + QK = torch.einsum('nlhd,nshd->nlsh', queries, keys) if kv_mask is not None: - QK.masked_fill_(~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), float('-inf')) + QK.masked_fill_( + ~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), + float('-inf')) # Compute the attention and the weighted average softmax_temp = 1. / queries.size(3)**.5 # sqrt(D) @@ -76,6 +81,6 @@ class FullAttention(Module): if self.use_dropout: A = self.dropout(A) - queried_values = torch.einsum("nlsh,nshd->nlhd", A, values) + queried_values = torch.einsum('nlsh,nshd->nlhd', A, values) return queried_values.contiguous() diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py index d79390ca..4c28f20d 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py @@ -1,14 +1,14 @@ import copy + import torch import torch.nn as nn -from .linear_attention import LinearAttention, FullAttention + +from .linear_attention import FullAttention, LinearAttention class LoFTREncoderLayer(nn.Module): - def __init__(self, - d_model, - nhead, - attention='linear'): + + def __init__(self, d_model, nhead, attention='linear'): super(LoFTREncoderLayer, self).__init__() self.dim = d_model // nhead @@ -18,14 +18,15 @@ class LoFTREncoderLayer(nn.Module): self.q_proj = nn.Linear(d_model, d_model, bias=False) self.k_proj = nn.Linear(d_model, d_model, bias=False) self.v_proj = nn.Linear(d_model, d_model, bias=False) - self.attention = LinearAttention() if attention == 'linear' else FullAttention() + self.attention = LinearAttention( + ) if attention == 'linear' else FullAttention() self.merge = nn.Linear(d_model, d_model, bias=False) # feed-forward network self.mlp = nn.Sequential( - nn.Linear(d_model*2, d_model*2, bias=False), + nn.Linear(d_model * 2, d_model * 2, bias=False), nn.ReLU(True), - nn.Linear(d_model*2, d_model, bias=False), + nn.Linear(d_model * 2, d_model, bias=False), ) # norm and dropout @@ -44,11 +45,16 @@ class LoFTREncoderLayer(nn.Module): query, key, value = x, source, source # multi-head attention - query = self.q_proj(query).view(bs, -1, self.nhead, self.dim) # [N, L, (H, D)] - key = self.k_proj(key).view(bs, -1, self.nhead, self.dim) # [N, S, (H, D)] + query = self.q_proj(query).view(bs, -1, self.nhead, + self.dim) # [N, L, (H, D)] + key = self.k_proj(key).view(bs, -1, self.nhead, + self.dim) # [N, S, (H, D)] value = self.v_proj(value).view(bs, -1, self.nhead, self.dim) - message = self.attention(query, key, value, q_mask=x_mask, kv_mask=source_mask) # [N, L, (H, D)] - message = self.merge(message.view(bs, -1, self.nhead*self.dim)) # [N, L, C] + message = self.attention( + query, key, value, q_mask=x_mask, + kv_mask=source_mask) # [N, L, (H, D)] + message = self.merge(message.view(bs, -1, + self.nhead * self.dim)) # [N, L, C] message = self.norm1(message) # feed-forward network @@ -68,8 +74,11 @@ class LocalFeatureTransformer(nn.Module): self.d_model = config['d_model'] self.nhead = config['nhead'] self.layer_names = config['layer_names'] - encoder_layer = LoFTREncoderLayer(config['d_model'], config['nhead'], config['attention']) - self.layers = nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names))]) + encoder_layer = LoFTREncoderLayer(config['d_model'], config['nhead'], + config['attention']) + self.layers = nn.ModuleList([ + copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names)) + ]) self._reset_parameters() def _reset_parameters(self): @@ -86,7 +95,8 @@ class LocalFeatureTransformer(nn.Module): mask1 (torch.Tensor): [N, S] (optional) """ - assert self.d_model == feat0.size(2), "the feature number of src and transformer must be equal" + assert self.d_model == feat0.size( + 2), 'the feature number of src and transformer must be equal' for layer, name in zip(self.layers, self.layer_names): if name == 'self': diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py index a9726333..c7835689 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py @@ -5,6 +5,7 @@ from einops.einops import rearrange INF = 1e9 + def mask_border(m, b: int, v): """ Mask borders with value Args: @@ -45,7 +46,7 @@ def mask_border_with_padding(m, bd, v, p_m0, p_m1): def compute_max_candidates(p_m0, p_m1): """Compute the max candidates of all pairs within a batch - + Args: p_m0, p_m1 (torch.Tensor): padded masks """ @@ -57,6 +58,7 @@ def compute_max_candidates(p_m0, p_m1): class CoarseMatching(nn.Module): + def __init__(self, config): super().__init__() self.config = config @@ -75,7 +77,7 @@ class CoarseMatching(nn.Module): try: from .superglue import log_optimal_transport except ImportError: - raise ImportError("download superglue.py first!") + raise ImportError('download superglue.py first!') self.log_optimal_transport = log_optimal_transport self.bin_score = nn.Parameter( torch.tensor(config['skh_init_bin_score'], requires_grad=True)) @@ -103,28 +105,27 @@ class CoarseMatching(nn.Module): 'mconf' (torch.Tensor): [M]} NOTE: M' != M during training. """ - N, L, S, C = feat_c0.size(0), feat_c0.size(1), feat_c1.size(1), feat_c0.size(2) + _, L, S, _ = feat_c0.size(0), feat_c0.size(1), feat_c1.size( + 1), feat_c0.size(2) # normalize feat_c0, feat_c1 = map(lambda feat: feat / feat.shape[-1]**.5, [feat_c0, feat_c1]) if self.match_type == 'dual_softmax': - sim_matrix = torch.einsum("nlc,nsc->nls", feat_c0, + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, feat_c1) / self.temperature if mask_c0 is not None: sim_matrix.masked_fill_( - ~(mask_c0[..., None] * mask_c1[:, None]).bool(), - -INF) + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) conf_matrix = F.softmax(sim_matrix, 1) * F.softmax(sim_matrix, 2) elif self.match_type == 'sinkhorn': # sinkhorn, dustbin included - sim_matrix = torch.einsum("nlc,nsc->nls", feat_c0, feat_c1) + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, feat_c1) if mask_c0 is not None: sim_matrix[:, :L, :S].masked_fill_( - ~(mask_c0[..., None] * mask_c1[:, None]).bool(), - -INF) + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) # build uniform prior & use sinkhorn log_assign_matrix = self.log_optimal_transport( @@ -207,10 +208,10 @@ class CoarseMatching(nn.Module): else: num_candidates_max = compute_max_candidates( data['mask0'], data['mask1']) - num_matches_train = int(num_candidates_max * - self.train_coarse_percent) + num_matches_train = int(num_candidates_max + * self.train_coarse_percent) num_matches_pred = len(b_ids) - assert self.train_pad_num_gt_min < num_matches_train, "min-num-gt-pad should be less than num-train-matches" + assert self.train_pad_num_gt_min < num_matches_train, 'min-num-gt-pad should be less than num-train-matches' # pred_indices is to select from prediction if num_matches_pred <= num_matches_train - self.train_pad_num_gt_min: @@ -223,11 +224,13 @@ class CoarseMatching(nn.Module): # gt_pad_indices is to select from gt padding. e.g. max(3787-4800, 200) gt_pad_indices = torch.randint( - len(data['spv_b_ids']), - (max(num_matches_train - num_matches_pred, - self.train_pad_num_gt_min), ), - device=_device) - mconf_gt = torch.zeros(len(data['spv_b_ids']), device=_device) # set conf of gt paddings to all zero + len(data['spv_b_ids']), + (max(num_matches_train - num_matches_pred, + self.train_pad_num_gt_min), ), + device=_device) + mconf_gt = torch.zeros( + len(data['spv_b_ids']), + device=_device) # set conf of gt paddings to all zero b_ids, i_ids, j_ids, mconf = map( lambda x, y: torch.cat([x[pred_indices], y[gt_pad_indices]], diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py index 689518d9..35903212 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py @@ -1,4 +1,5 @@ import math + import torch import torch.nn as nn @@ -7,8 +8,8 @@ def create_meshgrid( height: int, width: int, normalized_coordinates: bool = True, - device = None, - dtype = None, + device=None, + dtype=None, ): """Generate a coordinate grid for an image. @@ -48,7 +49,8 @@ def create_meshgrid( if normalized_coordinates: xs = (xs / (width - 1) - 0.5) * 2 ys = (ys / (height - 1) - 0.5) * 2 - base_grid = torch.stack(torch.meshgrid([xs, ys], indexing="ij"), dim=-1) # WxHx2 + base_grid = torch.stack( + torch.meshgrid([xs, ys], indexing='ij'), dim=-1) # WxHx2 return base_grid.permute(1, 0, 2).unsqueeze(0) # 1xHxWx2 @@ -120,7 +122,7 @@ class FineMatching(nn.Module): # corner case: if no coarse matches found if M == 0: - assert self.training == False, "M is always >0, when training, see coarse_matching.py" + assert self.training is False, 'M is always >0, when training, see coarse_matching.py' # logger.warning('No matches found in coarse-level.') data.update({ 'expec_f': torch.empty(0, 3, device=feat_f0.device), @@ -129,35 +131,41 @@ class FineMatching(nn.Module): }) return - feat_f0_picked = feat_f0_picked = feat_f0[:, WW//2, :] + feat_f0_picked = feat_f0_picked = feat_f0[:, WW // 2, :] sim_matrix = torch.einsum('mc,mrc->mr', feat_f0_picked, feat_f1) softmax_temp = 1. / C**.5 - heatmap = torch.softmax(softmax_temp * sim_matrix, dim=1).view(-1, W, W) + heatmap = torch.softmax( + softmax_temp * sim_matrix, dim=1).view(-1, W, W) # compute coordinates from heatmap - coords_normalized = spatial_expectation2d(heatmap[None], True)[0] # [M, 2] - grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape(1, -1, 2) # [1, WW, 2] + coords_normalized = spatial_expectation2d(heatmap[None], + True)[0] # [M, 2] + grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape( + 1, -1, 2) # [1, WW, 2] # compute std over - var = torch.sum(grid_normalized**2 * heatmap.view(-1, WW, 1), dim=1) - coords_normalized**2 # [M, 2] - std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), -1) # [M] clamp needed for numerical stability - + var = torch.sum( + grid_normalized**2 * heatmap.view(-1, WW, 1), + dim=1) - coords_normalized**2 # [M, 2] + std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), + -1) # [M] clamp needed for numerical stability + # for fine-level supervision - data.update({'expec_f': torch.cat([coords_normalized, std.unsqueeze(1)], -1)}) + data.update( + {'expec_f': + torch.cat([coords_normalized, std.unsqueeze(1)], -1)}) # compute absolute kpt coords self.get_fine_match(coords_normalized, data) @torch.no_grad() def get_fine_match(self, coords_normed, data): - W, WW, C, scale = self.W, self.WW, self.C, self.scale + W, _, _, scale = self.W, self.WW, self.C, self.scale # mkpts0_f and mkpts1_f mkpts0_f = data['mkpts0_c'] - scale1 = scale * data['scale1'][data['b_ids']] if 'scale0' in data else scale - mkpts1_f = data['mkpts1_c'] + (coords_normed * (W // 2) * scale1)[:len(data['mconf'])] + scale1 = scale * data['scale1'][ + data['b_ids']] if 'scale0' in data else scale + mkpts1_f = data['mkpts1_c'] + (coords_normed * (W // 2) * scale1)[:len(data['mconf'])] # yapf: disable - data.update({ - "mkpts0_f": mkpts0_f, - "mkpts1_f": mkpts1_f - }) + data.update({'mkpts0_f': mkpts0_f, 'mkpts1_f': mkpts1_f}) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py index f95cdb65..214a3a7a 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py @@ -6,7 +6,7 @@ def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1): """ Warp kpts0 from I0 to I1 with depth, K and Rt Also check covisibility and depth consistency. Depth is consistent if relative error < 0.2 (hard-coded). - + Args: kpts0 (torch.Tensor): [N, L, 2] - , depth0 (torch.Tensor): [N, H, W], @@ -21,34 +21,37 @@ def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1): kpts0_long = kpts0.round().long() # Sample depth, get calculable_mask on depth != 0 - kpts0_depth = torch.stack( - [depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]] for i in range(kpts0.shape[0])], dim=0 - ) # (N, L) + kpts0_depth = torch.stack([ + depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]] + for i in range(kpts0.shape[0]) + ], + dim=0) # noqa E501 nonzero_mask = kpts0_depth != 0 # Unproject - kpts0_h = torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1) * kpts0_depth[..., None] # (N, L, 3) + kpts0_h = torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], + dim=-1) * kpts0_depth[..., None] # (N, L, 3) kpts0_cam = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L) # Rigid Transform - w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]] # (N, 3, L) + w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, + [3]] # (N, 3, L) w_kpts0_depth_computed = w_kpts0_cam[:, 2, :] # Project w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3) - w_kpts0 = w_kpts0_h[:, :, :2] / (w_kpts0_h[:, :, [2]] + 1e-4) # (N, L, 2), +1e-4 to avoid zero depth + w_kpts0 = w_kpts0_h[:, :, :2] / (w_kpts0_h[:, :, [2]] + 1e-4 + ) # (N, L, 2), +1e-4 to avoid zero depth # Covisible Check h, w = depth1.shape[1:3] - covisible_mask = (w_kpts0[:, :, 0] > 0) * (w_kpts0[:, :, 0] < w-1) * \ - (w_kpts0[:, :, 1] > 0) * (w_kpts0[:, :, 1] < h-1) + covisible_mask = (w_kpts0[:, :, 0] > 0) * (w_kpts0[:, :, 0] < w - 1) * (w_kpts0[:, :, 1] > 0) * (w_kpts0[:, :, 1] < h - 1) # noqa E501 yapf: disable w_kpts0_long = w_kpts0.long() w_kpts0_long[~covisible_mask, :] = 0 - w_kpts0_depth = torch.stack( - [depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]] for i in range(w_kpts0_long.shape[0])], dim=0 - ) # (N, L) - consistent_mask = ((w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth).abs() < 0.2 + w_kpts0_depth = torch.stack([depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]] for i in range(w_kpts0_long.shape[0])], dim=0) # noqa E501 yapf: disable + consistent_mask = ( + (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth).abs() < 0.2 valid_mask = nonzero_mask * covisible_mask * consistent_mask return valid_mask, w_kpts0 diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py index 732d28c8..c5e7355d 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py @@ -1,4 +1,5 @@ import math + import torch from torch import nn @@ -23,16 +24,17 @@ class PositionEncodingSine(nn.Module): y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0) x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0) if temp_bug_fix: - div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / (d_model//2))) + div_term = torch.exp(torch.arange(0, d_model // 2, 2).float() * (-math.log(10000.0) / (d_model // 2))) # noqa E501 yapf: disable else: # a buggy implementation (for backward compatability only) - div_term = torch.exp(torch.arange(0, d_model//2, 2).float() * (-math.log(10000.0) / d_model//2)) + div_term = torch.exp(torch.arange(0, d_model // 2, 2).float() * (-math.log(10000.0) / d_model // 2)) # noqa E501 yapf: disable div_term = div_term[:, None, None] # [C//4, 1, 1] pe[0::4, :, :] = torch.sin(x_position * div_term) pe[1::4, :, :] = torch.cos(x_position * div_term) pe[2::4, :, :] = torch.sin(y_position * div_term) pe[3::4, :, :] = torch.cos(y_position * div_term) - self.register_buffer('pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W] + self.register_buffer( + 'pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W] def forward(self, x): """ diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py index 4749e24a..02d25d05 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py @@ -1,13 +1,13 @@ from math import log -from loguru import logger import torch from einops import repeat from kornia.utils import create_meshgrid +from loguru import logger from .geometry import warp_kpts -############## ↓ Coarse-Level supervision ↓ ############## +# ↓ Coarse-Level supervision ↓ ############## @torch.no_grad() @@ -30,7 +30,7 @@ def spvs_coarse(data, config): 'spv_w_pt0_i': [N, hw0, 2], in original image resolution 'spv_pt1_i': [N, hw1, 2], in original image resolution } - + NOTE: - for scannet dataset, there're 3 kinds of resolution {i, c, f} - for megadepth dataset, there're 4 kinds of resolution {i, i_resize, c, f} @@ -46,9 +46,14 @@ def spvs_coarse(data, config): # 2. warp grids # create kpts in meshgrid and resize them to image resolution - grid_pt0_c = create_meshgrid(h0, w0, False, device).reshape(1, h0*w0, 2).repeat(N, 1, 1) # [N, hw, 2] + grid_pt0_c = create_meshgrid(h0, w0, False, + device).reshape(1, h0 * w0, + 2).repeat(N, 1, + 1) # [N, hw, 2] grid_pt0_i = scale0 * grid_pt0_c - grid_pt1_c = create_meshgrid(h1, w1, False, device).reshape(1, h1*w1, 2).repeat(N, 1, 1) + grid_pt1_c = create_meshgrid(h1, w1, False, + device).reshape(1, h1 * w1, + 2).repeat(N, 1, 1) grid_pt1_i = scale1 * grid_pt1_c # mask padded region to (0, 0), so no need to manually mask conf_matrix_gt @@ -59,8 +64,10 @@ def spvs_coarse(data, config): # warp kpts bi-directionally and resize them to coarse-level resolution # (no depth consistency check, since it leads to worse results experimentally) # (unhandled edge case: points with 0-depth will be warped to the left-up corner) - _, w_pt0_i = warp_kpts(grid_pt0_i, data['depth0'], data['depth1'], data['T_0to1'], data['K0'], data['K1']) - _, w_pt1_i = warp_kpts(grid_pt1_i, data['depth1'], data['depth0'], data['T_1to0'], data['K1'], data['K0']) + _, w_pt0_i = warp_kpts(grid_pt0_i, data['depth0'], data['depth1'], + data['T_0to1'], data['K0'], data['K1']) + _, w_pt1_i = warp_kpts(grid_pt1_i, data['depth1'], data['depth0'], + data['T_1to0'], data['K1'], data['K0']) w_pt0_c = w_pt0_i / scale1 w_pt1_c = w_pt1_i / scale0 @@ -72,16 +79,21 @@ def spvs_coarse(data, config): # corner case: out of boundary def out_bound_mask(pt, w, h): - return (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + (pt[..., 1] >= h) + return (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + ( + pt[..., 1] >= h) + nearest_index1[out_bound_mask(w_pt0_c_round, w1, h1)] = 0 nearest_index0[out_bound_mask(w_pt1_c_round, w0, h0)] = 0 - loop_back = torch.stack([nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], dim=0) - correct_0to1 = loop_back == torch.arange(h0*w0, device=device)[None].repeat(N, 1) + loop_back = torch.stack( + [nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], + dim=0) + correct_0to1 = loop_back == torch.arange( + h0 * w0, device=device)[None].repeat(N, 1) correct_0to1[:, 0] = False # ignore the top-left corner # 4. construct a gt conf_matrix - conf_matrix_gt = torch.zeros(N, h0*w0, h1*w1, device=device) + conf_matrix_gt = torch.zeros(N, h0 * w0, h1 * w1, device=device) b_ids, i_ids = torch.where(correct_0to1 != 0) j_ids = nearest_index1[b_ids, i_ids] @@ -90,27 +102,22 @@ def spvs_coarse(data, config): # 5. save coarse matches(gt) for training fine level if len(b_ids) == 0: - logger.warning(f"No groundtruth coarse match found for: {data['pair_names']}") + logger.warning( + f"No groundtruth coarse match found for: {data['pair_names']}") # this won't affect fine-level loss calculation b_ids = torch.tensor([0], device=device) i_ids = torch.tensor([0], device=device) j_ids = torch.tensor([0], device=device) - data.update({ - 'spv_b_ids': b_ids, - 'spv_i_ids': i_ids, - 'spv_j_ids': j_ids - }) + data.update({'spv_b_ids': b_ids, 'spv_i_ids': i_ids, 'spv_j_ids': j_ids}) # 6. save intermediate results (for fast fine-level computation) - data.update({ - 'spv_w_pt0_i': w_pt0_i, - 'spv_pt1_i': grid_pt1_i - }) + data.update({'spv_w_pt0_i': w_pt0_i, 'spv_pt1_i': grid_pt1_i}) def compute_supervision_coarse(data, config): - assert len(set(data['dataset_name'])) == 1, "Do not support mixed datasets training!" + assert len(set( + data['dataset_name'])) == 1, 'Do not support mixed datasets training!' data_source = data['dataset_name'][0] if data_source.lower() in ['scannet', 'megadepth']: spvs_coarse(data, config) @@ -118,7 +125,8 @@ def compute_supervision_coarse(data, config): raise ValueError(f'Unknown data source: {data_source}') -############## ↓ Fine-Level supervision ↓ ############## +# ↓ Fine-Level supervision ↓ ############## + @torch.no_grad() def spvs_fine(data, config): @@ -139,8 +147,9 @@ def spvs_fine(data, config): # 3. compute gt scale = scale * data['scale1'][b_ids] if 'scale0' in data else scale # `expec_f_gt` might exceed the window, i.e. abs(*) > 1, which would be filtered later - expec_f_gt = (w_pt0_i[b_ids, i_ids] - pt1_i[b_ids, j_ids]) / scale / radius # [M, 2] - data.update({"expec_f_gt": expec_f_gt}) + expec_f_gt = (w_pt0_i[b_ids, i_ids] + - pt1_i[b_ids, j_ids]) / scale / radius # [M, 2] + data.update({'expec_f_gt': expec_f_gt}) def compute_supervision_fine(data, config): diff --git a/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py b/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py index 3d4c5ca5..206f9037 100644 --- a/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py +++ b/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py @@ -1,7 +1,8 @@ import bisect -import numpy as np -import matplotlib.pyplot as plt + import matplotlib +import matplotlib.pyplot as plt +import numpy as np def _compute_conf_thresh(data): @@ -17,21 +18,30 @@ def _compute_conf_thresh(data): # --- VISUALIZATION --- # -def make_matching_figure( - img0, img1, mkpts0, mkpts1, color, - kpts0=None, kpts1=None, text=[], dpi=75, path=None): + +def make_matching_figure(img0, + img1, + mkpts0, + mkpts1, + color, + kpts0=None, + kpts1=None, + text=[], + dpi=75, + path=None): # draw image pair - assert mkpts0.shape[0] == mkpts1.shape[0], f'mkpts0: {mkpts0.shape[0]} v.s. mkpts1: {mkpts1.shape[0]}' + assert mkpts0.shape[0] == mkpts1.shape[ + 0], f'mkpts0: {mkpts0.shape[0]} v.s. mkpts1: {mkpts1.shape[0]}' fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi) axes[0].imshow(img0, cmap='gray') axes[1].imshow(img1, cmap='gray') - for i in range(2): # clear all frames + for i in range(2): # clear all frames axes[i].get_yaxis().set_ticks([]) axes[i].get_xaxis().set_ticks([]) for spine in axes[i].spines.values(): spine.set_visible(False) plt.tight_layout(pad=1) - + if kpts0 is not None: assert kpts1 is not None axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c='w', s=2) @@ -43,19 +53,28 @@ def make_matching_figure( transFigure = fig.transFigure.inverted() fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0)) fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1)) - fig.lines = [matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]), - (fkpts0[i, 1], fkpts1[i, 1]), - transform=fig.transFigure, c=color[i], linewidth=1) - for i in range(len(mkpts0))] - + fig.lines = [ + matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]), + (fkpts0[i, 1], fkpts1[i, 1]), + transform=fig.transFigure, + c=color[i], + linewidth=1) for i in range(len(mkpts0)) + ] + axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4) axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4) # put txts txt_color = 'k' if img0[:100, :200].mean() > 200 else 'w' fig.text( - 0.01, 0.99, '\n'.join(text), transform=fig.axes[0].transAxes, - fontsize=15, va='top', ha='left', color=txt_color) + 0.01, + 0.99, + '\n'.join(text), + transform=fig.axes[0].transAxes, + fontsize=15, + va='top', + ha='left', + color=txt_color) # save or return figure if path: @@ -68,12 +87,14 @@ def make_matching_figure( def _make_evaluation_figure(data, b_id, alpha='dynamic'): b_mask = data['m_bids'] == b_id conf_thr = _compute_conf_thresh(data) - - img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype(np.int32) - img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype(np.int32) + + img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype( + np.int32) + img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype( + np.int32) kpts0 = data['mkpts0_f'][b_mask].cpu().numpy() kpts1 = data['mkpts1_f'][b_mask].cpu().numpy() - + # for megadepth, we visualize matches on the resized image if 'scale0' in data: kpts0 = kpts0 / data['scale0'][b_id].cpu().numpy()[[1, 0]] @@ -92,18 +113,18 @@ def _make_evaluation_figure(data, b_id, alpha='dynamic'): if alpha == 'dynamic': alpha = dynamic_alpha(len(correct_mask)) color = error_colormap(epi_errs, conf_thr, alpha=alpha) - + text = [ f'#Matches {len(kpts0)}', f'Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}', f'Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}' ] - + # make the figure - figure = make_matching_figure(img0, img1, kpts0, kpts1, - color, text=text) + figure = make_matching_figure(img0, img1, kpts0, kpts1, color, text=text) return figure + def _make_confidence_figure(data, b_id): # TODO: Implement confidence figure raise NotImplementedError() @@ -111,7 +132,7 @@ def _make_confidence_figure(data, b_id): def make_matching_figures(data, config, mode='evaluation'): """ Make matching figures for a batch. - + Args: data (Dict): a batch updated by PL_LoFTR. config (Dict): matcher config @@ -123,8 +144,7 @@ def make_matching_figures(data, config, mode='evaluation'): for b_id in range(data['image0'].size(0)): if mode == 'evaluation': fig = _make_evaluation_figure( - data, b_id, - alpha=config.TRAINER.PLOT_MATCHES_ALPHA) + data, b_id, alpha=config.TRAINER.PLOT_MATCHES_ALPHA) elif mode == 'confidence': fig = _make_confidence_figure(data, b_id) else: @@ -144,11 +164,14 @@ def dynamic_alpha(n_matches, if _range[1] is None: return _range[0] return _range[1] + (milestones[loc + 1] - n_matches) / ( - milestones[loc + 1] - milestones[loc]) * (_range[0] - _range[1]) + milestones[loc + 1] - milestones[loc]) * ( + _range[0] - _range[1]) def error_colormap(err, thr, alpha=1.0): - assert alpha <= 1.0 and alpha > 0, f"Invaid alpha value: {alpha}" + assert alpha <= 1.0 and alpha > 0, f'Invaid alpha value: {alpha}' x = 1 - np.clip(err / (thr * 2), 0, 1) return np.clip( - np.stack([2-x*2, x*2, np.zeros_like(x), np.ones_like(x)*alpha], -1), 0, 1) + np.stack([2 - x * 2, x * 2, + np.zeros_like(x), + np.ones_like(x) * alpha], -1), 0, 1) diff --git a/modelscope/models/cv/image_matching_fast/config/__init__.py b/modelscope/models/cv/image_matching_fast/config/__init__.py index add40b36..84c52f69 100644 --- a/modelscope/models/cv/image_matching_fast/config/__init__.py +++ b/modelscope/models/cv/image_matching_fast/config/__init__.py @@ -1 +1 @@ -from .default import lightglue_default_conf \ No newline at end of file +from .default import lightglue_default_conf diff --git a/modelscope/models/cv/image_matching_fast/config/default.py b/modelscope/models/cv/image_matching_fast/config/default.py index 06c8203c..0100b96c 100644 --- a/modelscope/models/cv/image_matching_fast/config/default.py +++ b/modelscope/models/cv/image_matching_fast/config/default.py @@ -1,15 +1,15 @@ lightglue_default_conf = { - "features":"superpoint", # superpoint disk aliked sift - "name": "lightglue", # just for interfacing - "input_dim": 256, # input descriptor dimension (autoselected from weights) - "descriptor_dim": 256, - "add_scale_ori": False, - "n_layers": 9, - "num_heads": 4, - "flash": True, # enable FlashAttention if available. - "mp": False, # enable mixed precision - "depth_confidence": 0.95, # early stopping, disable with -1 - "width_confidence": 0.99, # point pruning, disable with -1 - "filter_threshold": 0.1, # match threshold - "weights": None, + 'features': 'superpoint', # superpoint disk aliked sift + 'name': 'lightglue', # just for interfacing + 'input_dim': 256, # input descriptor dimension (autoselected from weights) + 'descriptor_dim': 256, + 'add_scale_ori': False, + 'n_layers': 9, + 'num_heads': 4, + 'flash': True, # enable FlashAttention if available. + 'mp': False, # enable mixed precision + 'depth_confidence': 0.95, # early stopping, disable with -1 + 'width_confidence': 0.99, # point pruning, disable with -1 + 'filter_threshold': 0.1, # match threshold + 'weights': None, } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/aliked.py b/modelscope/models/cv/image_matching_fast/lightglue/aliked.py index 1161e1fc..71ff4f95 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue/aliked.py +++ b/modelscope/models/cv/image_matching_fast/lightglue/aliked.py @@ -45,16 +45,15 @@ from torchvision.models import resnet from .utils import Extractor -def get_patches( - tensor: torch.Tensor, required_corners: torch.Tensor, ps: int -) -> torch.Tensor: +def get_patches(tensor: torch.Tensor, required_corners: torch.Tensor, + ps: int) -> torch.Tensor: c, h, w = tensor.shape corner = (required_corners - ps / 2 + 1).long() corner[:, 0] = corner[:, 0].clamp(min=0, max=w - 1 - ps) corner[:, 1] = corner[:, 1].clamp(min=0, max=h - 1 - ps) offset = torch.arange(0, ps) - kw = {"indexing": "ij"} if torch.__version__ >= "1.10" else {} + kw = {'indexing': 'ij'} if torch.__version__ >= '1.10' else {} x, y = torch.meshgrid(offset, offset, **kw) patches = torch.stack((x, y)).permute(2, 1, 0).unsqueeze(2) patches = patches.to(corner) + corner[None, None] @@ -70,8 +69,7 @@ def simple_nms(scores: torch.Tensor, nms_radius: int): zeros = torch.zeros_like(scores) max_mask = scores == torch.nn.functional.max_pool2d( - scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius - ) + scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius) for _ in range(2): supp_mask = ( @@ -80,18 +78,19 @@ def simple_nms(scores: torch.Tensor, nms_radius: int): kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius, - ) - > 0 - ) + ) > 0) supp_scores = torch.where(supp_mask, zeros, scores) new_max_mask = supp_scores == torch.nn.functional.max_pool2d( - supp_scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius - ) + supp_scores, + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius) max_mask = max_mask | (new_max_mask & (~supp_mask)) return torch.where(max_mask, scores, zeros) class DKD(nn.Module): + def __init__( self, radius: int = 2, @@ -115,14 +114,15 @@ class DKD(nn.Module): self.n_limit = n_limit self.kernel_size = 2 * self.radius + 1 self.temperature = 0.1 # tuned temperature - self.unfold = nn.Unfold(kernel_size=self.kernel_size, padding=self.radius) + self.unfold = nn.Unfold( + kernel_size=self.kernel_size, padding=self.radius) # local xy grid x = torch.linspace(-self.radius, self.radius, self.kernel_size) # (kernel_size*kernel_size) x 2 : (w,h) - kw = {"indexing": "ij"} if torch.__version__ >= "1.10" else {} + kw = {'indexing': 'ij'} if torch.__version__ >= '1.10' else {} self.hw_grid = ( - torch.stack(torch.meshgrid([x, x], **kw)).view(2, -1).t()[:, [1, 0]] - ) + torch.stack(torch.meshgrid([x, x], **kw)).view(2, -1).t()[:, + [1, 0]]) def forward( self, @@ -141,29 +141,32 @@ class DKD(nn.Module): nms_scores = simple_nms(scores_nograd, self.radius) # remove border - nms_scores[:, :, : self.radius, :] = 0 - nms_scores[:, :, :, : self.radius] = 0 + nms_scores[:, :, :self.radius, :] = 0 + nms_scores[:, :, :, :self.radius] = 0 if image_size is not None: for i in range(scores_map.shape[0]): w, h = image_size[i].long() - nms_scores[i, :, h.item() - self.radius :, :] = 0 - nms_scores[i, :, :, w.item() - self.radius :] = 0 + nms_scores[i, :, h.item() - self.radius:, :] = 0 + nms_scores[i, :, :, w.item() - self.radius:] = 0 else: - nms_scores[:, :, -self.radius :, :] = 0 - nms_scores[:, :, :, -self.radius :] = 0 + nms_scores[:, :, -self.radius:, :] = 0 + nms_scores[:, :, :, -self.radius:] = 0 # detect keypoints without grad if self.top_k > 0: topk = torch.topk(nms_scores.view(b, -1), self.top_k) - indices_keypoints = [topk.indices[i] for i in range(b)] # B x top_k + indices_keypoints = [topk.indices[i] + for i in range(b)] # B x top_k else: if self.scores_th > 0: masks = nms_scores > self.scores_th if masks.sum() == 0: - th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th + th = scores_nograd.reshape(b, -1).mean( + dim=1) # th = self.scores_th masks = nms_scores > th.reshape(b, 1, 1, 1) else: - th = scores_nograd.reshape(b, -1).mean(dim=1) # th = self.scores_th + th = scores_nograd.reshape(b, -1).mean( + dim=1) # th = self.scores_th masks = nms_scores > th.reshape(b, 1, 1, 1) masks = masks.reshape(b, -1) @@ -174,7 +177,7 @@ class DKD(nn.Module): if len(indices) > self.n_limit: kpts_sc = scores[indices] sort_idx = kpts_sc.sort(descending=True)[1] - sel_idx = sort_idx[: self.n_limit] + sel_idx = sort_idx[:self.n_limit] indices = indices[sel_idx] indices_keypoints.append(indices) @@ -190,34 +193,34 @@ class DKD(nn.Module): for b_idx in range(b): patch = patches[b_idx].t() # (H*W) x (kernel**2) indices_kpt = indices_keypoints[ - b_idx - ] # one dimension vector, say its size is M + b_idx] # one dimension vector, say its size is M patch_scores = patch[indices_kpt] # M x (kernel**2) keypoints_xy_nms = torch.stack( - [indices_kpt % w, torch.div(indices_kpt, w, rounding_mode="trunc")], + [ + indices_kpt % w, + torch.div(indices_kpt, w, rounding_mode='trunc') + ], dim=1, ) # Mx2 # max is detached to prevent undesired backprop loops in the graph max_v = patch_scores.max(dim=1).values.detach()[:, None] x_exp = ( - (patch_scores - max_v) / self.temperature - ).exp() # M * (kernel**2), in [0, 1] + (patch_scores - max_v) + / self.temperature).exp() # M * (kernel**2), in [0, 1] # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} } - xy_residual = ( - x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None] - ) # Soft-argmax, Mx2 + xy_residual = (x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None] + ) # Soft-argmax, Mx2 hw_grid_dist2 = ( torch.norm( (self.hw_grid[None, :, :] - xy_residual[:, None, :]) / self.radius, dim=-1, - ) - ** 2 - ) - scoredispersity = (x_exp * hw_grid_dist2).sum(dim=1) / x_exp.sum(dim=1) + )**2) + scoredispersity = (x_exp * hw_grid_dist2).sum( + dim=1) / x_exp.sum(dim=1) # compute result keypoints keypoints_xy = keypoints_xy_nms + xy_residual @@ -226,11 +229,9 @@ class DKD(nn.Module): kptscore = torch.nn.functional.grid_sample( scores_map[b_idx].unsqueeze(0), keypoints_xy.view(1, 1, -1, 2), - mode="bilinear", + mode='bilinear', align_corners=True, - )[ - 0, 0, 0, : - ] # CxN + )[0, 0, 0, :] # CxN keypoints.append(keypoints_xy) scoredispersitys.append(scoredispersity) @@ -238,24 +239,25 @@ class DKD(nn.Module): else: for b_idx in range(b): indices_kpt = indices_keypoints[ - b_idx - ] # one dimension vector, say its size is M + b_idx] # one dimension vector, say its size is M # To avoid warning: UserWarning: __floordiv__ is deprecated keypoints_xy_nms = torch.stack( - [indices_kpt % w, torch.div(indices_kpt, w, rounding_mode="trunc")], + [ + indices_kpt % w, + torch.div(indices_kpt, w, rounding_mode='trunc') + ], dim=1, ) # Mx2 keypoints_xy = keypoints_xy_nms / wh * 2 - 1 # (w,h) -> (-1~1,-1~1) kptscore = torch.nn.functional.grid_sample( scores_map[b_idx].unsqueeze(0), keypoints_xy.view(1, 1, -1, 2), - mode="bilinear", + mode='bilinear', align_corners=True, - )[ - 0, 0, 0, : - ] # CxN + )[0, 0, 0, :] # CxN keypoints.append(keypoints_xy) - scoredispersitys.append(kptscore) # for jit.script compatability + scoredispersitys.append( + kptscore) # for jit.script compatability kptscores.append(kptscore) return keypoints, scoredispersitys, kptscores @@ -278,17 +280,18 @@ class InputPadder(object): def pad(self, x: torch.Tensor): assert x.ndim == 4 - return F.pad(x, self._pad, mode="replicate") + return F.pad(x, self._pad, mode='replicate') def unpad(self, x: torch.Tensor): assert x.ndim == 4 ht = x.shape[-2] wd = x.shape[-1] c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] - return x[..., c[0] : c[1], c[2] : c[3]] + return x[..., c[0]:c[1], c[2]:c[3]] class DeformableConv2d(nn.Module): + def __init__( self, in_channels, @@ -304,9 +307,8 @@ class DeformableConv2d(nn.Module): self.padding = padding self.mask = mask - self.channel_num = ( - 3 * kernel_size * kernel_size if mask else 2 * kernel_size * kernel_size - ) + self.channel_num = (3 * kernel_size * kernel_size if mask else 2 + * kernel_size * kernel_size) self.offset_conv = nn.Conv2d( in_channels, self.channel_num, @@ -356,10 +358,10 @@ def get_conv( stride=1, padding=1, bias=False, - conv_type="conv", + conv_type='conv', mask=False, ): - if conv_type == "conv": + if conv_type == 'conv': conv = nn.Conv2d( inplanes, planes, @@ -368,7 +370,7 @@ def get_conv( padding=padding, bias=bias, ) - elif conv_type == "dcn": + elif conv_type == 'dcn': conv = DeformableConv2d( inplanes, planes, @@ -384,13 +386,14 @@ def get_conv( class ConvBlock(nn.Module): + def __init__( self, in_channels, out_channels, gate: Optional[Callable[..., nn.Module]] = None, norm_layer: Optional[Callable[..., nn.Module]] = None, - conv_type: str = "conv", + conv_type: str = 'conv', mask: bool = False, ): super().__init__() @@ -401,12 +404,18 @@ class ConvBlock(nn.Module): if norm_layer is None: norm_layer = nn.BatchNorm2d self.conv1 = get_conv( - in_channels, out_channels, kernel_size=3, conv_type=conv_type, mask=mask - ) + in_channels, + out_channels, + kernel_size=3, + conv_type=conv_type, + mask=mask) self.bn1 = norm_layer(out_channels) self.conv2 = get_conv( - out_channels, out_channels, kernel_size=3, conv_type=conv_type, mask=mask - ) + out_channels, + out_channels, + kernel_size=3, + conv_type=conv_type, + mask=mask) self.bn2 = norm_layer(out_channels) def forward(self, x): @@ -430,7 +439,7 @@ class ResBlock(nn.Module): dilation: int = 1, gate: Optional[Callable[..., nn.Module]] = None, norm_layer: Optional[Callable[..., nn.Module]] = None, - conv_type: str = "conv", + conv_type: str = 'conv', mask: bool = False, ) -> None: super(ResBlock, self).__init__() @@ -441,18 +450,17 @@ class ResBlock(nn.Module): if norm_layer is None: norm_layer = nn.BatchNorm2d if groups != 1 or base_width != 64: - raise ValueError("ResBlock only supports groups=1 and base_width=64") + raise ValueError( + 'ResBlock only supports groups=1 and base_width=64') if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in ResBlock") + raise NotImplementedError('Dilation > 1 not supported in ResBlock') # Both self.conv1 and self.downsample layers # downsample the input when stride != 1 self.conv1 = get_conv( - inplanes, planes, kernel_size=3, conv_type=conv_type, mask=mask - ) + inplanes, planes, kernel_size=3, conv_type=conv_type, mask=mask) self.bn1 = norm_layer(planes) self.conv2 = get_conv( - planes, planes, kernel_size=3, conv_type=conv_type, mask=mask - ) + planes, planes, kernel_size=3, conv_type=conv_type, mask=mask) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride @@ -477,14 +485,15 @@ class ResBlock(nn.Module): class SDDH(nn.Module): + def __init__( - self, - dims: int, - kernel_size: int = 3, - n_pos: int = 8, - gate=nn.ReLU(), - conv2D=False, - mask=False, + self, + dims: int, + kernel_size: int = 3, + n_pos: int = 8, + gate=nn.ReLU(), + conv2D=False, + mask=False, ): super(SDDH, self).__init__() self.kernel_size = kernel_size @@ -518,18 +527,21 @@ class SDDH(nn.Module): # sampled feature conv self.sf_conv = nn.Conv2d( - dims, dims, kernel_size=1, stride=1, padding=0, bias=False - ) + dims, dims, kernel_size=1, stride=1, padding=0, bias=False) # convM if not conv2D: # deformable desc weights agg_weights = torch.nn.Parameter(torch.rand(n_pos, dims, dims)) - self.register_parameter("agg_weights", agg_weights) + self.register_parameter('agg_weights', agg_weights) else: self.convM = nn.Conv2d( - dims * n_pos, dims, kernel_size=1, stride=1, padding=0, bias=False - ) + dims * n_pos, + dims, + kernel_size=1, + stride=1, + padding=0, + bias=False) def forward(self, x, keypoints): # x: [B,C,H,W] @@ -548,29 +560,28 @@ class SDDH(nn.Module): if self.kernel_size > 1: patch = self.get_patches_func( - xi, kptsi_wh.long(), self.kernel_size - ) # [N_kpts, C, K, K] + xi, kptsi_wh.long(), self.kernel_size) # [N_kpts, C, K, K] else: kptsi_wh_long = kptsi_wh.long() patch = ( - xi[:, kptsi_wh_long[:, 1], kptsi_wh_long[:, 0]] - .permute(1, 0) - .reshape(N_kpts, c, 1, 1) - ) + xi[:, kptsi_wh_long[:, 1], + kptsi_wh_long[:, + 0]].permute(1, + 0).reshape(N_kpts, c, 1, 1)) offset = self.offset_conv(patch).clamp( - -max_offset, max_offset - ) # [N_kpts, 2*n_pos, 1, 1] + -max_offset, max_offset) # [N_kpts, 2*n_pos, 1, 1] if self.mask: - offset = ( - offset[:, :, 0, 0].view(N_kpts, 3, self.n_pos).permute(0, 2, 1) - ) # [N_kpts, n_pos, 3] + offset = (offset[:, :, 0, 0].view(N_kpts, 3, + self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 3] offset = offset[:, :, :-1] # [N_kpts, n_pos, 2] - mask_weight = torch.sigmoid(offset[:, :, -1]) # [N_kpts, n_pos] + mask_weight = torch.sigmoid(offset[:, :, + -1]) # [N_kpts, n_pos] else: - offset = ( - offset[:, :, 0, 0].view(N_kpts, 2, self.n_pos).permute(0, 2, 1) - ) # [N_kpts, n_pos, 2] + offset = (offset[:, :, 0, 0].view(N_kpts, 2, + self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 2] offsets.append(offset) # for visualization # get sample positions @@ -580,26 +591,23 @@ class SDDH(nn.Module): # sample features features = F.grid_sample( - xi.unsqueeze(0), pos, mode="bilinear", align_corners=True - ) # [1,C,(N_kpts*n_pos),1] - features = features.reshape(c, N_kpts, self.n_pos, 1).permute( - 1, 0, 2, 3 - ) # [N_kpts, C, n_pos, 1] + xi.unsqueeze(0), pos, mode='bilinear', + align_corners=True) # [1,C,(N_kpts*n_pos),1] + features = features.reshape(c, N_kpts, self.n_pos, + 1).permute(1, 0, 2, + 3) # [N_kpts, C, n_pos, 1] if self.mask: - features = torch.einsum("ncpo,np->ncpo", features, mask_weight) + features = torch.einsum('ncpo,np->ncpo', features, mask_weight) features = torch.selu_(self.sf_conv(features)).squeeze( - -1 - ) # [N_kpts, C, n_pos] + -1) # [N_kpts, C, n_pos] # convM if not self.conv2D: - descs = torch.einsum( - "ncp,pcd->nd", features, self.agg_weights - ) # [N_kpts, C] + descs = torch.einsum('ncp,pcd->nd', features, + self.agg_weights) # [N_kpts, C] else: - features = features.reshape(N_kpts, -1)[ - :, :, None, None - ] # [N_kpts, C*n_pos, 1, 1] + features = features.reshape( + N_kpts, -1)[:, :, None, None] # [N_kpts, C*n_pos, 1, 1] descs = self.convM(features).squeeze() # [N_kpts, C] # normalize @@ -611,34 +619,34 @@ class SDDH(nn.Module): class ALIKED(Extractor): default_conf = { - "model_name": "aliked-n16", - "max_num_keypoints": -1, - "detection_threshold": 0.2, - "nms_radius": 2, + 'model_name': 'aliked-n16', + 'max_num_keypoints': -1, + 'detection_threshold': 0.2, + 'nms_radius': 2, } - checkpoint_url = "https://github.com/Shiaoming/ALIKED/raw/main/models/{}.pth" + checkpoint_url = 'https://github.com/Shiaoming/ALIKED/raw/main/models/{}.pth' n_limit_max = 20000 # c1, c2, c3, c4, dim, K, M cfgs = { - "aliked-t16": [8, 16, 32, 64, 64, 3, 16], - "aliked-n16": [16, 32, 64, 128, 128, 3, 16], - "aliked-n16rot": [16, 32, 64, 128, 128, 3, 16], - "aliked-n32": [16, 32, 64, 128, 128, 3, 32], + 'aliked-t16': [8, 16, 32, 64, 64, 3, 16], + 'aliked-n16': [16, 32, 64, 128, 128, 3, 16], + 'aliked-n16rot': [16, 32, 64, 128, 128, 3, 16], + 'aliked-n32': [16, 32, 64, 128, 128, 3, 32], } preprocess_conf = { - "resize": 1024, + 'resize': 1024, } - required_data_keys = ["image"] + required_data_keys = ['image'] def __init__(self, **conf): super().__init__(**conf) # Update with default configuration. conf = self.conf c1, c2, c3, c4, dim, K, M = self.cfgs[conf.model_name] - conv_types = ["conv", "conv", "dcn", "dcn"] + conv_types = ['conv', 'conv', 'dcn', 'dcn'] conv2D = False mask = False @@ -647,7 +655,8 @@ class ALIKED(Extractor): self.pool4 = nn.AvgPool2d(kernel_size=4, stride=4) self.norm = nn.BatchNorm2d self.gate = nn.SELU(inplace=True) - self.block1 = ConvBlock(3, c1, self.gate, self.norm, conv_type=conv_types[0]) + self.block1 = ConvBlock( + 3, c1, self.gate, self.norm, conv_type=conv_types[0]) self.block2 = self.get_resblock(c1, c2, conv_types[1], mask) self.block3 = self.get_resblock(c2, c3, conv_types[2], mask) self.block4 = self.get_resblock(c3, c4, conv_types[3], mask) @@ -657,17 +666,13 @@ class ALIKED(Extractor): self.conv3 = resnet.conv1x1(c3, dim // 4) self.conv4 = resnet.conv1x1(dim, dim // 4) self.upsample2 = nn.Upsample( - scale_factor=2, mode="bilinear", align_corners=True - ) + scale_factor=2, mode='bilinear', align_corners=True) self.upsample4 = nn.Upsample( - scale_factor=4, mode="bilinear", align_corners=True - ) + scale_factor=4, mode='bilinear', align_corners=True) self.upsample8 = nn.Upsample( - scale_factor=8, mode="bilinear", align_corners=True - ) + scale_factor=8, mode='bilinear', align_corners=True) self.upsample32 = nn.Upsample( - scale_factor=32, mode="bilinear", align_corners=True - ) + scale_factor=32, mode='bilinear', align_corners=True) self.score_head = nn.Sequential( resnet.conv1x1(dim, 8), self.gate, @@ -677,19 +682,19 @@ class ALIKED(Extractor): self.gate, resnet.conv3x3(4, 1), ) - self.desc_head = SDDH(dim, K, M, gate=self.gate, conv2D=conv2D, mask=mask) + self.desc_head = SDDH( + dim, K, M, gate=self.gate, conv2D=conv2D, mask=mask) self.dkd = DKD( radius=conf.nms_radius, - top_k=-1 if conf.detection_threshold > 0 else conf.max_num_keypoints, + top_k=-1 + if conf.detection_threshold > 0 else conf.max_num_keypoints, scores_th=conf.detection_threshold, n_limit=conf.max_num_keypoints - if conf.max_num_keypoints > 0 - else self.n_limit_max, + if conf.max_num_keypoints > 0 else self.n_limit_max, ) state_dict = torch.hub.load_state_dict_from_url( - self.checkpoint_url.format(conf.model_name), map_location="cpu" - ) + self.checkpoint_url.format(conf.model_name), map_location='cpu') self.load_state_dict(state_dict, strict=True) def get_resblock(self, c_in, c_out, conv_type, mask): @@ -738,13 +743,12 @@ class ALIKED(Extractor): return feature_map, score_map def forward(self, data: dict) -> dict: - image = data["image"] + image = data['image'] if image.shape[1] == 1: image = grayscale_to_rgb(image) feature_map, score_map = self.extract_dense_map(image) keypoints, kptscores, scoredispersitys = self.dkd( - score_map, image_size=data.get("image_size") - ) + score_map, image_size=data.get('image_size')) descriptors, offsets = self.desc_head(feature_map, keypoints) _, _, h, w = image.shape @@ -752,7 +756,7 @@ class ALIKED(Extractor): # no padding required # we can set detection_threshold=-1 and conf.max_num_keypoints > 0 return { - "keypoints": wh * (torch.stack(keypoints) + 1) / 2.0, # B x N x 2 - "descriptors": torch.stack(descriptors), # B x N x D - "keypoint_scores": torch.stack(kptscores), # B x N + 'keypoints': wh * (torch.stack(keypoints) + 1) / 2.0, # B x N x 2 + 'descriptors': torch.stack(descriptors), # B x N x D + 'keypoint_scores': torch.stack(kptscores), # B x N } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/disk.py b/modelscope/models/cv/image_matching_fast/lightglue/disk.py index 8cb2195f..08d521c4 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue/disk.py +++ b/modelscope/models/cv/image_matching_fast/lightglue/disk.py @@ -6,20 +6,20 @@ from .utils import Extractor class DISK(Extractor): default_conf = { - "weights": "depth", - "max_num_keypoints": None, - "desc_dim": 128, - "nms_window_size": 5, - "detection_threshold": 0.0, - "pad_if_not_divisible": True, + 'weights': 'depth', + 'max_num_keypoints': None, + 'desc_dim': 128, + 'nms_window_size': 5, + 'detection_threshold': 0.0, + 'pad_if_not_divisible': True, } preprocess_conf = { - "resize": 1024, - "grayscale": False, + 'resize': 1024, + 'grayscale': False, } - required_data_keys = ["image"] + required_data_keys = ['image'] def __init__(self, **conf) -> None: super().__init__(**conf) # Update with default configuration. @@ -28,8 +28,8 @@ class DISK(Extractor): def forward(self, data: dict) -> dict: """Compute keypoints, scores, descriptors for image""" for key in self.required_data_keys: - assert key in data, f"Missing key {key} in data" - image = data["image"] + assert key in data, f'Missing key {key} in data' + image = data['image'] if image.shape[1] == 1: image = kornia.color.grayscale_to_rgb(image) features = self.model( @@ -49,7 +49,7 @@ class DISK(Extractor): descriptors = torch.stack(descriptors, 0) return { - "keypoints": keypoints.to(image).contiguous(), - "keypoint_scores": scores.to(image).contiguous(), - "descriptors": descriptors.to(image).contiguous(), + 'keypoints': keypoints.to(image).contiguous(), + 'keypoint_scores': scores.to(image).contiguous(), + 'descriptors': descriptors.to(image).contiguous(), } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py b/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py index e073c174..16888b55 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py +++ b/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py @@ -1,3 +1,4 @@ +import os.path as osp import warnings from pathlib import Path from types import SimpleNamespace @@ -8,13 +9,12 @@ import torch import torch.nn.functional as F from torch import nn -import os.path as osp try: from flash_attn.modules.mha import FlashCrossAttention except ModuleNotFoundError: FlashCrossAttention = None -if FlashCrossAttention or hasattr(F, "scaled_dot_product_attention"): +if FlashCrossAttention or hasattr(F, 'scaled_dot_product_attention'): FLASH_AVAILABLE = True else: FLASH_AVAILABLE = False @@ -23,9 +23,8 @@ torch.backends.cudnn.deterministic = True @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) -def normalize_keypoints( - kpts: torch.Tensor, size: Optional[torch.Tensor] = None -) -> torch.Tensor: +def normalize_keypoints(kpts: torch.Tensor, + size: Optional[torch.Tensor] = None) -> torch.Tensor: if size is None: size = 1 + kpts.max(-2).values - kpts.min(-2).values elif not isinstance(size, torch.Tensor): @@ -41,11 +40,14 @@ def pad_to_length(x: torch.Tensor, length: int) -> Tuple[torch.Tensor]: if length <= x.shape[-2]: return x, torch.ones_like(x[..., :1], dtype=torch.bool) pad = torch.ones( - *x.shape[:-2], length - x.shape[-2], x.shape[-1], device=x.device, dtype=x.dtype - ) + *x.shape[:-2], + length - x.shape[-2], + x.shape[-1], + device=x.device, + dtype=x.dtype) y = torch.cat([x, pad], dim=-2) mask = torch.zeros(*y.shape[:-1], 1, dtype=torch.bool, device=x.device) - mask[..., : x.shape[-2], :] = True + mask[..., :x.shape[-2], :] = True return y, mask @@ -55,12 +57,18 @@ def rotate_half(x: torch.Tensor) -> torch.Tensor: return torch.stack((-x2, x1), dim=-1).flatten(start_dim=-2) -def apply_cached_rotary_emb(freqs: torch.Tensor, t: torch.Tensor) -> torch.Tensor: +def apply_cached_rotary_emb(freqs: torch.Tensor, + t: torch.Tensor) -> torch.Tensor: return (t * freqs[0]) + (rotate_half(t) * freqs[1]) class LearnableFourierPositionalEncoding(nn.Module): - def __init__(self, M: int, dim: int, F_dim: int = None, gamma: float = 1.0) -> None: + + def __init__(self, + M: int, + dim: int, + F_dim: int = None, + gamma: float = 1.0) -> None: super().__init__() F_dim = F_dim if F_dim is not None else dim self.gamma = gamma @@ -76,6 +84,7 @@ class LearnableFourierPositionalEncoding(nn.Module): class TokenConfidence(nn.Module): + def __init__(self, dim: int) -> None: super().__init__() self.token = nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid()) @@ -89,27 +98,33 @@ class TokenConfidence(nn.Module): class Attention(nn.Module): + def __init__(self, allow_flash: bool) -> None: super().__init__() if allow_flash and not FLASH_AVAILABLE: warnings.warn( - "FlashAttention is not available. For optimal speed, " - "consider installing torch >= 2.0 or flash-attn.", + 'FlashAttention is not available. For optimal speed, ' + 'consider installing torch >= 2.0 or flash-attn.', stacklevel=2, ) self.enable_flash = allow_flash and FLASH_AVAILABLE - self.has_sdp = hasattr(F, "scaled_dot_product_attention") + self.has_sdp = hasattr(F, 'scaled_dot_product_attention') if allow_flash and FlashCrossAttention: self.flash_ = FlashCrossAttention() if self.has_sdp: torch.backends.cuda.enable_flash_sdp(allow_flash) - def forward(self, q, k, v, mask: Optional[torch.Tensor] = None) -> torch.Tensor: - if self.enable_flash and q.device.type == "cuda": + def forward(self, + q, + k, + v, + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + if self.enable_flash and q.device.type == 'cuda': # use torch 2.0 scaled_dot_product_attention with flash if self.has_sdp: args = [x.half().contiguous() for x in [q, k, v]] - v = F.scaled_dot_product_attention(*args, attn_mask=mask).to(q.dtype) + v = F.scaled_dot_product_attention( + *args, attn_mask=mask).to(q.dtype) return v if mask is None else v.nan_to_num() else: assert mask is None @@ -121,18 +136,21 @@ class Attention(nn.Module): v = F.scaled_dot_product_attention(*args, attn_mask=mask) return v if mask is None else v.nan_to_num() else: - s = q.shape[-1] ** -0.5 - sim = torch.einsum("...id,...jd->...ij", q, k) * s + s = q.shape[-1]**-0.5 + sim = torch.einsum('...id,...jd->...ij', q, k) * s if mask is not None: - sim.masked_fill(~mask, -float("inf")) + sim.masked_fill(~mask, -float('inf')) attn = F.softmax(sim, -1) - return torch.einsum("...ij,...jd->...id", attn, v) + return torch.einsum('...ij,...jd->...id', attn, v) class SelfBlock(nn.Module): - def __init__( - self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True - ) -> None: + + def __init__(self, + embed_dim: int, + num_heads: int, + flash: bool = False, + bias: bool = True) -> None: super().__init__() self.embed_dim = embed_dim self.num_heads = num_heads @@ -165,9 +183,12 @@ class SelfBlock(nn.Module): class CrossBlock(nn.Module): - def __init__( - self, embed_dim: int, num_heads: int, flash: bool = False, bias: bool = True - ) -> None: + + def __init__(self, + embed_dim: int, + num_heads: int, + flash: bool = False, + bias: bool = True) -> None: super().__init__() self.heads = num_heads dim_head = embed_dim // num_heads @@ -190,32 +211,35 @@ class CrossBlock(nn.Module): def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor): return func(x0), func(x1) - def forward( - self, x0: torch.Tensor, x1: torch.Tensor, mask: Optional[torch.Tensor] = None - ) -> List[torch.Tensor]: + def forward(self, + x0: torch.Tensor, + x1: torch.Tensor, + mask: Optional[torch.Tensor] = None) -> List[torch.Tensor]: qk0, qk1 = self.map_(self.to_qk, x0, x1) v0, v1 = self.map_(self.to_v, x0, x1) qk0, qk1, v0, v1 = map( lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2), (qk0, qk1, v0, v1), ) - if self.flash is not None and qk0.device.type == "cuda": + if self.flash is not None and qk0.device.type == 'cuda': m0 = self.flash(qk0, qk1, v1, mask) m1 = self.flash( - qk1, qk0, v0, mask.transpose(-1, -2) if mask is not None else None - ) + qk1, qk0, v0, + mask.transpose(-1, -2) if mask is not None else None) else: qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5 - sim = torch.einsum("bhid, bhjd -> bhij", qk0, qk1) + sim = torch.einsum('bhid, bhjd -> bhij', qk0, qk1) if mask is not None: - sim = sim.masked_fill(~mask, -float("inf")) + sim = sim.masked_fill(~mask, -float('inf')) attn01 = F.softmax(sim, dim=-1) attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1) - m0 = torch.einsum("bhij, bhjd -> bhid", attn01, v1) - m1 = torch.einsum("bhji, bhjd -> bhid", attn10.transpose(-2, -1), v0) + m0 = torch.einsum('bhij, bhjd -> bhid', attn01, v1) + m1 = torch.einsum('bhji, bhjd -> bhid', attn10.transpose(-2, -1), + v0) if mask is not None: m0, m1 = m0.nan_to_num(), m1.nan_to_num() - m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), m0, m1) + m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), + m0, m1) m0, m1 = self.map_(self.to_out, m0, m1) x0 = x0 + self.ffn(torch.cat([x0, m0], -1)) x1 = x1 + self.ffn(torch.cat([x1, m1], -1)) @@ -223,6 +247,7 @@ class CrossBlock(nn.Module): class TransformerLayer(nn.Module): + def __init__(self, *args, **kwargs): super().__init__() self.self_attn = SelfBlock(*args, **kwargs) @@ -238,7 +263,8 @@ class TransformerLayer(nn.Module): mask1: Optional[torch.Tensor] = None, ): if mask0 is not None and mask1 is not None: - return self.masked_forward(desc0, desc1, encoding0, encoding1, mask0, mask1) + return self.masked_forward(desc0, desc1, encoding0, encoding1, + mask0, mask1) else: desc0 = self.self_attn(desc0, encoding0) desc1 = self.self_attn(desc1, encoding1) @@ -254,14 +280,14 @@ class TransformerLayer(nn.Module): return self.cross_attn(desc0, desc1, mask) -def sigmoid_log_double_softmax( - sim: torch.Tensor, z0: torch.Tensor, z1: torch.Tensor -) -> torch.Tensor: +def sigmoid_log_double_softmax(sim: torch.Tensor, z0: torch.Tensor, + z1: torch.Tensor) -> torch.Tensor: """create the log assignment matrix from logits and similarity""" b, m, n = sim.shape certainties = F.logsigmoid(z0) + F.logsigmoid(z1).transpose(1, 2) scores0 = F.log_softmax(sim, 2) - scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), 2).transpose(-1, -2) + scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), + 2).transpose(-1, -2) scores = sim.new_full((b, m + 1, n + 1), 0) scores[:, :m, :n] = scores0 + scores1 + certainties scores[:, :-1, -1] = F.logsigmoid(-z0.squeeze(-1)) @@ -270,6 +296,7 @@ def sigmoid_log_double_softmax( class MatchAssignment(nn.Module): + def __init__(self, dim: int) -> None: super().__init__() self.dim = dim @@ -281,7 +308,7 @@ class MatchAssignment(nn.Module): mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1) _, _, d = mdesc0.shape mdesc0, mdesc1 = mdesc0 / d**0.25, mdesc1 / d**0.25 - sim = torch.einsum("bmd,bnd->bmn", mdesc0, mdesc1) + sim = torch.einsum('bmd,bnd->bmn', mdesc0, mdesc1) z0 = self.matchability(desc0) z1 = self.matchability(desc1) scores = sigmoid_log_double_softmax(sim, z0, z1) @@ -315,34 +342,34 @@ class LightGlue(nn.Module): # Point pruning involves an overhead (gather). # Therefore, we only activate it if there are enough keypoints. pruning_keypoint_thresholds = { - "cpu": -1, - "mps": -1, - "cuda": 1024, - "flash": 1536, + 'cpu': -1, + 'mps': -1, + 'cuda': 1024, + 'flash': 1536, } - required_data_keys = ["image0", "image1"] + required_data_keys = ['image0', 'image1'] - version = "v0.1_arxiv" - weight_path = "{}_lightglue.pth" + version = 'v0.1_arxiv' + weight_path = '{}_lightglue.pth' features = { - "superpoint": { - "weights": "superpoint_lightglue", - "input_dim": 256, + 'superpoint': { + 'weights': 'superpoint_lightglue', + 'input_dim': 256, }, - "disk": { - "weights": "disk_lightglue", - "input_dim": 128, + 'disk': { + 'weights': 'disk_lightglue', + 'input_dim': 128, }, - "aliked": { - "weights": "aliked_lightglue", - "input_dim": 128, + 'aliked': { + 'weights': 'aliked_lightglue', + 'input_dim': 128, }, - "sift": { - "weights": "sift_lightglue", - "input_dim": 128, - "add_scale_ori": True, + 'sift': { + 'weights': 'sift_lightglue', + 'input_dim': 128, + 'add_scale_ori': True, }, } @@ -352,77 +379,78 @@ class LightGlue(nn.Module): if conf.features is not None: if conf.features not in self.features: raise ValueError( - f"Unsupported features: {conf.features} not in " - f"{{{','.join(self.features)}}}" - ) + f'Unsupported features: {conf.features} not in ' + f"{{{','.join(self.features)}}}") for k, v in self.features[conf.features].items(): setattr(conf, k, v) if conf.input_dim != conf.descriptor_dim: - self.input_proj = nn.Linear(conf.input_dim, conf.descriptor_dim, bias=True) + self.input_proj = nn.Linear( + conf.input_dim, conf.descriptor_dim, bias=True) else: self.input_proj = nn.Identity() head_dim = conf.descriptor_dim // conf.num_heads self.posenc = LearnableFourierPositionalEncoding( - 2 + 2 * self.conf.add_scale_ori, head_dim, head_dim - ) + 2 + 2 * self.conf.add_scale_ori, head_dim, head_dim) h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim self.transformers = nn.ModuleList( - [TransformerLayer(d, h, conf.flash) for _ in range(n)] - ) + [TransformerLayer(d, h, conf.flash) for _ in range(n)]) - self.log_assignment = nn.ModuleList([MatchAssignment(d) for _ in range(n)]) + self.log_assignment = nn.ModuleList( + [MatchAssignment(d) for _ in range(n)]) self.token_confidence = nn.ModuleList( - [TokenConfidence(d) for _ in range(n - 1)] - ) + [TokenConfidence(d) for _ in range(n - 1)]) self.register_buffer( - "confidence_thresholds", - torch.Tensor( - [self.confidence_threshold(i) for i in range(self.conf.n_layers)] - ), + 'confidence_thresholds', + torch.Tensor([ + self.confidence_threshold(i) for i in range(self.conf.n_layers) + ]), ) state_dict = None if conf.features is not None: - fname = f"{conf.weights}_{self.version.replace('.', '-')}.pth" state_dict = torch.load( - osp.join(model_dir, - self.weight_path.format(conf.features)), map_location="cpu" - ) + osp.join(model_dir, self.weight_path.format(conf.features)), + map_location='cpu') self.load_state_dict(state_dict, strict=False) elif conf.weights is not None: path = Path(__file__).parent - path = path / "weights/{}.pth".format(self.conf.weights) - state_dict = torch.load(str(path), map_location="cpu") + path = path / 'weights/{}.pth'.format(self.conf.weights) + state_dict = torch.load(str(path), map_location='cpu') if state_dict: # rename old state dict entries for i in range(self.conf.n_layers): - pattern = f"self_attn.{i}", f"transformers.{i}.self_attn" - state_dict = {k.replace(*pattern): v for k, v in state_dict.items()} - pattern = f"cross_attn.{i}", f"transformers.{i}.cross_attn" - state_dict = {k.replace(*pattern): v for k, v in state_dict.items()} + pattern = f'self_attn.{i}', f'transformers.{i}.self_attn' + state_dict = { + k.replace(*pattern): v + for k, v in state_dict.items() + } + pattern = f'cross_attn.{i}', f'transformers.{i}.cross_attn' + state_dict = { + k.replace(*pattern): v + for k, v in state_dict.items() + } self.load_state_dict(state_dict, strict=False) # static lengths LightGlue is compiled for (only used with torch.compile) self.static_lengths = None - def compile( - self, mode="reduce-overhead", static_lengths=[256, 512, 768, 1024, 1280, 1536] - ): + def compile(self, + mode='reduce-overhead', + static_lengths=[256, 512, 768, 1024, 1280, 1536]): if self.conf.width_confidence != -1: warnings.warn( - "Point pruning is partially disabled for compiled forward.", + 'Point pruning is partially disabled for compiled forward.', stacklevel=2, ) for i in range(self.conf.n_layers): self.transformers[i].masked_forward = torch.compile( - self.transformers[i].masked_forward, mode=mode, fullgraph=True - ) + self.transformers[i].masked_forward, mode=mode, fullgraph=True) self.static_lengths = static_lengths @@ -447,30 +475,30 @@ class LightGlue(nn.Module): matching_scores1: [B x N] matches: List[[Si x 2]], scores: List[[Si]] """ - with torch.autocast(enabled=self.conf.mp, device_type="cuda"): + with torch.autocast(enabled=self.conf.mp, device_type='cuda'): return self._forward(data) def _forward(self, data: dict) -> dict: for key in self.required_data_keys: - assert key in data, f"Missing key {key} in data" - data0, data1 = data["image0"], data["image1"] - kpts0, kpts1 = data0["keypoints"], data1["keypoints"] + assert key in data, f'Missing key {key} in data' + data0, data1 = data['image0'], data['image1'] + kpts0, kpts1 = data0['keypoints'], data1['keypoints'] b, m, _ = kpts0.shape b, n, _ = kpts1.shape device = kpts0.device - size0, size1 = data0.get("image_size"), data1.get("image_size") + size0, size1 = data0.get('image_size'), data1.get('image_size') kpts0 = normalize_keypoints(kpts0, size0).clone() kpts1 = normalize_keypoints(kpts1, size1).clone() if self.conf.add_scale_ori: kpts0 = torch.cat( - [kpts0] + [data0[k].unsqueeze(-1) for k in ("scales", "oris")], -1 - ) + [kpts0] + [data0[k].unsqueeze(-1) for k in ('scales', 'oris')], + -1) kpts1 = torch.cat( - [kpts1] + [data1[k].unsqueeze(-1) for k in ("scales", "oris")], -1 - ) - desc0 = data0["descriptors"].detach().contiguous() - desc1 = data1["descriptors"].detach().contiguous() + [kpts1] + [data1[k].unsqueeze(-1) for k in ('scales', 'oris')], + -1) + desc0 = data0['descriptors'].detach().contiguous() + desc1 = data1['descriptors'].detach().contiguous() assert desc0.shape[-1] == self.conf.input_dim assert desc1.shape[-1] == self.conf.input_dim @@ -507,14 +535,14 @@ class LightGlue(nn.Module): token0, token1 = None, None for i in range(self.conf.n_layers): desc0, desc1 = self.transformers[i]( - desc0, desc1, encoding0, encoding1, mask0=mask0, mask1=mask1 - ) + desc0, desc1, encoding0, encoding1, mask0=mask0, mask1=mask1) if i == self.conf.n_layers - 1: continue # no early stopping or adaptive width at last layer if do_early_stop: token0, token1 = self.token_confidence[i](desc0, desc1) - if self.check_if_stop(token0[..., :m, :], token1[..., :n, :], i, m + n): + if self.check_if_stop(token0[..., :m, :], token1[..., :n, :], + i, m + n): break if do_point_pruning and desc0.shape[-2] > pruning_th: scores0 = self.log_assignment[i].get_matchability(desc0) @@ -535,7 +563,8 @@ class LightGlue(nn.Module): desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :] scores, _ = self.log_assignment[i](desc0, desc1) - m0, m1, mscores0, mscores1 = filter_matches(scores, self.conf.filter_threshold) + m0, m1, mscores0, mscores1 = filter_matches(scores, + self.conf.filter_threshold) matches, mscores = [], [] for k in range(b): valid = m0[k] > -1 @@ -551,8 +580,10 @@ class LightGlue(nn.Module): if do_point_pruning: m0_ = torch.full((b, m), -1, device=m0.device, dtype=m0.dtype) m1_ = torch.full((b, n), -1, device=m1.device, dtype=m1.dtype) - m0_[:, ind0] = torch.where(m0 == -1, -1, ind1.gather(1, m0.clamp(min=0))) - m1_[:, ind1] = torch.where(m1 == -1, -1, ind0.gather(1, m1.clamp(min=0))) + m0_[:, ind0] = torch.where(m0 == -1, -1, + ind1.gather(1, m0.clamp(min=0))) + m1_[:, ind1] = torch.where(m1 == -1, -1, + ind0.gather(1, m1.clamp(min=0))) mscores0_ = torch.zeros((b, m), device=mscores0.device) mscores1_ = torch.zeros((b, n), device=mscores1.device) mscores0_[:, ind0] = mscores0 @@ -563,15 +594,15 @@ class LightGlue(nn.Module): prune1 = torch.ones_like(mscores1) * self.conf.n_layers pred = { - "matches0": m0, - "matches1": m1, - "matching_scores0": mscores0, - "matching_scores1": mscores1, - "stop": i + 1, - "matches": matches, - "scores": mscores, - "prune0": prune0, - "prune1": prune1, + 'matches0': m0, + 'matches1': m1, + 'matching_scores0': mscores0, + 'matching_scores1': mscores1, + 'stop': i + 1, + 'matches': matches, + 'scores': mscores, + 'prune0': prune0, + 'prune1': prune1, } return pred @@ -581,9 +612,8 @@ class LightGlue(nn.Module): threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / self.conf.n_layers) return np.clip(threshold, 0, 1) - def get_pruning_mask( - self, confidences: torch.Tensor, scores: torch.Tensor, layer_index: int - ) -> torch.Tensor: + def get_pruning_mask(self, confidences: torch.Tensor, scores: torch.Tensor, + layer_index: int) -> torch.Tensor: """mask points which should be removed""" keep = scores > (1 - self.conf.width_confidence) if confidences is not None: # Low-confidence points are never pruned. @@ -600,11 +630,12 @@ class LightGlue(nn.Module): """evaluate stopping condition""" confidences = torch.cat([confidences0, confidences1], -1) threshold = self.confidence_thresholds[layer_index] - ratio_confident = 1.0 - (confidences < threshold).float().sum() / num_points + ratio_confident = 1.0 - ( + confidences < threshold).float().sum() / num_points # noqa E501 return ratio_confident > self.conf.depth_confidence def pruning_min_kpts(self, device: torch.device): - if self.conf.flash and FLASH_AVAILABLE and device.type == "cuda": - return self.pruning_keypoint_thresholds["flash"] + if self.conf.flash and FLASH_AVAILABLE and device.type == 'cuda': + return self.pruning_keypoint_thresholds['flash'] else: return self.pruning_keypoint_thresholds[device.type] diff --git a/modelscope/models/cv/image_matching_fast/lightglue/sift.py b/modelscope/models/cv/image_matching_fast/lightglue/sift.py index 802fc1c2..435d8f7f 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue/sift.py +++ b/modelscope/models/cv/image_matching_fast/lightglue/sift.py @@ -6,15 +6,20 @@ import torch from kornia.color import rgb_to_grayscale from packaging import version +from .utils import Extractor + try: import pycolmap except ImportError: pycolmap = None -from .utils import Extractor - -def filter_dog_point(points, scales, angles, image_shape, nms_radius, scores=None): +def filter_dog_point(points, + scales, + angles, + image_shape, + nms_radius, + scores=None): h, w = image_shape ij = np.round(points - 0.5).astype(int).T[::-1] @@ -72,59 +77,59 @@ def run_opencv_sift(features: cv2.Feature2D, image: np.ndarray) -> np.ndarray: points = np.array([k.pt for k in detections], dtype=np.float32) scores = np.array([k.response for k in detections], dtype=np.float32) scales = np.array([k.size for k in detections], dtype=np.float32) - angles = np.deg2rad(np.array([k.angle for k in detections], dtype=np.float32)) + angles = np.deg2rad( + np.array([k.angle for k in detections], dtype=np.float32)) return points, scores, scales, angles, descriptors class SIFT(Extractor): default_conf = { - "rootsift": True, - "nms_radius": 0, # None to disable filtering entirely. - "max_num_keypoints": 4096, - "backend": "opencv", # in {opencv, pycolmap, pycolmap_cpu, pycolmap_cuda} - "detection_threshold": 0.0066667, # from COLMAP - "edge_threshold": 10, - "first_octave": -1, # only used by pycolmap, the default of COLMAP - "num_octaves": 4, + 'rootsift': True, + 'nms_radius': 0, # None to disable filtering entirely. + 'max_num_keypoints': 4096, + 'backend': + 'opencv', # in {opencv, pycolmap, pycolmap_cpu, pycolmap_cuda} + 'detection_threshold': 0.0066667, # from COLMAP + 'edge_threshold': 10, + 'first_octave': -1, # only used by pycolmap, the default of COLMAP + 'num_octaves': 4, } preprocess_conf = { - "resize": 1024, + 'resize': 1024, } - required_data_keys = ["image"] + required_data_keys = ['image'] def __init__(self, **conf): super().__init__(**conf) # Update with default configuration. backend = self.conf.backend - if backend.startswith("pycolmap"): + if backend.startswith('pycolmap'): if pycolmap is None: raise ImportError( - "Cannot find module pycolmap: install it with pip" - "or use backend=opencv." - ) + 'Cannot find module pycolmap: install it with pip' + 'or use backend=opencv.') options = { - "peak_threshold": self.conf.detection_threshold, - "edge_threshold": self.conf.edge_threshold, - "first_octave": self.conf.first_octave, - "num_octaves": self.conf.num_octaves, - "normalization": pycolmap.Normalization.L2, # L1_ROOT is buggy. + 'peak_threshold': self.conf.detection_threshold, + 'edge_threshold': self.conf.edge_threshold, + 'first_octave': self.conf.first_octave, + 'num_octaves': self.conf.num_octaves, + 'normalization': + pycolmap.Normalization.L2, # L1_ROOT is buggy. } - device = ( - "auto" if backend == "pycolmap" else backend.replace("pycolmap_", "") - ) - if ( - backend == "pycolmap_cpu" or not pycolmap.has_cuda - ) and pycolmap.__version__ < "0.5.0": + device = ('auto' if backend == 'pycolmap' else backend.replace( + 'pycolmap_', '')) + if (backend == 'pycolmap_cpu' or not pycolmap.has_cuda + ) and pycolmap.__version__ < '0.5.0': # noqa E501 warnings.warn( - "The pycolmap CPU SIFT is buggy in version < 0.5.0, " - "consider upgrading pycolmap or use the CUDA version.", + 'The pycolmap CPU SIFT is buggy in version < 0.5.0, ' + 'consider upgrading pycolmap or use the CUDA version.', stacklevel=1, ) else: - options["max_num_features"] = self.conf.max_num_keypoints + options['max_num_features'] = self.conf.max_num_keypoints self.sift = pycolmap.Sift(options=options, device=device) - elif backend == "opencv": + elif backend == 'opencv': self.sift = cv2.SIFT_create( contrastThreshold=self.conf.detection_threshold, nfeatures=self.conf.max_num_keypoints, @@ -132,56 +137,52 @@ class SIFT(Extractor): nOctaveLayers=self.conf.num_octaves, ) else: - backends = {"opencv", "pycolmap", "pycolmap_cpu", "pycolmap_cuda"} - raise ValueError( - f"Unknown backend: {backend} not in " f"{{{','.join(backends)}}}." - ) + backends = {'opencv', 'pycolmap', 'pycolmap_cpu', 'pycolmap_cuda'} + raise ValueError(f'Unknown backend: {backend} not in ' + f"{{{','.join(backends)}}}.") def extract_single_image(self, image: torch.Tensor): image_np = image.cpu().numpy().squeeze(0) - if self.conf.backend.startswith("pycolmap"): - if version.parse(pycolmap.__version__) >= version.parse("0.5.0"): + if self.conf.backend.startswith('pycolmap'): + if version.parse(pycolmap.__version__) >= version.parse('0.5.0'): detections, descriptors = self.sift.extract(image_np) scores = None # Scores are not exposed by COLMAP anymore. else: detections, scores, descriptors = self.sift.extract(image_np) keypoints = detections[:, :2] # Keep only (x, y). scales, angles = detections[:, -2:].T - if scores is not None and ( - self.conf.backend == "pycolmap_cpu" or not pycolmap.has_cuda - ): + if scores is not None and (self.conf.backend == 'pycolmap_cpu' + or not pycolmap.has_cuda): # Set the scores as a combination of abs. response and scale. scores = np.abs(scores) * scales - elif self.conf.backend == "opencv": + elif self.conf.backend == 'opencv': # TODO: Check if opencv keypoints are already in corner convention keypoints, scores, scales, angles, descriptors = run_opencv_sift( - self.sift, (image_np * 255.0).astype(np.uint8) - ) + self.sift, (image_np * 255.0).astype(np.uint8)) pred = { - "keypoints": keypoints, - "scales": scales, - "oris": angles, - "descriptors": descriptors, + 'keypoints': keypoints, + 'scales': scales, + 'oris': angles, + 'descriptors': descriptors, } if scores is not None: - pred["keypoint_scores"] = scores + pred['keypoint_scores'] = scores # sometimes pycolmap returns points outside the image. We remove them - if self.conf.backend.startswith("pycolmap"): - is_inside = ( - pred["keypoints"] + 0.5 < np.array([image_np.shape[-2:][::-1]]) - ).all(-1) + if self.conf.backend.startswith('pycolmap'): + is_inside = (pred['keypoints'] + 0.5 < np.array( + [image_np.shape[-2:][::-1]])).all(-1) pred = {k: v[is_inside] for k, v in pred.items()} if self.conf.nms_radius is not None: keep = filter_dog_point( - pred["keypoints"], - pred["scales"], - pred["oris"], + pred['keypoints'], + pred['scales'], + pred['oris'], image_np.shape, self.conf.nms_radius, - scores=pred.get("keypoint_scores"), + scores=pred.get('keypoint_scores'), ) pred = {k: v[keep] for k, v in pred.items()} @@ -189,14 +190,15 @@ class SIFT(Extractor): if scores is not None: # Keep the k keypoints with highest score num_points = self.conf.max_num_keypoints - if num_points is not None and len(pred["keypoints"]) > num_points: - indices = torch.topk(pred["keypoint_scores"], num_points).indices + if num_points is not None and len(pred['keypoints']) > num_points: + indices = torch.topk(pred['keypoint_scores'], + num_points).indices pred = {k: v[indices] for k, v in pred.items()} return pred def forward(self, data: dict) -> dict: - image = data["image"] + image = data['image'] if image.shape[1] == 3: image = rgb_to_grayscale(image) device = image.device @@ -204,13 +206,16 @@ class SIFT(Extractor): pred = [] for k in range(len(image)): img = image[k] - if "image_size" in data.keys(): + if 'image_size' in data.keys(): # avoid extracting points in padded areas - w, h = data["image_size"][k] + w, h = data['image_size'][k] img = img[:, :h, :w] p = self.extract_single_image(img) pred.append(p) - pred = {k: torch.stack([p[k] for p in pred], 0).to(device) for k in pred[0]} + pred = { + k: torch.stack([p[k] for p in pred], 0).to(device) + for k in pred[0] + } if self.conf.rootsift: - pred["descriptors"] = sift_to_rootsift(pred["descriptors"]) + pred['descriptors'] = sift_to_rootsift(pred['descriptors']) return pred diff --git a/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py b/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py index 99280b40..0f628458 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py +++ b/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py @@ -42,12 +42,13 @@ # Adapted by Remi Pautrat, Philipp Lindenberger +import os.path as osp + import torch from kornia.color import rgb_to_grayscale from torch import nn from .utils import Extractor -import os.path as osp def simple_nms(scores, nms_radius: int): @@ -56,8 +57,7 @@ def simple_nms(scores, nms_radius: int): def max_pool(x): return torch.nn.functional.max_pool2d( - x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius - ) + x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius) zeros = torch.zeros_like(scores) max_mask = scores == max_pool(scores) @@ -80,19 +80,14 @@ def sample_descriptors(keypoints, descriptors, s: int = 8): """Interpolate descriptors at keypoint locations""" b, c, h, w = descriptors.shape keypoints = keypoints - s / 2 + 0.5 - keypoints /= torch.tensor( - [(w * s - s / 2 - 0.5), (h * s - s / 2 - 0.5)], - ).to( - keypoints - )[None] + keypoints /= torch.tensor([(w * s - s / 2 - 0.5), + (h * s - s / 2 - 0.5)], ).to(keypoints)[None] keypoints = keypoints * 2 - 1 # normalize to (-1, 1) - args = {"align_corners": True} if torch.__version__ >= "1.3" else {} + args = {'align_corners': True} if torch.__version__ >= '1.3' else {} descriptors = torch.nn.functional.grid_sample( - descriptors, keypoints.view(b, 1, -1, 2), mode="bilinear", **args - ) + descriptors, keypoints.view(b, 1, -1, 2), mode='bilinear', **args) descriptors = torch.nn.functional.normalize( - descriptors.reshape(b, c, -1), p=2, dim=1 - ) + descriptors.reshape(b, c, -1), p=2, dim=1) return descriptors @@ -106,20 +101,20 @@ class SuperPoint(Extractor): """ default_conf = { - "descriptor_dim": 256, - "nms_radius": 4, - "max_num_keypoints": None, - "detection_threshold": 0.0005, - "remove_borders": 4, + 'descriptor_dim': 256, + 'nms_radius': 4, + 'max_num_keypoints': None, + 'detection_threshold': 0.0005, + 'remove_borders': 4, } preprocess_conf = { - "resize": 1024, + 'resize': 1024, } - required_data_keys = ["image"] + required_data_keys = ['image'] - def __init__(self,model_dir, **conf): + def __init__(self, model_dir, **conf): super().__init__(**conf) # Update with default configuration. self.relu = nn.ReLU(inplace=True) self.pool = nn.MaxPool2d(kernel_size=2, stride=2) @@ -139,21 +134,19 @@ class SuperPoint(Extractor): self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1) self.convDb = nn.Conv2d( - c5, self.conf.descriptor_dim, kernel_size=1, stride=1, padding=0 - ) + c5, self.conf.descriptor_dim, kernel_size=1, stride=1, padding=0) - - weights_path = osp.join(model_dir,"superpoint_v1.pth") - self.load_state_dict(torch.load(weights_path, map_location="cpu")) + weights_path = osp.join(model_dir, 'superpoint_v1.pth') + self.load_state_dict(torch.load(weights_path, map_location='cpu')) if self.conf.max_num_keypoints is not None and self.conf.max_num_keypoints <= 0: - raise ValueError("max_num_keypoints must be positive or None") + raise ValueError('max_num_keypoints must be positive or None') def forward(self, data: dict) -> dict: """Compute keypoints, scores, descriptors for image""" for key in self.required_data_keys: - assert key in data, f"Missing key {key} in data" - image = data["image"] + assert key in data, f'Missing key {key} in data' + image = data['image'] if image.shape[1] == 3: image = rgb_to_grayscale(image) @@ -193,20 +186,18 @@ class SuperPoint(Extractor): # Separate into batches keypoints = [ - torch.stack(best_kp[1:3], dim=-1)[best_kp[0] == i] for i in range(b) + torch.stack(best_kp[1:3], dim=-1)[best_kp[0] == i] + for i in range(b) ] scores = [scores[best_kp[0] == i] for i in range(b)] # Keep the k keypoints with highest score if self.conf.max_num_keypoints is not None: keypoints, scores = list( - zip( - *[ - top_k_keypoints(k, s, self.conf.max_num_keypoints) - for k, s in zip(keypoints, scores) - ] - ) - ) + zip(*[ + top_k_keypoints(k, s, self.conf.max_num_keypoints) + for k, s in zip(keypoints, scores) + ])) # Convert (h, w) to (x, y) keypoints = [torch.flip(k, [1]).float() for k in keypoints] @@ -223,7 +214,10 @@ class SuperPoint(Extractor): ] return { - "keypoints": torch.stack(keypoints, 0), - "keypoint_scores": torch.stack(scores, 0), - "descriptors": torch.stack(descriptors, 0).transpose(-1, -2).contiguous(), + 'keypoints': + torch.stack(keypoints, 0), + 'keypoint_scores': + torch.stack(scores, 0), + 'descriptors': + torch.stack(descriptors, 0).transpose(-1, -2).contiguous(), } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/utils.py b/modelscope/models/cv/image_matching_fast/lightglue/utils.py index d1c1ab2e..86621e17 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue/utils.py +++ b/modelscope/models/cv/image_matching_fast/lightglue/utils.py @@ -11,11 +11,11 @@ import torch class ImagePreprocessor: default_conf = { - "resize": None, # target edge length, None for no resizing - "side": "long", - "interpolation": "bilinear", - "align_corners": None, - "antialias": True, + 'resize': None, # target edge length, None for no resizing + 'side': 'long', + 'interpolation': 'bilinear', + 'align_corners': None, + 'antialias': True, } def __init__(self, **conf) -> None: @@ -52,7 +52,9 @@ def map_tensor(input_, func: Callable): return input_ -def batch_to_device(batch: dict, device: str = "cpu", non_blocking: bool = True): +def batch_to_device(batch: dict, + device: str = 'cpu', + non_blocking: bool = True): """Move batch (dict) to device""" def _func(tensor): @@ -72,11 +74,11 @@ def rbd(data: dict) -> dict: def read_image(path: Path, grayscale: bool = False) -> np.ndarray: """Read an image from path as RGB or grayscale""" if not Path(path).exists(): - raise FileNotFoundError(f"No image at path {path}.") + raise FileNotFoundError(f'No image at path {path}.') mode = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR image = cv2.imread(str(path), mode) if image is None: - raise IOError(f"Could not read image at {path}.") + raise IOError(f'Could not read image at {path}.') if not grayscale: image = image[..., ::-1] return image @@ -89,20 +91,20 @@ def numpy_image_to_torch(image: np.ndarray) -> torch.Tensor: elif image.ndim == 2: image = image[None] # add channel axis else: - raise ValueError(f"Not an image: {image.shape}") + raise ValueError(f'Not an image: {image.shape}') return torch.tensor(image / 255.0, dtype=torch.float) def resize_image( image: np.ndarray, size: Union[List[int], int], - fn: str = "max", - interp: Optional[str] = "area", + fn: str = 'max', + interp: Optional[str] = 'area', ) -> np.ndarray: """Resize an image to a fixed size, or according to max or min edge.""" h, w = image.shape[:2] - fn = {"max": max, "min": min}[fn] + fn = {'max': max, 'min': min}[fn] if isinstance(size, int): scale = size / fn(h, w) h_new, w_new = int(round(h * scale)), int(round(w * scale)) @@ -111,12 +113,12 @@ def resize_image( h_new, w_new = size scale = (w_new / w, h_new / h) else: - raise ValueError(f"Incorrect new size: {size}") + raise ValueError(f'Incorrect new size: {size}') mode = { - "linear": cv2.INTER_LINEAR, - "cubic": cv2.INTER_CUBIC, - "nearest": cv2.INTER_NEAREST, - "area": cv2.INTER_AREA, + 'linear': cv2.INTER_LINEAR, + 'cubic': cv2.INTER_CUBIC, + 'nearest': cv2.INTER_NEAREST, + 'area': cv2.INTER_AREA, }[interp] return cv2.resize(image, (w_new, h_new), interpolation=mode), scale @@ -129,6 +131,7 @@ def load_image(path: Path, resize: int = None, **kwargs) -> torch.Tensor: class Extractor(torch.nn.Module): + def __init__(self, **conf): super().__init__() self.conf = SimpleNamespace(**{**self.default_conf, **conf}) @@ -140,10 +143,14 @@ class Extractor(torch.nn.Module): img = img[None] # add batch dim assert img.dim() == 4 and img.shape[0] == 1 shape = img.shape[-2:][::-1] - img, scales = ImagePreprocessor(**{**self.preprocess_conf, **conf})(img) - feats = self.forward({"image": img}) - feats["image_size"] = torch.tensor(shape)[None].to(img).float() - feats["keypoints"] = (feats["keypoints"] + 0.5) / scales[None] - 0.5 + img, scales = ImagePreprocessor(**{ + **self.preprocess_conf, + **conf + })( + img) + feats = self.forward({'image': img}) + feats['image_size'] = torch.tensor(shape)[None].to(img).float() + feats['keypoints'] = (feats['keypoints'] + 0.5) / scales[None] - 0.5 return feats @@ -152,13 +159,13 @@ def match_pair( matcher, image0: torch.Tensor, image1: torch.Tensor, - device: str = "cpu", + device: str = 'cpu', **preprocess, ): """Match a pair of images (image0, image1) with an extractor and matcher""" feats0 = extractor.extract(image0, **preprocess) feats1 = extractor.extract(image1, **preprocess) - matches01 = matcher({"image0": feats0, "image1": feats1}) + matches01 = matcher({'image0': feats0, 'image1': feats1}) data = [feats0, feats1, matches01] # remove batch dim and move to target device feats0, feats1, matches01 = [batch_to_device(rbd(x), device) for x in data] diff --git a/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py b/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py index 22dc3f65..13ea8a58 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py +++ b/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py @@ -22,10 +22,12 @@ def cm_RdGn(x): def cm_BlRdGn(x_): """Custom colormap: blue (-1) -> red (0.0) -> green (1).""" x = np.clip(x_, 0, 1)[..., None] * 2 - c = x * np.array([[0, 1.0, 0, 1.0]]) + (2 - x) * np.array([[1.0, 0, 0, 1.0]]) + c = x * np.array([[0, 1.0, 0, 1.0]]) + (2 - x) * np.array( + [[1.0, 0, 0, 1.0]]) xn = -np.clip(x_, -1, 0)[..., None] * 2 - cn = xn * np.array([[0, 0.1, 1, 1.0]]) + (2 - xn) * np.array([[1.0, 0, 0, 1.0]]) + cn = xn * np.array([[0, 0.1, 1, 1.0]]) + (2 - xn) * np.array( + [[1.0, 0, 0, 1.0]]) out = np.clip(np.where(x_[..., None] < 0, cn, c), 0, 1) return out @@ -39,7 +41,12 @@ def cm_prune(x_): return cm_BlRdGn(norm_x) -def plot_images(imgs, titles=None, cmaps="gray", dpi=100, pad=0.5, adaptive=True): +def plot_images(imgs, + titles=None, + cmaps='gray', + dpi=100, + pad=0.5, + adaptive=True): """Plot a set of images horizontally. Args: imgs: list of NumPy RGB (H, W, 3) or PyTorch RGB (3, H, W) or mono (H, W). @@ -49,9 +56,8 @@ def plot_images(imgs, titles=None, cmaps="gray", dpi=100, pad=0.5, adaptive=True """ # conversion to (H, W, 3) for torch.Tensor imgs = [ - img.permute(1, 2, 0).cpu().numpy() - if (isinstance(img, torch.Tensor) and img.dim() == 3) - else img + img.permute(1, 2, 0).cpu().numpy() if + (isinstance(img, torch.Tensor) and img.dim() == 3) else img for img in imgs ] @@ -65,8 +71,7 @@ def plot_images(imgs, titles=None, cmaps="gray", dpi=100, pad=0.5, adaptive=True ratios = [4 / 3] * n figsize = [sum(ratios) * 4.5, 4.5] fig, ax = plt.subplots( - 1, n, figsize=figsize, dpi=dpi, gridspec_kw={"width_ratios": ratios} - ) + 1, n, figsize=figsize, dpi=dpi, gridspec_kw={'width_ratios': ratios}) if n == 1: ax = [ax] for i in range(n): @@ -81,7 +86,7 @@ def plot_images(imgs, titles=None, cmaps="gray", dpi=100, pad=0.5, adaptive=True fig.tight_layout(pad=pad) -def plot_keypoints(kpts, colors="lime", ps=4, axes=None, a=1.0): +def plot_keypoints(kpts, colors='lime', ps=4, axes=None, a=1.0): """Plot keypoints for existing images. Args: kpts: list of ndarrays of size (N, 2). @@ -100,7 +105,14 @@ def plot_keypoints(kpts, colors="lime", ps=4, axes=None, a=1.0): ax.scatter(k[:, 0], k[:, 1], c=c, s=ps, linewidths=0, alpha=alpha) -def plot_matches(kpts0, kpts1, color=None, lw=1.5, ps=4, a=1.0, labels=None, axes=None): +def plot_matches(kpts0, + kpts1, + color=None, + lw=1.5, + ps=4, + a=1.0, + labels=None, + axes=None): """Plot matches for a pair of existing images. Args: kpts0, kpts1: corresponding keypoints of size (N, 2). @@ -160,25 +172,28 @@ def add_text( text, pos=(0.01, 0.99), fs=15, - color="w", - lcolor="k", + color='w', + lcolor='k', lwidth=2, - ha="left", - va="top", + ha='left', + va='top', ): ax = plt.gcf().axes[idx] t = ax.text( - *pos, text, fontsize=fs, ha=ha, va=va, color=color, transform=ax.transAxes - ) + *pos, + text, + fontsize=fs, + ha=ha, + va=va, + color=color, + transform=ax.transAxes) if lcolor is not None: - t.set_path_effects( - [ - path_effects.Stroke(linewidth=lwidth, foreground=lcolor), - path_effects.Normal(), - ] - ) + t.set_path_effects([ + path_effects.Stroke(linewidth=lwidth, foreground=lcolor), + path_effects.Normal(), + ]) def save_plot(path, **kw): """Save the current figure without any white margin.""" - plt.savefig(path, bbox_inches="tight", pad_inches=0, **kw) + plt.savefig(path, bbox_inches='tight', pad_inches=0, **kw) diff --git a/modelscope/models/cv/image_matching_fast/lightglue_model.py b/modelscope/models/cv/image_matching_fast/lightglue_model.py index c899a627..8043051c 100644 --- a/modelscope/models/cv/image_matching_fast/lightglue_model.py +++ b/modelscope/models/cv/image_matching_fast/lightglue_model.py @@ -13,9 +13,9 @@ from modelscope.models.base.base_torch_model import TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import OutputKeys from modelscope.utils.constant import ModelFile, Tasks -from .lightglue import LightGlue, SuperPoint, DISK, ALIKED, SIFT -from .lightglue.utils import rbd, numpy_image_to_torch from .config.default import lightglue_default_conf +from .lightglue import ALIKED, DISK, SIFT, LightGlue, SuperPoint +from .lightglue.utils import numpy_image_to_torch, rbd @MODELS.register_module( @@ -30,20 +30,28 @@ class LightGlueImageMatching(TorchModel): super().__init__(model_dir, **kwargs) - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 'mps', 'cpu' + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') # 'mps', 'cpu' + + features = lightglue_default_conf.get('features', 'superpoint') - features = lightglue_default_conf.get('features','superpoint') - if features == 'disk': - self.extractor = DISK(max_num_keypoints=max_num_keypoints).eval().to(self.device) + self.extractor = DISK( + max_num_keypoints=max_num_keypoints).eval().to(self.device) elif features == 'aliked': - self.extractor = ALIKED(max_num_keypoints=max_num_keypoints).eval().to(self.device) + self.extractor = ALIKED( + max_num_keypoints=max_num_keypoints).eval().to(self.device) elif features == 'sift': - self.extractor = SIFT(max_num_keypoints=max_num_keypoints).eval().to(self.device) + self.extractor = SIFT( + max_num_keypoints=max_num_keypoints).eval().to(self.device) else: - self.extractor = SuperPoint(model_dir=model_dir, max_num_keypoints=max_num_keypoints).eval().to(self.device) - - self.matcher = LightGlue(model_dir=model_dir, default_conf=lightglue_default_conf).eval().to(self.device) + self.extractor = SuperPoint( + model_dir=model_dir, + max_num_keypoints=max_num_keypoints).eval().to(self.device) + + self.matcher = LightGlue( + model_dir=model_dir, + default_conf=lightglue_default_conf).eval().to(self.device) def forward(self, inputs): ''' @@ -51,9 +59,11 @@ class LightGlueImageMatching(TorchModel): inputs: a dict with keys 'image0', 'image1' ''' - feats0 = self.extractor.extract(numpy_image_to_torch(inputs['image0']).to(self.device)) - feats1 = self.extractor.extract(numpy_image_to_torch(inputs['image1']).to(self.device)) - matches01 = self.matcher({"image0": feats0, "image1": feats1}) + feats0 = self.extractor.extract( + numpy_image_to_torch(inputs['image0']).to(self.device)) + feats1 = self.extractor.extract( + numpy_image_to_torch(inputs['image1']).to(self.device)) + matches01 = self.matcher({'image0': feats0, 'image1': feats1}) return [feats0, feats1, matches01] @@ -63,17 +73,21 @@ class LightGlueImageMatching(TorchModel): inputs: a list of feats0, feats1, matches01 ''' matching_result = inputs - feats0, feats1, matches01 = [ - rbd(x) for x in matching_result - ] # remove batch dimension + feats0, feats1, matches01 = [rbd(x) for x in matching_result + ] # remove batch dimension - kpts0, kpts1, matches = feats0["keypoints"], feats1["keypoints"], matches01["matches"] + kpts0, kpts1, matches = feats0['keypoints'], feats1[ + 'keypoints'], matches01['matches'] m_kpts0, m_kpts1 = kpts0[matches[..., 0]], kpts1[matches[..., 1]] # match confidence - confidence = matches01["scores"] + confidence = matches01['scores'] - matches_result = {'kpts0': m_kpts0,'kpts1': m_kpts1,'confidence': confidence} + matches_result = { + 'kpts0': m_kpts0, + 'kpts1': m_kpts1, + 'confidence': confidence + } results = {OutputKeys.MATCHES: matches_result} return results diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 4d74e965..17e210ac 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -296,7 +296,9 @@ else: ], 'human3d_render_pipeline': ['Human3DRenderPipeline'], 'human3d_animation_pipeline': ['Human3DAnimationPipeline'], - 'image_local_feature_matching_pipeline': ['ImageLocalFeatureMatchingPipeline'], + 'image_local_feature_matching_pipeline': [ + 'ImageLocalFeatureMatchingPipeline' + ], 'rife_video_frame_interpolation_pipeline': [ 'RIFEVideoFrameInterpolationPipeline' ], diff --git a/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py b/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py index 81fc60d0..a49ca08d 100644 --- a/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py +++ b/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py @@ -27,8 +27,10 @@ class ImageLocalFeatureMatchingPipeline(Pipeline): >>> from modelscope.pipelines import pipeline - >>> matcher = pipeline(Tasks.image_local_feature_matching, model='Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data') - >>> matcher([['https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching1.jpg','https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching2.jpg']]) + >>> matcher = pipeline(Tasks.image_local_feature_matching, + >>> model='Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data') + >>> matcher([['https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching1.jpg', + >>> 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching2.jpg']]) >>> [{ >>> 'matches': [array([[720.5 , 187.8 ], >>> [707.4 , 198.23334], @@ -69,7 +71,6 @@ class ImageLocalFeatureMatchingPipeline(Pipeline): """ super().__init__(model=model, **kwargs) - def load_image(self, img_name): img = LoadImage.convert_to_ndarray(img_name).astype(np.float32) img = img / 255. diff --git a/modelscope/pipelines/cv/image_matching_fast_pipeline.py b/modelscope/pipelines/cv/image_matching_fast_pipeline.py index 92e9b72b..8af15f72 100644 --- a/modelscope/pipelines/cv/image_matching_fast_pipeline.py +++ b/modelscope/pipelines/cv/image_matching_fast_pipeline.py @@ -67,10 +67,7 @@ class ImageMatchingFastPipeline(Pipeline): img1 = self.load_image(input[0]) img2 = self.load_image(input[1]) - return { - 'image0':img1, - 'image1':img2 - } + return {'image0': img1, 'image1': img2} def forward(self, input: Dict[str, Any]) -> list: results = self.model.inference(input) diff --git a/tests/pipelines/test_image_local_feature_matching.py b/tests/pipelines/test_image_local_feature_matching.py index 84c99d01..1a1503db 100644 --- a/tests/pipelines/test_image_local_feature_matching.py +++ b/tests/pipelines/test_image_local_feature_matching.py @@ -26,11 +26,12 @@ class ImageLocalFeatureMatchingTest(unittest.TestCase): 'data/test/images/image_matching1.jpg', 'data/test/images/image_matching2.jpg' ]] - estimator = pipeline(Tasks.image_local_feature_matching, model=self.model_id) + estimator = pipeline( + Tasks.image_local_feature_matching, model=self.model_id) result = estimator(input_location) kpts0, kpts1, conf = result[0][OutputKeys.MATCHES] vis_img = result[0][OutputKeys.OUTPUT_IMG] - cv2.imwrite("vis_demo.jpg", vis_img) + cv2.imwrite('vis_demo.jpg', vis_img) print('test_image_local_feature_matching DONE') diff --git a/tests/pipelines/test_image_matching_fast.py b/tests/pipelines/test_image_matching_fast.py index fa352cdd..87769c4d 100644 --- a/tests/pipelines/test_image_matching_fast.py +++ b/tests/pipelines/test_image_matching_fast.py @@ -32,7 +32,7 @@ class ImageMatchingFastTest(unittest.TestCase): kpts1, confidence, output_filename='lightglue-matches.png', - method="lightglue") + method='lightglue') print('test_image_matching DONE') From c3bb9e71cf45040438991ae90d75fff28f08712f Mon Sep 17 00:00:00 2001 From: liuyhwangyh Date: Mon, 22 Jan 2024 22:51:09 +0800 Subject: [PATCH 2/2] add download retry reason message and some optimize (#734) 1. optimize download retyr message 2. fix input_output pipeline_info bug on python3.8 Co-authored-by: mulin.lyh --- modelscope/hub/constants.py | 4 +--- modelscope/hub/file_download.py | 12 +++++------ modelscope/hub/snapshot_download.py | 8 +++++++- modelscope/hub/utils/caching.py | 32 +++++++++++++++++++++++------ modelscope/utils/input_output.py | 5 ++++- 5 files changed, 44 insertions(+), 17 deletions(-) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 3ebc167d..362f323d 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -19,7 +19,7 @@ REQUESTS_API_HTTP_METHOD = ['get', 'head', 'post', 'put', 'patch', 'delete'] API_HTTP_CLIENT_TIMEOUT = 60 API_RESPONSE_FIELD_DATA = 'Data' API_FILE_DOWNLOAD_RETRY_TIMES = 5 -API_FILE_DOWNLOAD_TIMEOUT = 30 +API_FILE_DOWNLOAD_TIMEOUT = 60 API_FILE_DOWNLOAD_CHUNK_SIZE = 1024 * 1024 * 16 API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' API_RESPONSE_FIELD_USERNAME = 'Username' @@ -29,8 +29,6 @@ MODELSCOPE_CLOUD_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' MODELSCOPE_CLOUD_USERNAME = 'MODELSCOPE_USERNAME' MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 -MODEL_META_FILE_NAME = '.mdl' -MODEL_META_MODEL_ID = 'id' MODELSCOPE_REQUEST_ID = 'X-Request-ID' diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index c37b716a..e4cc21fe 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -190,7 +190,7 @@ def get_file_download_url(model_id: str, file_path: str, revision: str): def download_part_with_retry(params): # unpack parameters - progress, start, end, url, file_name, cookies, headers = params + model_file_name, progress, start, end, url, file_name, cookies, headers = params get_headers = {} if headers is None else copy.deepcopy(headers) get_headers['Range'] = 'bytes=%s-%s' % (start, end) get_headers['X-Request-ID'] = str(uuid.uuid4().hex) @@ -216,8 +216,8 @@ def download_part_with_retry(params): break except (Exception) as e: # no matter what exception, we will retry. retry = retry.increment('GET', url, error=e) - logger.warning('Download file from: %s to: %s failed, will retry' % - (start, end)) + logger.warning('Downloading: %s failed, reason: %s will retry' % + (model_file_name, e)) retry.sleep() @@ -246,10 +246,10 @@ def parallel_download( for idx in range(int(file_size / PART_SIZE)): start = idx * PART_SIZE end = (idx + 1) * PART_SIZE - 1 - tasks.append( - (progress, start, end, url, temp_file.name, cookies, headers)) + tasks.append((file_name, progress, start, end, url, temp_file.name, + cookies, headers)) if end + 1 < file_size: - tasks.append((progress, end + 1, file_size - 1, url, + tasks.append((file_name, progress, end + 1, file_size - 1, url, temp_file.name, cookies, headers)) parallels = MODELSCOPE_DOWNLOAD_PARALLELS if MODELSCOPE_DOWNLOAD_PARALLELS <= 4 else 4 with ThreadPoolExecutor( diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index 078dd65f..aafd4cd9 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -103,6 +103,10 @@ def snapshot_download(model_id: str, 'Snapshot': 'True' } } + if cache.cached_model_revision is not None: + snapshot_header[ + 'cached_model_revision'] = cache.cached_model_revision + model_files = _api.get_model_files( model_id=model_id, revision=revision, @@ -158,7 +162,9 @@ def snapshot_download(model_id: str, temp_file = os.path.join(temp_cache_dir, model_file['Name']) if FILE_HASH in model_file: file_integrity_validation(temp_file, model_file[FILE_HASH]) - # put file to cache + # put file into to cache cache.put_file(model_file, temp_file) + cache.save_model_version(revision=revision) + return os.path.join(cache.get_root_location()) diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py index f92aaaf4..78f3929d 100644 --- a/modelscope/hub/utils/caching.py +++ b/modelscope/hub/utils/caching.py @@ -6,7 +6,6 @@ import pickle import tempfile from shutil import move, rmtree -from modelscope.hub.constants import MODEL_META_FILE_NAME, MODEL_META_MODEL_ID from modelscope.utils.logger import get_logger logger = get_logger() @@ -16,6 +15,9 @@ logger = get_logger() class FileSystemCache(object): KEY_FILE_NAME = '.msc' + MODEL_META_FILE_NAME = '.mdl' + MODEL_META_MODEL_ID = 'id' + MODEL_VERSION_FILE_NAME = '.mv' """Local file cache. """ @@ -133,24 +135,42 @@ class ModelFileSystemCache(FileSystemCache): self.load_model_meta() else: super().__init__(os.path.join(cache_root, owner, name)) - self.model_meta = {MODEL_META_MODEL_ID: '%s/%s' % (owner, name)} + self.model_meta = { + FileSystemCache.MODEL_META_MODEL_ID: '%s/%s' % (owner, name) + } self.save_model_meta() + self.cached_model_revision = self.load_model_version() def load_model_meta(self): meta_file_path = os.path.join(self.cache_root_location, - MODEL_META_FILE_NAME) + FileSystemCache.MODEL_META_FILE_NAME) if os.path.exists(meta_file_path): with open(meta_file_path, 'rb') as f: self.model_meta = pickle.load(f) else: - self.model_meta = {MODEL_META_MODEL_ID: 'unknown'} + self.model_meta = {FileSystemCache.MODEL_META_MODEL_ID: 'unknown'} + + def load_model_version(self): + model_version_file_path = os.path.join( + self.cache_root_location, FileSystemCache.MODEL_VERSION_FILE_NAME) + if os.path.exists(model_version_file_path): + with open(model_version_file_path, 'r') as f: + return f.read().strip() + else: + return None + + def save_model_version(self, revision: str): + model_version_file_path = os.path.join( + self.cache_root_location, FileSystemCache.MODEL_VERSION_FILE_NAME) + with open(model_version_file_path, 'w') as f: + f.write(revision) def get_model_id(self): - return self.model_meta[MODEL_META_MODEL_ID] + return self.model_meta[FileSystemCache.MODEL_META_MODEL_ID] def save_model_meta(self): meta_file_path = os.path.join(self.cache_root_location, - MODEL_META_FILE_NAME) + FileSystemCache.MODEL_META_FILE_NAME) with open(meta_file_path, 'wb') as f: pickle.dump(self.model_meta, f) diff --git a/modelscope/utils/input_output.py b/modelscope/utils/input_output.py index 5e3e1305..b8e1df9a 100644 --- a/modelscope/utils/input_output.py +++ b/modelscope/utils/input_output.py @@ -547,6 +547,9 @@ class PipelineInfomation(): }, } + def __getitem__(self, key): + return self.__dict__.get('_%s' % key) + def is_url(url: str): """Check the input url is valid url. @@ -645,7 +648,7 @@ def call_pipeline_with_json(pipeline_info: PipelineInfomation, # result = pipeline(**pipeline_inputs) # else: pipeline_inputs, parameters = service_base64_input_to_pipeline_input( - pipeline_info['task_name'], body) + pipeline_info.task_name, body) result = pipeline(pipeline_inputs, **parameters) return result