From 9187103e3a32d4048e79e57d23fa596b2d1bffd5 Mon Sep 17 00:00:00 2001
From: "yichang.zyc"
Date: Tue, 1 Nov 2022 09:57:31 +0800
Subject: [PATCH] [to #42322933] Support the newly added CLIP huge model
 Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10585552
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* compatible with ViT-Huge, and set CLIP base as the default multi-modal
  embedding pipeline
---
 modelscope/models/multi_modal/clip/model.py | 6 ++++--
 modelscope/pipelines/builder.py             | 5 ++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py
index b1c84292..9b82e4a1 100644
--- a/modelscope/models/multi_modal/clip/model.py
+++ b/modelscope/models/multi_modal/clip/model.py
@@ -349,11 +349,13 @@ class CLIP(nn.Module):
         text_num_hidden_layers: int,
         text_type_vocab_size: int,
         tokenizer: FullTokenizer,
+        # vision_head_width: added so ViT-H can override the hardcoded 64
+        vision_head_width: int = 64,
     ):
         super().__init__()
 
         if isinstance(vision_layers, (tuple, list)):
-            vision_heads = vision_width * 32 // 64
+            vision_heads = vision_width * 32 // vision_head_width
             self.visual = ModifiedResNet(
                 layers=vision_layers,
                 output_dim=embed_dim,
@@ -361,7 +363,7 @@ class CLIP(nn.Module):
                 input_resolution=image_resolution,
                 width=vision_width)
         else:
-            vision_heads = vision_width // 64
+            vision_heads = vision_width // vision_head_width
             self.visual = VisualTransformer(
                 input_resolution=image_resolution,
                 patch_size=vision_patch_size,
diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py
index 498c9ed8..70f8f11c 100644
--- a/modelscope/pipelines/builder.py
+++ b/modelscope/pipelines/builder.py
@@ -93,9 +93,8 @@ DEFAULT_MODEL_FOR_PIPELINE = {
                          'damo/cv_resnet50_live-category'),
     Tasks.video_category: (Pipelines.video_category,
                            'damo/cv_resnet50_video-category'),
-    Tasks.multi_modal_embedding:
-    (Pipelines.multi_modal_embedding,
-     'damo/multi-modal_clip-vit-large-patch14_zh'),
+    Tasks.multi_modal_embedding: (Pipelines.multi_modal_embedding,
+                                  'damo/multi-modal_clip-vit-base-patch16_zh'),
     Tasks.generative_multi_modal_embedding:
     (Pipelines.generative_multi_modal_embedding,
      'damo/multi-modal_gemm-vit-large-patch14_generative-multi-modal-embedding'
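
For context, the sketch below mirrors the vision_heads computation that this
patch parameterizes. It is a standalone illustration, not part of the patch;
the ViT-B/16, ViT-L/14, and ViT-H/14 widths used here are the standard CLIP
tower configurations and are assumptions about the ModelScope checkpoints,
not values taken from this diff.

    def vision_heads(vision_width, vision_layers, vision_head_width=64):
        """Mirror of the head-count logic in CLIP.__init__ after this patch."""
        if isinstance(vision_layers, (tuple, list)):
            # ModifiedResNet branch: heads for the attention-pooling layer
            return vision_width * 32 // vision_head_width
        # VisualTransformer branch
        return vision_width // vision_head_width

    assert vision_heads(768, 12) == 12                         # ViT-B/16
    assert vision_heads(1024, 24) == 16                        # ViT-L/14
    assert vision_heads(1280, 32, vision_head_width=80) == 16  # ViT-H/14

With the previously hardcoded 64, a ViT-H/14 tower (width 1280, 80-wide heads)
would have been built with 1280 // 64 = 20 heads instead of 16. Keeping 64 as
the default value leaves the existing ViT-B and ViT-L call sites unchanged.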