From 25298e4f127a874578ffe4f579f10f48ade9bf42 Mon Sep 17 00:00:00 2001
From: Yingda Chen
Date: Mon, 22 Aug 2022 10:48:46 +0800
Subject: [PATCH] Revert "[to #42322933] Update Chinese CLIP model inference
 code"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 2ad757a43420be7089f88d9778486274b7976f85.
---
 modelscope/models/multi_modal/__init__.py | 2 -
 .../models/multi_modal/clip/__init__.py | 2 +-
 .../models/multi_modal/clip/bert_tokenizer.py | 422 -----
 .../models/multi_modal/clip/clip_bert.py | 29 +
 .../models/multi_modal/clip/clip_vit.py | 131 ++++
 .../multi_modal/clip/configuration_bert.py | 82 ---
 modelscope/models/multi_modal/clip/model.py | 677 ------
 .../models/multi_modal/clip/modeling_bert.py | 507 -------
 .../models/multi_modal/mplug/clip/clip.py | 62 +-
 tests/pipelines/test_multi_modal_embedding.py | 66 +-
 ...test_clip_multi_modal_embedding_trainer.py | 60 ++
 11 files changed, 251 insertions(+), 1789 deletions(-)
 delete mode 100644 modelscope/models/multi_modal/clip/bert_tokenizer.py
 create mode 100644 modelscope/models/multi_modal/clip/clip_bert.py
 create mode 100644 modelscope/models/multi_modal/clip/clip_vit.py
 delete mode 100644 modelscope/models/multi_modal/clip/configuration_bert.py
 delete mode 100644 modelscope/models/multi_modal/clip/model.py
 delete mode 100644 modelscope/models/multi_modal/clip/modeling_bert.py
 create mode 100644 tests/trainers/test_clip_multi_modal_embedding_trainer.py

diff --git a/modelscope/models/multi_modal/__init__.py b/modelscope/models/multi_modal/__init__.py
index 6c40a3da..9a0636ee 100644
--- a/modelscope/models/multi_modal/__init__.py
+++ b/modelscope/models/multi_modal/__init__.py
@@ -12,8 +12,6 @@ if TYPE_CHECKING:
     from .mplug_for_visual_question_answering import \
         MPlugForVisualQuestionAnswering
     from .ofa_for_all_tasks import OfaForAllTasks
-    from .ofa_for_text_to_image_synthesis_model import \
-        OfaForTextToImageSynthesis
 
 else:
     _import_structure = {
diff --git a/modelscope/models/multi_modal/clip/__init__.py b/modelscope/models/multi_modal/clip/__init__.py
index 3fd492b9..bb2fb3b2 100644
--- a/modelscope/models/multi_modal/clip/__init__.py
+++ b/modelscope/models/multi_modal/clip/__init__.py
@@ -1 +1 @@
-from .model import CLIPForMultiModalEmbedding
+from .clip_model import CLIPForMultiModalEmbedding
diff --git a/modelscope/models/multi_modal/clip/bert_tokenizer.py b/modelscope/models/multi_modal/clip/bert_tokenizer.py
deleted file mode 100644
index 8d356f42..00000000
--- a/modelscope/models/multi_modal/clip/bert_tokenizer.py
+++ /dev/null
@@ -1,422 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
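# The __init__.py hunk above uses modelscope's TYPE_CHECKING plus
# _import_structure convention: type checkers see real imports, while at
# runtime symbols load lazily on first attribute access. A minimal,
# self-contained sketch of that pattern (PEP 562); the stdlib `json`
# module stands in for the real submodules, and modelscope's actual
# _LazyModule implementation differs in detail.
import importlib
import sys
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from json import dumps  # resolved statically by type checkers
else:
    _import_structure = {'json': ['dumps']}

    def __getattr__(name):
        # Import the owning module only when the symbol is first touched.
        for module_name, symbols in _import_structure.items():
            if name in symbols:
                return getattr(importlib.import_module(module_name), name)
        raise AttributeError(name)

print(getattr(sys.modules[__name__], 'dumps')({'ok': True}))  # lazy lookup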
-"""Tokenization classes.""" - -from __future__ import absolute_import, division, print_function -import collections -import os -import re -import unicodedata - -import six - - -def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): - """Checks whether the casing config is consistent with the checkpoint name.""" - - # The casing has to be passed in by the user and there is no explicit check - # as to whether it matches the checkpoint. The casing information probably - # should have been stored in the bert_config.json file, but it's not, so - # we have to heuristically detect it to validate. - - if not init_checkpoint: - return - - m = re.match('^.*?([A-Za-z0-9_-]+)/bert_model.ckpt', init_checkpoint) - if m is None: - return - - model_name = m.group(1) - - lower_models = [ - 'uncased_L-24_H-1024_A-16', 'uncased_L-12_H-768_A-12', - 'multilingual_L-12_H-768_A-12', 'chinese_L-12_H-768_A-12' - ] - - cased_models = [ - 'cased_L-12_H-768_A-12', 'cased_L-24_H-1024_A-16', - 'multi_cased_L-12_H-768_A-12' - ] - - is_bad_config = False - if model_name in lower_models and not do_lower_case: - is_bad_config = True - actual_flag = 'False' - case_name = 'lowercased' - opposite_flag = 'True' - - if model_name in cased_models and do_lower_case: - is_bad_config = True - actual_flag = 'True' - case_name = 'cased' - opposite_flag = 'False' - - if is_bad_config: - raise ValueError( - 'You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. ' - 'However, `%s` seems to be a %s model, so you ' - 'should pass in `--do_lower_case=%s` so that the fine-tuning matches ' - 'how the model was pre-training. If this error is wrong, please ' - 'just comment out this check.' % - (actual_flag, init_checkpoint, model_name, case_name, - opposite_flag)) - - -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode('utf-8', 'ignore') - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode('utf-8', 'ignore') - elif isinstance(text, unicode): - return text - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - else: - raise ValueError('Not running on Python2 or Python 3?') - - -def printable_text(text): - """Returns text encoded in a way suitable for print or `tf.logging`.""" - - # These functions want `str` for both Python2 and Python3, but in one case - # it's a Unicode string and in the other it's a byte string. 
- if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode('utf-8', 'ignore') - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode('utf-8') - else: - raise ValueError('Unsupported string type: %s' % (type(text))) - else: - raise ValueError('Not running on Python2 or Python 3?') - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, 'r') as reader: - while True: - token = convert_to_unicode(reader.readline()) - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def convert_by_vocab(vocab, items): - """Converts a sequence of [tokens|ids] using the vocab.""" - output = [] - for item in items: - output.append(vocab[item]) - return output - - -def convert_tokens_to_ids(vocab, tokens): - return convert_by_vocab(vocab, tokens) - - -def convert_ids_to_tokens(inv_vocab, ids): - return convert_by_vocab(inv_vocab, ids) - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class FullTokenizer(object): - """Runs end-to-end tokenziation.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - @staticmethod - def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): - """ Converts a sequence of tokens (string) in a single string. """ - - def clean_up_tokenization(out_string): - """ Clean up a list of simple English tokenization artifacts - like spaces before punctuations and abreviated forms. - """ - out_string = ( - out_string.replace(' .', '.').replace(' ?', '?').replace( - ' !', '!').replace(' ,', ',').replace(" ' ", "'").replace( - " n't", "n't").replace(" 'm", "'m").replace( - " 's", "'s").replace(" 've", - "'ve").replace(" 're", "'re")) - return out_string - - text = ' '.join(tokens).replace(' ##', '').strip() - if clean_up_tokenization_spaces: - clean_text = clean_up_tokenization(text) - return clean_text - else: - return text - - def vocab_size(self): - return len(self.vocab) - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, do_lower_case=True): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = convert_to_unicode(text) - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. 
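# Usage sketch for the FullTokenizer being deleted above. It assumes the
# pre-revert module path still resolves; the toy vocabulary exists only
# to make the WordPiece output reproducible.
import tempfile

from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer

vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', 'un', '##aff', '##able']
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write('\n'.join(vocab) + '\n')
    vocab_file = f.name

tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=True)
tokens = tokenizer.tokenize('Unaffable')        # ['un', '##aff', '##able']
print(tokenizer.convert_tokens_to_ids(tokens))  # [4, 5, 6]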
This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(' '.join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize('NFD', text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == 'Mn': - continue - output.append(char) - return ''.join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return [''.join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(' ') - output.append(char) - output.append(' ') - else: - output.append(char) - return ''.join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) - or (cp >= 0x20000 and cp <= 0x2A6DF) - or (cp >= 0x2A700 and cp <= 0x2B73F) - or (cp >= 0x2B740 and cp <= 0x2B81F) - or (cp >= 0x2B820 and cp <= 0x2CEAF) - or (cp >= 0xF900 and cp <= 0xFAFF) - or (cp >= 0x2F800 and cp <= 0x2FA1F)): - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(' ') - else: - output.append(char) - return ''.join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenziation.""" - - def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. 
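# Standalone illustration of the CJK-padding step implemented above:
# every CJK ideograph is surrounded by spaces so that the subsequent
# whitespace split treats each character as its own token. Only a subset
# of the codepoint ranges checked by _is_chinese_char is replicated here.
def is_cjk_codepoint(cp: int) -> bool:
    return (0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF
            or 0x20000 <= cp <= 0x2A6DF or 0xF900 <= cp <= 0xFAFF)

def pad_cjk(text: str) -> str:
    out = []
    for ch in text:
        out.append(f' {ch} ' if is_cjk_codepoint(ord(ch)) else ch)
    return ''.join(out)

print(pad_cjk('CLIP模型').split())  # ['CLIP', '模', '型']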
This should have - already been passed through `BasicTokenizer. - - Returns: - A list of wordpiece tokens. - """ - - text = convert_to_unicode(text) - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = ''.join(chars[start:end]) - if start > 0: - substr = '##' + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == ' ' or char == '\t' or char == '\n' or char == '\r': - return True - cat = unicodedata.category(char) - if cat == 'Zs': - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == '\t' or char == '\n' or char == '\r': - return False - cat = unicodedata.category(char) - if cat in ('Cc', 'Cf'): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) - or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith('P'): - return True - return False diff --git a/modelscope/models/multi_modal/clip/clip_bert.py b/modelscope/models/multi_modal/clip/clip_bert.py new file mode 100644 index 00000000..24ccc1fa --- /dev/null +++ b/modelscope/models/multi_modal/clip/clip_bert.py @@ -0,0 +1,29 @@ +import torch.nn as nn +from transformers import BertConfig, BertForMaskedLM + + +class TextTransformer(nn.Module): + + def __init__(self, config_dict, feat_dim=768, use_grad_ckp=True): + super(TextTransformer, self).__init__() + bert_config = BertConfig.from_dict(config_dict) + if use_grad_ckp: + bert_config.gradient_checkpointing = True + + self.bert = BertForMaskedLM(bert_config).bert + + self.projector = nn.Linear( + bert_config.hidden_size, feat_dim, bias=False) + + def forward(self, input_ids, attention_mask): + trans_features = { + 'input_ids': input_ids, + 'attention_mask': attention_mask + } + + output_states = self.bert(**trans_features, return_dict=False) + output_tokens = output_states[0] + + cls_tokens = output_tokens[:, 0, :] + + return self.projector(cls_tokens) diff --git a/modelscope/models/multi_modal/clip/clip_vit.py b/modelscope/models/multi_modal/clip/clip_vit.py new file mode 100644 index 00000000..cfe67426 --- /dev/null +++ b/modelscope/models/multi_modal/clip/clip_vit.py @@ -0,0 +1,131 @@ +# Copyright 2021 The OpenAI CLIP Authors. All rights reserved. 
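# Usage sketch for the two towers introduced by this revert: the
# TextTransformer from clip_bert.py above and the VisionTransformer from
# clip_vit.py below. All sizes are shrunk, illustrative values, not the
# shipped Chinese-CLIP configuration; requires torch and transformers.
import torch

from modelscope.models.multi_modal.clip.clip_bert import TextTransformer
from modelscope.models.multi_modal.clip.clip_vit import VisionTransformer

text_tower = TextTransformer(
    {
        'vocab_size': 100,
        'hidden_size': 32,
        'num_hidden_layers': 2,
        'num_attention_heads': 4,
        'intermediate_size': 64,
        'max_position_embeddings': 64,
    },
    feat_dim=16,
    use_grad_ckp=False)
vision_tower = VisionTransformer(
    input_resolution=32,   # tiny image
    patch_size=8,          # -> (32 / 8) ** 2 = 16 patches + 1 class token
    width=64,
    layers=2,
    heads=4,
    output_dim=16,
    use_grad_ckp=False)

input_ids = torch.randint(0, 100, (2, 8))
text_feats = text_tower(input_ids, torch.ones_like(input_ids))
image_feats = vision_tower(torch.randn(2, 3, 32, 32))
print(text_feats.shape, image_feats.shape)  # both torch.Size([2, 16])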
+ +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from torch import nn + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + use_grad_ckp: bool = True): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + self.use_grad_ckp = use_grad_ckp + + def forward(self, x: torch.Tensor): + if self.use_grad_ckp: + for each_block in self.resblocks: + x = checkpoint.checkpoint(each_block, x) + return x + else: + return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int, use_grad_ckp: bool): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer( + width, layers, heads, use_grad_ckp=use_grad_ckp) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + class_embeddings = self.class_embedding.to(x.dtype) + \ + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([class_embeddings, x], dim=1) + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x diff --git a/modelscope/models/multi_modal/clip/configuration_bert.py 
b/modelscope/models/multi_modal/clip/configuration_bert.py
deleted file mode 100644
index b75f5db8..00000000
--- a/modelscope/models/multi_modal/clip/configuration_bert.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" BERT model configuration """
-
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-class BertConfig(object):
-    r"""
-    :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
-    `BertModel`.
-
-
-    Arguments:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
-        hidden_size: Size of the encoder layers and the pooler layer.
-        num_hidden_layers: Number of hidden layers in the Transformer encoder.
-        num_attention_heads: Number of attention heads for each attention layer in
-            the Transformer encoder.
-        intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-            layer in the Transformer encoder.
-        hidden_act: The non-linear activation function (function or string) in the
-            encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-        hidden_dropout_prob: The dropout probability for all fully connected
-            layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob: The dropout ratio for the attention
-            probabilities.
-        max_position_embeddings: The maximum sequence length that this model might
-            ever be used with. Typically set this to something large just in case
-            (e.g., 512 or 1024 or 2048).
-        type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-            `BertModel`.
-        initializer_range: The stddev of the truncated_normal_initializer for
-            initializing all weight matrices.
-        layer_norm_eps: The epsilon used by LayerNorm.
- """ - - def __init__(self, - vocab_size_or_config_json_file=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act='gelu', - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - output_attentions=False, - output_hidden_states=False): - self.vocab_size = vocab_size_or_config_json_file - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - self.output_attentions = output_attentions - self.output_hidden_states = output_hidden_states diff --git a/modelscope/models/multi_modal/clip/model.py b/modelscope/models/multi_modal/clip/model.py deleted file mode 100644 index 2fb0d7e3..00000000 --- a/modelscope/models/multi_modal/clip/model.py +++ /dev/null @@ -1,677 +0,0 @@ -import os -from collections import OrderedDict -from typing import Any, Dict, Iterable, List, Tuple, Union - -import json -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from PIL import Image -from torchvision.transforms import Compose, Normalize, Resize, ToTensor - -from modelscope.metainfo import Models -from modelscope.models import TorchModel -from modelscope.models.builder import MODELS -from modelscope.models.multi_modal.clip.bert_tokenizer import FullTokenizer -from modelscope.models.multi_modal.clip.configuration_bert import BertConfig -from modelscope.models.multi_modal.clip.modeling_bert import BertModel -from modelscope.utils.constant import ModeKeys, ModelFile, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['CLIPForMultiModalEmbedding'] - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, inplanes, planes, stride=1): - super().__init__() - - # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 - self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - - self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - - self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() - - self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) - self.bn3 = nn.BatchNorm2d(planes * self.expansion) - - self.relu = nn.ReLU(inplace=True) - self.downsample = None - self.stride = stride - - if stride > 1 or inplanes != planes * Bottleneck.expansion: - # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 - self.downsample = nn.Sequential( - OrderedDict([('-1', nn.AvgPool2d(stride)), - ('0', - nn.Conv2d( - inplanes, - planes * self.expansion, - 1, - stride=1, - bias=False)), - ('1', nn.BatchNorm2d(planes * self.expansion))])) - - def forward(self, x: torch.Tensor): - identity = x - - out = self.relu(self.bn1(self.conv1(x))) - out = self.relu(self.bn2(self.conv2(out))) - out = self.avgpool(out) - out = self.bn3(self.conv3(out)) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - return out - - -class AttentionPool2d(nn.Module): - - def __init__(self, - spacial_dim: int, - embed_dim: int, - num_heads: int, - output_dim: int = None): - super().__init__() - self.positional_embedding = nn.Parameter( - torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) - self.k_proj = nn.Linear(embed_dim, embed_dim) - self.q_proj = nn.Linear(embed_dim, embed_dim) - self.v_proj = nn.Linear(embed_dim, embed_dim) - self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) - self.num_heads = num_heads - - def forward(self, x): - x = x.reshape(x.shape[0], x.shape[1], - x.shape[2] * x.shape[3]).permute(2, 0, - 1) # NCHW -> (HW)NC - x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC - x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC - x, _ = F.multi_head_attention_forward( - query=x, - key=x, - value=x, - embed_dim_to_check=x.shape[-1], - num_heads=self.num_heads, - q_proj_weight=self.q_proj.weight, - k_proj_weight=self.k_proj.weight, - v_proj_weight=self.v_proj.weight, - in_proj_weight=None, - in_proj_bias=torch.cat( - [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), - bias_k=None, - bias_v=None, - add_zero_attn=False, - dropout_p=0, - out_proj_weight=self.c_proj.weight, - out_proj_bias=self.c_proj.bias, - use_separate_proj_weight=True, - training=self.training, - need_weights=False) - - return x[0] - - -class ModifiedResNet(nn.Module): - """ - A ResNet class that is similar to torchvision's but contains the following changes: - - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
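# Shape walkthrough for the AttentionPool2d defined above: NCHW feature
# maps become a (HW+1, N, C) token sequence whose first token is the
# spatial mean; the module then attends with that token as the query and
# returns the attended first token. Dimensions here are illustrative.
import torch

n, c, h, w = 2, 64, 7, 7
x = torch.randn(n, c, h, w)

x = x.reshape(n, c, h * w).permute(2, 0, 1)         # NCHW -> (HW)NC
x = torch.cat([x.mean(dim=0, keepdim=True), x], 0)  # (HW+1)NC, mean first
print(x.shape)  # torch.Size([50, 2, 64])

pos = torch.randn(h * w + 1, c) / c ** 0.5          # learned in the module
x = x + pos[:, None, :]
pooled = x[0]                 # stand-in for the attended first token
print(pooled.shape)           # torch.Size([2, 64])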
- - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 - - The final pooling layer is a QKV attention instead of an average pool - """ - - def __init__(self, - layers, - output_dim, - heads, - input_resolution=224, - width=64): - super().__init__() - self.output_dim = output_dim - self.input_resolution = input_resolution - - # the 3-layer stem - self.conv1 = nn.Conv2d( - 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(width // 2) - self.conv2 = nn.Conv2d( - width // 2, width // 2, kernel_size=3, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(width // 2) - self.conv3 = nn.Conv2d( - width // 2, width, kernel_size=3, padding=1, bias=False) - self.bn3 = nn.BatchNorm2d(width) - self.avgpool = nn.AvgPool2d(2) - self.relu = nn.ReLU(inplace=True) - - # residual layers - self._inplanes = width # this is a *mutable* variable used during construction - self.layer1 = self._make_layer(width, layers[0]) - self.layer2 = self._make_layer(width * 2, layers[1], stride=2) - self.layer3 = self._make_layer(width * 4, layers[2], stride=2) - self.layer4 = self._make_layer(width * 8, layers[3], stride=2) - - embed_dim = width * 32 # the ResNet feature dimension - self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, - heads, output_dim) - - def _make_layer(self, planes, blocks, stride=1): - layers = [Bottleneck(self._inplanes, planes, stride)] - - self._inplanes = planes * Bottleneck.expansion - for _ in range(1, blocks): - layers.append(Bottleneck(self._inplanes, planes)) - - return nn.Sequential(*layers) - - def forward(self, x): - - def stem(x): - for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), - (self.conv3, self.bn3)]: - x = self.relu(bn(conv(x))) - x = self.avgpool(x) - return x - - x = x.type(self.conv1.weight.dtype) - x = stem(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - x = self.attnpool(x) - - return x - - -class LayerNorm(nn.LayerNorm): - """Subclass torch's LayerNorm to handle fp16.""" - - def forward(self, x: torch.Tensor): - orig_type = x.dtype - ret = super().forward(x.type(torch.float32)) - return ret.type(orig_type) - - -class QuickGELU(nn.Module): - - def forward(self, x: torch.Tensor): - return x * torch.sigmoid(1.702 * x) - - -class ResidualAttentionBlock(nn.Module): - - def __init__(self, - d_model: int, - n_head: int, - attn_mask: torch.Tensor = None): - super().__init__() - - self.attn = nn.MultiheadAttention(d_model, n_head) - self.ln_1 = LayerNorm(d_model) - self.mlp = nn.Sequential( - OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), - ('gelu', QuickGELU()), - ('c_proj', nn.Linear(d_model * 4, d_model))])) - self.ln_2 = LayerNorm(d_model) - self.attn_mask = attn_mask - - def attention(self, x: torch.Tensor): - self.attn_mask = self.attn_mask.to( - dtype=x.dtype, - device=x.device) if self.attn_mask is not None else None - return self.attn( - x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] - - def forward(self, x: torch.Tensor): - x = x + self.attention(self.ln_1(x)) - x = x + self.mlp(self.ln_2(x)) - return x - - -class Transformer(nn.Module): - - def __init__(self, - width: int, - layers: int, - heads: int, - attn_mask: torch.Tensor = None): - super().__init__() - self.width = width - self.layers = layers - self.resblocks = nn.Sequential(*[ - ResidualAttentionBlock(width, heads, attn_mask) - for _ in range(layers) - ]) - - def forward(self, x: torch.Tensor): - return self.resblocks(x) - - -class 
VisualTransformer(nn.Module): - - def __init__(self, input_resolution: int, patch_size: int, width: int, - layers: int, heads: int, output_dim: int): - super().__init__() - self.input_resolution = input_resolution - self.output_dim = output_dim - self.conv1 = nn.Conv2d( - in_channels=3, - out_channels=width, - kernel_size=patch_size, - stride=patch_size, - bias=False) - - scale = width**-0.5 - self.class_embedding = nn.Parameter(scale * torch.randn(width)) - self.positional_embedding = nn.Parameter(scale * torch.randn( - (input_resolution // patch_size)**2 + 1, width)) - self.ln_pre = LayerNorm(width) - - self.transformer = Transformer(width, layers, heads) - - self.ln_post = LayerNorm(width) - self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) - - def forward(self, x: torch.Tensor): - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], - -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - x = torch.cat( - [ # noqa - self.class_embedding.to(x.dtype) + torch.zeros( # noqa - x.shape[0], - 1, - x.shape[-1], - dtype=x.dtype, - device=x.device), - x # noqa - ], - dim=1) # noqa shape = [*, grid ** 2 + 1, width] - x = x + self.positional_embedding.to(x.dtype) - x = self.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x) - x = x.permute(1, 0, 2) # LND -> NLD - - x = self.ln_post(x[:, 0, :]) - - if self.proj is not None: - x = x @ self.proj - - return x - - -class CLIP(nn.Module): - - def __init__( - self, - embed_dim: int, - # vision - image_resolution: int, - vision_layers: Union[Tuple[int, int, int, int], int], - vision_width: int, - vision_patch_size: int, - # text - vocab_size: int, - text_attention_probs_dropout_prob: float, - text_hidden_act: str, - text_hidden_dropout_prob: float, - text_hidden_size: int, - text_initializer_range: float, - text_intermediate_size: int, - text_max_position_embeddings: int, - text_num_attention_heads: int, - text_num_hidden_layers: int, - text_type_vocab_size: int, - tokenizer: FullTokenizer, - ): - super().__init__() - - if isinstance(vision_layers, (tuple, list)): - vision_heads = vision_width * 32 // 64 - self.visual = ModifiedResNet( - layers=vision_layers, - output_dim=embed_dim, - heads=vision_heads, - input_resolution=image_resolution, - width=vision_width) - else: - vision_heads = vision_width // 64 - self.visual = VisualTransformer( - input_resolution=image_resolution, - patch_size=vision_patch_size, - width=vision_width, - layers=vision_layers, - heads=vision_heads, - output_dim=embed_dim) - - self.bert_config = BertConfig( - vocab_size_or_config_json_file=vocab_size, - hidden_size=text_hidden_size, - num_hidden_layers=text_num_hidden_layers, - num_attention_heads=text_num_attention_heads, - intermediate_size=text_intermediate_size, - hidden_act=text_hidden_act, - hidden_dropout_prob=text_hidden_dropout_prob, - attention_probs_dropout_prob=text_attention_probs_dropout_prob, - max_position_embeddings=text_max_position_embeddings, - type_vocab_size=text_type_vocab_size, - initializer_range=text_initializer_range, - layer_norm_eps=1e-12, - ) - self.bert = BertModel(self.bert_config) - - self.text_projection = nn.Parameter( - torch.empty(text_hidden_size, embed_dim)) - self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) - - self.tokenizer = tokenizer - - self.initialize_parameters() - - def initialize_parameters(self): - self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) - - if isinstance(self.visual, 
ModifiedResNet): - if self.visual.attnpool is not None: - std = self.visual.attnpool.c_proj.in_features**-0.5 - nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) - nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) - nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) - nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) - - for resnet_block in [ - self.visual.layer1, self.visual.layer2, self.visual.layer3, - self.visual.layer4 - ]: - for name, param in resnet_block.named_parameters(): - if name.endswith('bn3.weight'): - nn.init.zeros_(param) - - if self.text_projection is not None: - nn.init.normal_( - self.text_projection, std=self.bert_config.hidden_size**-0.5) - - @property - def dtype(self): - return self.visual.conv1.weight.dtype - - def encode_image(self, image): - return self.visual(image.type(self.dtype)) - - def encode_text(self, text): - pad_index = self.tokenizer.vocab['[PAD]'] - attn_mask = text.ne(pad_index).type(self.dtype) - x = self.bert( - text, attention_mask=attn_mask)[0].type( - self.dtype) # [batch_size, seq_length, hidden_size] - return x[:, 0, :] @ self.text_projection - - def forward(self, image, text): - assert image is not None or text is not None, 'text and image cannot both be None!' - - if image is None: - return self.encode_text(text) - elif text is None: - return self.encode_image(image) - image_features = self.encode_image(image) - text_features = self.encode_text(text) - - image_features = image_features / image_features.norm( - dim=-1, keepdim=True) - text_features = text_features / text_features.norm( - dim=-1, keepdim=True) - - return image_features, text_features, self.logit_scale.exp() - - def get_similarity(self, image, text): - image_features = self.encode_image(image) - text_features = self.encode_text(text) - - # normalized features - image_features = image_features / image_features.norm( - dim=1, keepdim=True) - text_features = text_features / text_features.norm(dim=1, keepdim=True) - - # cosine similarity as logits - logit_scale = self.logit_scale.exp() - logits_per_image = logit_scale * image_features @ text_features.t() - logits_per_text = logits_per_image.t() - - # shape = [global_batch_size, global_batch_size] - return logits_per_image, logits_per_text - - -def convert_models_to_fp32(model): - for p in model.parameters(): - p.data = p.data.float() - if p.grad: - p.grad.data = p.grad.data.float() - - -def convert_weights(model: nn.Module): - """Convert applicable model parameters to fp16""" - - def _convert_weights_to_fp16(module): - if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)): - module.weight.data = module.weight.data.half() - if module.bias is not None: - module.bias.data = module.bias.data.half() - - if isinstance(module, nn.MultiheadAttention): - for attr in [ - *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], - 'in_proj_bias', 'bias_k', 'bias_v' - ]: - tensor = getattr(module, attr) - if tensor is not None: - tensor.data = tensor.data.half() - - if isinstance(module, BertModel): - module.to(torch.half) - - for name in ['text_projection', 'proj']: - if hasattr(module, name): - attr = getattr(module, name) - if attr is not None: - attr.data = attr.data.half() - - model.apply(_convert_weights_to_fp16) - - -def _convert_to_rgb(image): - return image.convert('RGB') - - -def image_transform(image_size=224): - transform = Compose([ - _convert_to_rgb, - Resize((image_size, image_size)), - ToTensor(), - Normalize((0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711)), 
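# Numeric sketch of the similarity computation in CLIP.get_similarity
# above: L2-normalize both feature batches, then scale the cosine
# similarities by the learned temperature logit_scale.exp(). Random
# features stand in for real encoder outputs.
import numpy as np
import torch

image_features = torch.randn(4, 512)
text_features = torch.randn(4, 512)

image_features = image_features / image_features.norm(dim=1, keepdim=True)
text_features = text_features / text_features.norm(dim=1, keepdim=True)

logit_scale = torch.ones([]) * np.log(1 / 0.07)  # initialization used above
logits_per_image = logit_scale.exp() * image_features @ text_features.t()
print(logits_per_image.shape)  # torch.Size([4, 4]); diagonal = matched pairs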
- ]) - return transform - - -@MODELS.register_module(Tasks.multi_modal_embedding, module_name=Models.clip) -class CLIPForMultiModalEmbedding(TorchModel): - - def __init__(self, model_dir, device_id=-1): - super().__init__(model_dir=model_dir, device_id=device_id) - - # Initialize the model. - vision_model_config_file = '{}/vision_model_config.json'.format( - model_dir) - logger.info( - f'Loading vision model config from {vision_model_config_file}') - assert os.path.exists(vision_model_config_file) - - text_model_config_file = '{}/text_model_config.json'.format(model_dir) - logger.info(f'Loading text model config from {text_model_config_file}') - assert os.path.exists(text_model_config_file) - - with open(vision_model_config_file, - 'r') as fv, open(text_model_config_file, 'r') as ft: - model_info = json.load(fv) - for k, v in json.load(ft).items(): - model_info[k] = v - - # image preprocess - self.img_preprocess = image_transform(model_info['image_resolution']) - - # text tokenizer - vocab_file = f'{model_dir}/{ModelFile.VOCAB_FILE}' - self.tokenizer = FullTokenizer(vocab_file=vocab_file) - - # initialize the model - self.clip_model = CLIP(**model_info, tokenizer=self.tokenizer) - convert_weights(self.clip_model) - - # restore the pretrained weight - checkpoint = torch.load( - f'{model_dir}/{ModelFile.TORCH_MODEL_BIN_FILE}', 'cpu') - sd = checkpoint['state_dict'] - if next(iter(sd.items()))[0].startswith('module'): - sd = {k[len('module.'):]: v for k, v in sd.items()} - self.clip_model.load_state_dict(sd) - self.clip_model.eval() - - # place the model - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' - if self.device == 'cuda': - self.clip_model.to(self.device) - logger.info('Use GPU for inference') - else: - self.clip_model.float() - logger.info('Use CPU for inference') - - def tokenize(self, - texts: Union[str, List[str]], - context_length: int = 52) -> torch.LongTensor: - """ - Returns the tokenized representation of given input string(s) - Parameters - ---------- - texts : Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all baseline models use 24 as the context length - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] - """ - if isinstance(texts, str): - texts = [texts] - - all_tokens = [] - for text in texts: - all_tokens.append( - [self.tokenizer.vocab['[CLS]']] - + self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(text))[:context_length - 2] - + [self.tokenizer.vocab['[SEP]']]) - - result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) - - for i, tokens in enumerate(all_tokens): - assert len(tokens) <= context_length - result[i, :len(tokens)] = torch.tensor(tokens) - - return result - - def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: - from modelscope.outputs import OutputKeys - output = { - OutputKeys.IMG_EMBEDDING: None, - OutputKeys.TEXT_EMBEDDING: None - } - if 'img' in input and input['img'] is not None: - image_input = input['img'] - - # single image input - if isinstance(image_input, Image.Image): - image_tensor = self.img_preprocess(image_input).unsqueeze(0) - # multi images input - elif isinstance(image_input, list): - if all([isinstance(elem, Image.Image) - for elem in image_input]): - image_tensor = torch.stack( - [self.img_preprocess(elem) for elem in image_input], - dim=0) - else: - unsupported_elem_type = [ - type(elem) for elem in image_input - if not 
isinstance(elem, Image.Image) - ][0] - raise TypeError( - f'img should be PIL.Image or List[PIL.Image], \ - but got a List containing one {unsupported_elem_type}' - ) - # others - else: - raise TypeError( - f'img should be PIL.Image or List[PIL.Image], but got {type(image_input)}' - ) - - image_tensor = image_tensor.to(self.device) - - with torch.no_grad(): - image_features = self.clip_model.encode_image(image_tensor) - image_features /= image_features.norm( - dim=-1, keepdim=True) # l2-normalize - - output[OutputKeys.IMG_EMBEDDING] = image_features - - if 'text' in input and input['text'] is not None: - text_input = input['text'] - - # single text input - if isinstance(text_input, str): - text_tensor = self.tokenize(text_input) - # multi texts input - elif isinstance(text_input, list): - if all([isinstance(elem, str) for elem in text_input]): - text_tensor = self.tokenize(text_input) - else: - unsupported_elem_type = [ - type(elem) for elem in text_input - if not isinstance(elem, str) - ][0] - raise TypeError( - f'text should be str or List[str], but got a List containing one {unsupported_elem_type}' - ) - # others - else: - raise TypeError( - f'text should be str or List[str], but got {type(text_input)}' - ) - - text_tensor = text_tensor.to(self.device) - - with torch.no_grad(): - text_features = self.clip_model.encode_text(text_tensor) - text_features /= text_features.norm( - dim=-1, keepdim=True) # l2-normalize - output[OutputKeys.TEXT_EMBEDDING] = text_features - - return output - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - return inputs - - @property - def temperature(self): - return 1.0 / self.clip_model.logit_scale.exp() diff --git a/modelscope/models/multi_modal/clip/modeling_bert.py b/modelscope/models/multi_modal/clip/modeling_bert.py deleted file mode 100644 index b5f104ce..00000000 --- a/modelscope/models/multi_modal/clip/modeling_bert.py +++ /dev/null @@ -1,507 +0,0 @@ -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. """ - -from __future__ import (absolute_import, division, print_function, - unicode_literals) -import logging -import math -import os -import sys -from io import open - -import json -import torch -from torch import nn - -from .configuration_bert import BertConfig - -logger = logging.getLogger(__name__) - - -def gelu(x): - """ Original Implementation of the gelu activation function in Google Bert repo when initially created. - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): - 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - Also see https://arxiv.org/abs/1606.08415 - """ - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) - - -def gelu_new(x): - """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
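# The two GELU variants defined here agree closely: gelu() is the exact
# erf form, gelu_new() the tanh approximation used by OpenAI GPT. A
# quick self-contained check of the gap between them:
import math

import torch

def gelu_exact(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_tanh(x):
    return 0.5 * x * (1 + torch.tanh(
        math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

x = torch.linspace(-4, 4, steps=9)
print((gelu_exact(x) - gelu_tanh(x)).abs().max())  # ~1e-3 or smaller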
- Also see https://arxiv.org/abs/1606.08415 - """ - return 0.5 * x * (1 + torch.tanh( - math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - - -def swish(x): - return x * torch.sigmoid(x) - - -ACT2FN = { - 'gelu': gelu, - 'relu': torch.nn.functional.relu, - 'swish': swish, - 'gelu_new': gelu_new -} - -BertLayerNorm = torch.nn.LayerNorm - - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings. - """ - - def __init__(self, config): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, config.hidden_size, padding_idx=0) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None, position_ids=None): - seq_length = input_ids.size(1) - if position_ids is None: - position_ids = torch.arange( - seq_length, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) - - words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = words_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - - def __init__(self, config): - super(BertSelfAttention, self).__init__() - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - 'The hidden size (%d) is not a multiple of the number of attention ' - 'heads (%d)' % - (config.hidden_size, config.num_attention_heads)) - self.output_attentions = config.output_attentions - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size - / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward(self, hidden_states, attention_mask=None, head_mask=None): - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - - query_layer = self.transpose_for_scores(mixed_query_layer) - key_layer = self.transpose_for_scores(mixed_key_layer) - value_layer = self.transpose_for_scores(mixed_value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. 
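# Shape sketch for transpose_for_scores above: [batch, seq, hidden] is
# split into [batch, heads, seq, head_size] so that each head attends
# independently; the raw scores below mirror the matmul that follows.
# Toy sizes, illustrative only.
import torch

batch, seq, heads, head_size = 2, 5, 4, 8
hidden = heads * head_size                        # 32

x = torch.randn(batch, seq, hidden)
x = x.view(batch, seq, heads, head_size).permute(0, 2, 1, 3)
print(x.shape)                                    # torch.Size([2, 4, 5, 8])
scores = x @ x.transpose(-1, -2) / head_size ** 0.5
print(scores.shape)                               # torch.Size([2, 4, 5, 5])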
- attention_scores = torch.matmul(query_layer, - key_layer.transpose(-1, -2)) - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + ( - self.all_head_size, ) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, - attention_probs) if self.output_attentions else ( - context_layer, ) - return outputs - - -class BertSelfOutput(nn.Module): - - def __init__(self, config): - super(BertSelfOutput, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - - def __init__(self, config): - super(BertAttention, self).__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - self.pruned_heads = set() - - def forward(self, input_tensor, attention_mask=None, head_mask=None): - self_outputs = self.self(input_tensor, attention_mask, head_mask) - attention_output = self.output(self_outputs[0], input_tensor) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them - return outputs - - -class BertIntermediate(nn.Module): - - def __init__(self, config): - super(BertIntermediate, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, - str) or (sys.version_info[0] == 2 - and isinstance(config.hidden_act, unicode)): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - - def __init__(self, config): - super(BertOutput, self).__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - - def __init__(self, config): - super(BertLayer, self).__init__() - self.attention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = 
BertOutput(config) - - def forward(self, hidden_states, attention_mask=None, head_mask=None): - attention_outputs = self.attention(hidden_states, attention_mask, - head_mask) - attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output, ) + attention_outputs[ - 1:] # add attentions if we output them - return outputs - - -class BertEncoder(nn.Module): - - def __init__(self, config): - super(BertEncoder, self).__init__() - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.layer = nn.ModuleList( - [BertLayer(config) for _ in range(config.num_hidden_layers)]) - - def forward(self, hidden_states, attention_mask=None, head_mask=None): - all_hidden_states = () - all_attentions = () - for i, layer_module in enumerate(self.layer): - if self.output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - layer_outputs = layer_module(hidden_states, attention_mask, - head_mask[i]) - hidden_states = layer_outputs[0] - - if self.output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) - - # Add last layer - if self.output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) - - outputs = (hidden_states, ) - if self.output_hidden_states: - outputs = outputs + (all_hidden_states, ) - if self.output_attentions: - outputs = outputs + (all_attentions, ) - return outputs # last-layer hidden state, (all hidden states), (all attentions) - - -class BertPooler(nn.Module): - - def __init__(self, config): - super(BertPooler, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - - def __init__(self, config): - super(BertPredictionHeadTransform, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, - str) or (sys.version_info[0] == 2 - and isinstance(config.hidden_act, unicode)): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = BertLayerNorm( - config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - - def __init__(self, config): - super(BertLMPredictionHead, self).__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. 
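# Sketch of the prediction-head wiring described in the comment above:
# the decoder projects hidden states back to vocabulary size without its
# own bias, and a separate per-token bias parameter is added. Tying
# decoder.weight to the input embedding matrix is the caller's job; the
# sizes here are illustrative.
import torch
import torch.nn as nn

hidden_size, vocab_size = 32, 100
embeddings = nn.Embedding(vocab_size, hidden_size)

decoder = nn.Linear(hidden_size, vocab_size, bias=False)
decoder.weight = embeddings.weight               # weight tying
bias = nn.Parameter(torch.zeros(vocab_size))

hidden_states = torch.randn(2, 5, hidden_size)
scores = decoder(hidden_states) + bias
print(scores.shape)                              # torch.Size([2, 5, 100])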
- self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) + self.bias - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - - def __init__(self, config): - super(BertOnlyMLMHead, self).__init__() - self.predictions = BertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - - def __init__(self, config): - super(BertOnlyNSPHead, self).__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - - def __init__(self, config): - super(BertPreTrainingHeads, self).__init__() - self.predictions = BertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(nn.Module): - config_class = BertConfig - base_model_prefix = 'bert' - - def __init__(self, config): - super(BertPreTrainedModel, self).__init__() - self.config = config - - def _init_weights(self, module): - """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_( - mean=0.0, std=self.config.initializer_range) - elif isinstance(module, BertLayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -class BertModel(BertPreTrainedModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` - Sequence of hidden-states at the output of the last layer of the model. - **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Bert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
-
-
-class BertModel(BertPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the output of the last layer of the model.
-        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during BERT pretraining. This output is usually *not* a good summary
-            of the semantic content of the input; you're often better off averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer)
-            of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attention weights after the attention softmax,
-            used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-    """
-
-    def __init__(self, config):
-        super(BertModel, self).__init__(config)
-
-        self.embeddings = BertEmbeddings(config)
-        self.encoder = BertEncoder(config)
-        self.pooler = BertPooler(config)
-
-        self.apply(self._init_weights)
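# self.apply(self._init_weights) above visits every submodule recursively
# (children first, then the module itself) and calls the given function on
# each one. A toy demonstration of the same mechanism:
import torch.nn as nn

def report(module):
    print(type(module).__name__)

toy = nn.Sequential(nn.Linear(4, 4), nn.Tanh())
toy.apply(report)  # prints: Linear, Tanh, Sequential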
-
-    def forward(self,
-                input_ids,
-                attention_mask=None,
-                token_type_ids=None,
-                position_ids=None,
-                head_mask=None):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(input_ids)
-
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # This attention mask is simpler than the triangular masking of causal attention
-        # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(
-            dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicates we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
-                    -1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
-                                             -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(
-                    -1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters(
-            )).dtype)  # switch to float if needed; fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        embedding_output = self.embeddings(
-            input_ids,
-            position_ids=position_ids,
-            token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder(
-            embedding_output, extended_attention_mask, head_mask=head_mask)
-        sequence_output = encoder_outputs[0]
-        pooled_output = self.pooler(sequence_output)
-
-        outputs = (
-            sequence_output,
-            pooled_output,
-        ) + encoder_outputs[
-            1:]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
diff --git a/modelscope/models/multi_modal/mplug/clip/clip.py b/modelscope/models/multi_modal/mplug/clip/clip.py
index aa56e39b..fbdfbd29 100644
--- a/modelscope/models/multi_modal/mplug/clip/clip.py
+++ b/modelscope/models/multi_modal/mplug/clip/clip.py
@@ -5,69 +5,9 @@ from typing import Tuple, Union
 
 import torch
 import torch.nn.functional as F
-import torch.utils.checkpoint as checkpoint
 from torch import nn
 
-
-class QuickGELU(nn.Module):
-
-    def forward(self, x: torch.Tensor):
-        return x * torch.sigmoid(1.702 * x)
-
-
-class ResidualAttentionBlock(nn.Module):
-
-    def __init__(self,
-                 d_model: int,
-                 n_head: int,
-                 attn_mask: torch.Tensor = None):
-        super().__init__()
-        self.attn = nn.MultiheadAttention(d_model, n_head)
-        self.ln_1 = LayerNorm(d_model)
-        self.mlp = nn.Sequential(
-            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)),
-                         ('gelu', QuickGELU()),
-                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
-        self.ln_2 = LayerNorm(d_model)
-        self.attn_mask = attn_mask
-
-    def attention(self, x: torch.Tensor):
-        self.attn_mask = self.attn_mask.to(
-            dtype=x.dtype,
-            device=x.device) if self.attn_mask is not None else None
-        return self.attn(
-            x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.attention(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-
-class Transformer(nn.Module):
-
-    def __init__(self,
-                 width: int,
-                 layers: int,
-                 heads: int,
-                 attn_mask: torch.Tensor = None,
-                 use_grad_ckp: bool = True):
-        super().__init__()
-        self.width = width
-        self.layers = layers
-        self.resblocks = nn.Sequential(*[
-            ResidualAttentionBlock(width, heads, attn_mask)
-            for _ in range(layers)
-        ])
-        self.use_grad_ckp = use_grad_ckp
-
-    def forward(self, x: torch.Tensor):
-        if self.use_grad_ckp:
-            for each_block in self.resblocks:
-                x = checkpoint.checkpoint(each_block, x)
-            return x
-        else:
-            return self.resblocks(x)
+from modelscope.models.multi_modal.clip.clip_vit import Transformer
 
 
 class Bottleneck(nn.Module):
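# The Transformer deleted above wraps each residual block in
# torch.utils.checkpoint, trading compute for memory: activations inside a
# block are recomputed during backward instead of being stored. A minimal
# sketch of the same pattern on toy modules:
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint

blocks = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])
x = torch.randn(2, 8, requires_grad=True)  # input must require grad here
for block in blocks:
    x = checkpoint.checkpoint(block, x)  # block(x) is recomputed in backward
x.sum().backward()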
diff --git a/tests/pipelines/test_multi_modal_embedding.py b/tests/pipelines/test_multi_modal_embedding.py
index 6152f279..3bf3af87 100644
--- a/tests/pipelines/test_multi_modal_embedding.py
+++ b/tests/pipelines/test_multi_modal_embedding.py
@@ -2,58 +2,50 @@
 
 import unittest
 
-import torch
+import numpy as np
 
 from modelscope.models import Model
-from modelscope.outputs import OutputKeys
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.test_utils import test_level
 
 
 class MultiModalEmbeddingTest(unittest.TestCase):
-    model_id = 'damo/multi-modal_clip-vit-base-patch16_zh'
-    test_input = {'text': '皮卡丘'}
-    model_version = 'dev'
+    model_id = 'damo/multi-modal_clip-vit-large-patch14_zh'
+    test_text = {'text': '一张风景图'}
 
-    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run(self):
-        pipeline_multi_modal_embedding = pipeline(
-            Tasks.multi_modal_embedding,
-            model=self.model_id,
-            model_revision=self.model_version)
-        text_embedding = pipeline_multi_modal_embedding(
-            self.test_input)[OutputKeys.TEXT_EMBEDDING]
-        print('l1-norm: {}'.format(
-            torch.norm(text_embedding, p=1, dim=-1).item()))
-        print('l2-norm: {}'.format(torch.norm(text_embedding,
-                                              dim=-1).item()))  # should be 1.0
+        pipe_line_multi_modal_embedding = pipeline(
+            Tasks.multi_modal_embedding, model=self.model_id)
+        test_str_embedding = pipe_line_multi_modal_embedding(
+            self.test_text)['text_embedding']
+        print(np.sum(np.abs(test_str_embedding)))
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_model_from_modelhub(self):
         model = Model.from_pretrained(self.model_id)
-        pipeline_multi_modal_embedding = pipeline(
-            task=Tasks.multi_modal_embedding,
-            model=model,
-            model_revision=self.model_version)
-        text_embedding = pipeline_multi_modal_embedding(
-            self.test_input)[OutputKeys.TEXT_EMBEDDING]
-        print('l1-norm: {}'.format(
-            torch.norm(text_embedding, p=1, dim=-1).item()))
-        print('l2-norm: {}'.format(torch.norm(text_embedding,
-                                              dim=-1).item()))  # should be 1.0
+        pipe_line_multi_modal_embedding = pipeline(
+            task=Tasks.multi_modal_embedding, model=model)
+        test_str_embedding = pipe_line_multi_modal_embedding(
+            self.test_text)['text_embedding']
+        print(np.sum(np.abs(test_str_embedding)))
 
-    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
+    def test_run_with_model_name(self):
+        pipe_line_multi_modal_embedding = pipeline(
+            task=Tasks.multi_modal_embedding, model=self.model_id)
+        test_str_embedding = pipe_line_multi_modal_embedding(
+            self.test_text)['text_embedding']
+        print(np.sum(np.abs(test_str_embedding)))
+
+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_with_default_model(self):
-        pipeline_multi_modal_embedding = pipeline(
-            task=Tasks.multi_modal_embedding,
-            model_revision=self.model_version)
-        text_embedding = pipeline_multi_modal_embedding(
-            self.test_input)[OutputKeys.TEXT_EMBEDDING]
-        print('l1-norm: {}'.format(
-            torch.norm(text_embedding, p=1, dim=-1).item()))
-        print('l2-norm: {}'.format(torch.norm(text_embedding,
-                                              dim=-1).item()))  # should be 1.0
+        pipe_line_multi_modal_embedding = pipeline(
+            task=Tasks.multi_modal_embedding)
+        test_str_embedding = pipe_line_multi_modal_embedding(
+            self.test_text)['text_embedding']
+        print(np.sum(np.abs(test_str_embedding)))
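# The updated tests above only print the l1 sum of an embedding. A natural
# follow-up (a sketch, assuming the returned 'text_embedding' converts to a
# numpy array; the second query string is made up for illustration) is to
# compare two texts by cosine similarity of their embeddings:
import numpy as np
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

pipe = pipeline(
    task=Tasks.multi_modal_embedding,
    model='damo/multi-modal_clip-vit-large-patch14_zh')
emb_a = np.asarray(pipe({'text': '一张风景图'})['text_embedding']).reshape(-1)
emb_b = np.asarray(pipe({'text': '一只可爱的猫'})['text_embedding']).reshape(-1)
cos = emb_a @ emb_b / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b) + 1e-8)
print('cosine similarity:', float(cos))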
 
 
 if __name__ == '__main__':
diff --git a/tests/trainers/test_clip_multi_modal_embedding_trainer.py b/tests/trainers/test_clip_multi_modal_embedding_trainer.py
new file mode 100644
index 00000000..03f82854
--- /dev/null
+++ b/tests/trainers/test_clip_multi_modal_embedding_trainer.py
@@ -0,0 +1,60 @@
+import os
+import tempfile
+import unittest
+
+import requests
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+from modelscope.hub.snapshot_download import snapshot_download
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.utils.constant import ModelFile
+from modelscope.utils.logger import get_logger
+from modelscope.utils.test_utils import test_level
+
+logger = get_logger()
+
+
+def clip_train_worker(local_rank, ngpus, node_size, node_rank):
+    global_rank = local_rank + node_rank * ngpus
+    dist_world_size = node_size * ngpus
+
+    dist.init_process_group(
+        backend='nccl', world_size=dist_world_size, rank=global_rank)
+
+    model_id = 'damo/multi-modal_clip-vit-large-patch14_zh'
+    local_model_dir = snapshot_download(model_id)
+
+    default_args = dict(
+        cfg_file='{}/{}'.format(local_model_dir, ModelFile.CONFIGURATION),
+        model=model_id,
+        device_id=local_rank)
+    trainer = build_trainer(
+        name=Trainers.clip_multi_modal_embedding, default_args=default_args)
+
+    trainer.train()
+    trainer.evaluate()
+
+
+class CLIPMultiModalEmbeddingTrainerTest(unittest.TestCase):
+
+    @unittest.skipUnless(test_level() >= 1, 'skip test in current test level')
+    def test_trainer(self):
+        os.environ['MASTER_ADDR'] = '127.0.0.1'
+        os.environ['MASTER_PORT'] = '2001'
+        NODE_SIZE, NODE_RANK = 1, 0
+        logger.info('Train clip with {} machines'.format(NODE_SIZE))
+        ngpus = torch.cuda.device_count()
+        logger.info('Machine: {} has {} GPUs'.format(NODE_RANK, ngpus))
+        mp.spawn(
+            clip_train_worker,
+            nprocs=ngpus,
+            args=(ngpus, NODE_SIZE, NODE_RANK))
+        logger.info('Training done')
+
+
+if __name__ == '__main__':
+    unittest.main()
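# clip_train_worker in the new trainer test follows the standard single-node
# torch.distributed recipe: one process per GPU, spawned with mp.spawn, each
# joining the same process group. The same skeleton stripped to its
# distributed plumbing (a sketch; the port and the worker body are
# placeholders, not part of the patch):
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(local_rank, ngpus):
    dist.init_process_group(backend='nccl', world_size=ngpus, rank=local_rank)
    torch.cuda.set_device(local_rank)
    # ... build the model/trainer bound to this GPU and run training here ...
    dist.destroy_process_group()


if __name__ == '__main__':
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    ngpus = torch.cuda.device_count()
    mp.spawn(worker, nprocs=ngpus, args=(ngpus, ))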