From db0f25a5947c49b62cac7b99309a18540be4b929 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Tue, 22 Nov 2022 10:10:34 +0800 Subject: [PATCH 1/6] init --- modelscope/metainfo.py | 3 + modelscope/models/nlp/__init__.py | 2 + modelscope/models/nlp/codegeex/__init__.py | 22 + modelscope/models/nlp/codegeex/codegeex.py | 1030 +++++++++++++++++ .../codegeex/codegeex_for_code_translation.py | 126 ++ modelscope/models/nlp/codegeex/inference.py | 335 ++++++ modelscope/models/nlp/codegeex/tokenizer.py | 186 +++ modelscope/pipelines/nlp/__init__.py | 3 + .../nlp/codegeex_code_translation_pipeline.py | 44 + modelscope/preprocessors/__init__.py | 4 +- modelscope/preprocessors/nlp/__init__.py | 2 + .../nlp/codegeex_preprocessor.py | 25 + modelscope/utils/constant.py | 1 + .../test_CodeGeeX_code_translation.py | 38 + 14 files changed, 1819 insertions(+), 2 deletions(-) create mode 100755 modelscope/models/nlp/codegeex/__init__.py create mode 100755 modelscope/models/nlp/codegeex/codegeex.py create mode 100755 modelscope/models/nlp/codegeex/codegeex_for_code_translation.py create mode 100755 modelscope/models/nlp/codegeex/inference.py create mode 100755 modelscope/models/nlp/codegeex/tokenizer.py create mode 100755 modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py create mode 100755 modelscope/preprocessors/nlp/codegeex_preprocessor.py create mode 100644 tests/pipelines/test_CodeGeeX_code_translation.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index ccd36349..99f4a047 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -84,6 +84,7 @@ class Models(object): ponet = 'ponet' T5 = 'T5' mglm = 'mglm' + codegeex = 'codegeex' bloom = 'bloom' # audio models @@ -255,6 +256,7 @@ class Pipelines(object): document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' mglm_text_summarization = 'mglm-text-summarization' + codegeex_code_translation = 'codegeex-code-translation' translation_en_to_de = 'translation_en_to_de' # keep it underscore translation_en_to_ro = 'translation_en_to_ro' # keep it underscore translation_en_to_fr = 'translation_en_to_fr' # keep it underscore @@ -382,6 +384,7 @@ class Preprocessors(object): document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' mglm_summarization = 'mglm-summarization' + codegeex = 'codegeex' sentence_piece = 'sentence-piece' # audio preprocessor diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 1d71469a..3f9d224c 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -36,6 +36,7 @@ if TYPE_CHECKING: ) from .T5 import T5ForConditionalGeneration from .mglm import MGLMForTextSummarization + from .codegeex import CodeGeeXForCodeTranslation from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -108,6 +109,7 @@ else: 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], 'mglm': ['MGLMForTextSummarization'], + 'codegeex': ['CodeGeeXForCodeTranslation'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/codegeex/__init__.py b/modelscope/models/nlp/codegeex/__init__.py new file mode 100755 index 00000000..6ee72f80 --- /dev/null +++ b/modelscope/models/nlp/codegeex/__init__.py @@ -0,0 +1,22 @@ +# Modified by Zhipu.AI +# Original Copyright (c) Alibaba, Inc. and its affiliates. 
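+# The names below are registered lazily via LazyImportModule, so importing the
+# nlp package does not pull in the CodeGeeX implementation until
+# CodeGeeXForCodeTranslation is actually accessed.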
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .codegeex_for_code_translation import CodeGeeXForCodeTranslation +else: + _import_structure = { + 'codegeex_for_code_translation': ['CodeGeeXForCodeTranslation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/nlp/codegeex/codegeex.py b/modelscope/models/nlp/codegeex/codegeex.py new file mode 100755 index 00000000..7a1b76a3 --- /dev/null +++ b/modelscope/models/nlp/codegeex/codegeex.py @@ -0,0 +1,1030 @@ +import math + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter + + +def fast_gelu(x): + """Mindspore's fast gelu implementation.""" + return x / (1 + torch.exp(-1.702 * torch.abs(x))) * torch.exp( + 0.851 * (x - torch.abs(x))) + + +class MLP(torch.nn.Module): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. At the end, dropout is also + applied. + """ + + def __init__( + self, + hidden_size, + ): + super(MLP, self).__init__() + self.hidden_size = hidden_size + # Project to 4h. + self.dense_h_to_4h = torch.nn.Linear( + self.hidden_size, + 4 * self.hidden_size, + ) + + self.activation_func = fast_gelu + + # Project back to h. + self.dense_4h_to_h = torch.nn.Linear( + 4 * self.hidden_size, + self.hidden_size, + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation_func(intermediate_parallel) + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + + return output + + +class SelfAttention(torch.nn.Module): + """self-attention layer abstract class. + + Self-attention layer takes input with size [b, s, h] + and returns output of the same size. 
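+
+    Shape comments in forward() use Megatron-style notation: sq/sk are the
+    query/key sequence lengths, b the batch size, np the number of attention
+    heads and hn the per-head hidden size (h = np * hn).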
+ """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + fp16=True, + attention_softmax_in_fp32=True, + ): + super(SelfAttention, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.fp16 = fp16 + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.layer_number = max(1, layer_number) + + assert self.hidden_size % self.num_attention_heads == 0 + self.hidden_size_per_attention_head = int(self.hidden_size + // self.num_attention_heads) + + self.query = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.key = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.value = torch.nn.Linear(self.hidden_size, self.hidden_size) + + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.softmax = torch.nn.Softmax(dim=-1) + + self.dense = torch.nn.Linear(self.hidden_size, self.hidden_size) + + def forward( + self, + hidden_states, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # hidden_states: [sq, b, h] + + # ===================== + # Query, Key, and Value + # ===================== + + query_layer = self.query(hidden_states) + key_layer = self.key(hidden_states) + value_layer = self.value(hidden_states) + + new_query_layer_shape = query_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + query_layer = query_layer.view(*new_query_layer_shape) + + new_query_layer_shape = key_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head) + key_layer = key_layer.view(*new_query_layer_shape) + + new_query_layer_shape = value_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + value_layer = value_layer.view(*new_query_layer_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), + dim=0) + value_layer = torch.cat( + (past_value.type_as(value_layer), value_layer), dim=0) + if get_key_value: + present = (key_layer, value_layer) + + # =================================== + # Raw attention scores. [b, np, sq, sk] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), + query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.contiguous().view( + output_size[2], output_size[0] * output_size[1], -1) + key_layer = key_layer.contiguous().view( + output_size[3], output_size[0] * output_size[1], -1) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.matmul( + query_layer.transpose(0, 1), + key_layer.transpose(0, 1).transpose(1, 2)) / self.norm_factor + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # ================================================== + # Update attention mask for inference. [b, np, sq, sk] + # ================================================== + + if get_key_value: + with torch.no_grad(): + if layer_past is not None: + attention_mask = attention_mask[ + ..., + attention_scores.size(3) + - 1, :attention_scores.size(3)].unsqueeze(2) + else: + attention_mask = attention_mask[ + ..., :attention_scores.size(3), :attention_scores. 
+ size(3)] + + if context_length is not None: + attention_mask = torch.clone(attention_mask) + attention_mask[:, :, context_length:, :] = True + + # attention scores and attention mask [b, np, sq, sk] + # attention_scores = attention_mask_func(attention_scores, attention_mask) + attention_scores = attention_scores - attention_mask * 10000.0 + if self.attention_softmax_in_fp32: + attention_probs = self.softmax(attention_scores.float()).half() + else: + attention_probs = self.softmax(attention_scores) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sq, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), + query_layer.size(0), value_layer.size(3)) + + # change view [sq, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + context_layer = torch.bmm( + attention_probs, + value_layer.unsqueeze(0).transpose(1, 2).squeeze(0)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + ( + self.hidden_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + if get_key_value: + output = [output, present] + + return output + + +class TopQuerySelfAttention(torch.nn.Module): + """Top query self-attention layer abstract class. + + Self-attention layer takes input with size [b, s, h] + and returns output of the same size. 
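+
+    Unlike SelfAttention above, the query projection is computed from a
+    separate query_hidden_state (the top-query embedding), while keys and
+    values still come from hidden_states.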
+ """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + fp16=True, + attention_softmax_in_fp32=True, + ): + super(TopQuerySelfAttention, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.fp16 = fp16 + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.layer_number = max(1, layer_number) + + assert self.hidden_size % self.num_attention_heads == 0 + self.hidden_size_per_attention_head = int(self.hidden_size + // self.num_attention_heads) + + self.query = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.key = torch.nn.Linear(self.hidden_size, self.hidden_size) + self.value = torch.nn.Linear(self.hidden_size, self.hidden_size) + + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + self.softmax = torch.nn.Softmax(dim=-1) + + self.dense = torch.nn.Linear(self.hidden_size, self.hidden_size) + + def forward( + self, + hidden_states, + query_hidden_state, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + + # hidden_states: [sq, b, h] + query_layer = self.query(query_hidden_state) + key_layer = self.key(hidden_states) + value_layer = self.value(hidden_states) + + new_query_layer_shape = query_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + query_layer = query_layer.view(*new_query_layer_shape) + + new_query_layer_shape = key_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head) + key_layer = key_layer.view(*new_query_layer_shape) + + new_query_layer_shape = value_layer.size()[:-1] + ( + self.num_attention_heads, self.hidden_size_per_attention_head + ) # noqa + value_layer = value_layer.view(*new_query_layer_shape) + + # ================================== + # Adjust key and value for inference + # ================================== + + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), + dim=0) + value_layer = torch.cat( + (past_value.type_as(value_layer), value_layer), dim=0) + if get_key_value: + present = (key_layer, value_layer) + + # =================================== + # Raw attention scores. [b, np, sq, sk] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), + query_layer.size(0), key_layer.size(0)) + + # [s, b, np, hn] -> [s, b * np, hn] + query_layer = query_layer.contiguous().view( + output_size[2], output_size[0] * output_size[1], -1) + key_layer = key_layer.contiguous().view( + output_size[3], output_size[0] * output_size[1], -1) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.matmul( + query_layer.transpose(0, 1), + key_layer.transpose(0, 1).transpose(1, 2)) / self.norm_factor + + # change view to [b, np, s, s] + attention_scores = matmul_result.view(*output_size) + + # ================================================== + # Update attention mask for inference. [b, np, sq, sk] + # ================================================== + + if get_key_value: + with torch.no_grad(): + if layer_past is not None: + attention_mask = attention_mask[ + ..., + attention_scores.size(3) + - 1, :attention_scores.size(3)].unsqueeze(2) + else: + attention_mask = attention_mask[ + ..., :attention_scores.size(3), :attention_scores. 
+ size(3)] + + if context_length is not None: + attention_mask = torch.clone(attention_mask) + attention_mask[:, :, context_length:, :] = True + + # attention scores and attention mask [b, np, sq, sk] + # attention_scores = attention_mask_func(attention_scores, attention_mask) + attention_scores = attention_scores - attention_mask * 10000.0 + if self.attention_softmax_in_fp32: + attention_probs = self.softmax(attention_scores.float()).half() + else: + attention_probs = self.softmax(attention_scores) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. + # [sq, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), + query_layer.size(0), value_layer.size(3)) + + # change view [sq, b * np, hn] + value_layer = value_layer.view( + value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], + output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm( + attention_probs, + value_layer.unsqueeze(0).transpose(1, 2).squeeze(0)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size,) # noqa + context_layer = context_layer.view(*new_context_layer_shape) + + # ================= + # Output. [sq, b, h] + # ================= + + output = self.dense(context_layer) + + if get_key_value: + output = [output, present] + + return output + + +class TransformerLayer(torch.nn.Module): + """A single transformer layer. + + Transformore layer takes input with size [b, s, h] and returns an + output of the same size. + """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + layernorm_epsilon=1e-5, + fp16=True, + attention_softmax_in_fp32=True, + ): + super(TransformerLayer, self).__init__() + self.hidden_size = hidden_size + self.layernorm_epsilon = layernorm_epsilon + self.layer_number = layer_number + + # Layernorm on the input data. + self.input_layernorm = torch.nn.LayerNorm( + hidden_size, eps=self.layernorm_epsilon) + + # Self attention. + self.attention = SelfAttention(hidden_size, num_attention_heads, + layer_number, fp16, + attention_softmax_in_fp32) + + # Layernorm on the input data. + self.post_attention_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + self.mlp = MLP(self.hidden_size) + + def forward( + self, + hidden_states, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # hidden_states: [b, s, h] + # Use FP32 for Layernorm + # layernorm_output = self.input_layernorm(hidden_states.float()).half() + layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output = self.attention( + layernorm_output, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + attention_output, presents = attention_output + + # Residual connection. 
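+        # Pre-LayerNorm residual: the un-normalized input is added to the
+        # attention output, and the same pattern is repeated around the MLP
+        # further down.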
+ residual = hidden_states + layernorm_input = attention_output + residual + + # Use FP32 for Layernorm + # layernorm_output = self.post_attention_layernorm(layernorm_input.float()).half() + layernorm_output = self.post_attention_layernorm(layernorm_input) + mlp_output = self.mlp(layernorm_output) + output = mlp_output + layernorm_input + + if get_key_value: + output = [output, presents] + + return output + + +class TopQueryLayer(torch.nn.Module): + """A single top query layer. + + Top query layer takes input with size [b, s, h] and returns an + output of the same size. + """ + + def __init__( + self, + hidden_size, + num_attention_heads, + layer_number, + layernorm_epsilon=1e-5, + ): + super(TopQueryLayer, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.layernorm_epsilon = layernorm_epsilon + self.layer_number = layer_number + + # Use FP32 for Layernorm + self.input_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + + # Self attention. + self.attention = TopQuerySelfAttention(self.hidden_size, + self.num_attention_heads, + self.layer_number) + # Layernorm on the input data. + self.post_attention_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + + # MLP + self.mlp = MLP(self.hidden_size) + + def forward( + self, + hidden_states, + query_hidden_state, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # hidden_states: [b, s, h] + assert query_hidden_state != None # noqa + + # Use FP32 for Layernorm + # layernorm_output = self.input_layernorm(hidden_states.float()).half() + layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output = self.attention( + layernorm_output, + query_hidden_state, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + attention_output, presents = attention_output + + # Residual connection. + residual = hidden_states + layernorm_input = attention_output + residual + + # Use FP32 for Layernorm + # layernorm_output = self.post_attention_layernorm(layernorm_input.float()).half() + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + residual = layernorm_input + output = mlp_output + residual + + if get_key_value: + output = [output, presents] + + return output + + +class Transformer(torch.nn.Module): + """Transformer class.""" + + def __init__( + self, + hidden_size, + num_attention_heads, + num_layers, + layernorm_epsilon=1e-5, + ): + super(Transformer, self).__init__() + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.layernorm_epsilon = layernorm_epsilon + # Number of layers: + self.num_layers = num_layers + self.num_unique_layers = None + + ################# + assert self.num_unique_layers is None + ################# + + if self.num_unique_layers is None: + self.num_unique_layers = self.num_layers + assert self.num_layers % self.num_unique_layers == 0, \ + 'number of layers should be divisible by number of unique layers' + + # Transformer layers. 
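+        # num_unique_layers defaults to num_layers above, so every layer owns
+        # its weights; _get_layer indexes modulo num_unique_layers, which would
+        # allow parameter sharing if fewer unique layers were configured.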
+ def build_layer(layer_number): + return TransformerLayer(self.hidden_size, self.num_attention_heads, + layer_number) + + self.layers = torch.nn.ModuleList( + [build_layer(i + 1) for i in range(self.num_unique_layers)]) + + self.topQueryLayer = TopQueryLayer(self.hidden_size, + self.num_attention_heads, + self.num_unique_layers) + + self.final_layernorm = torch.nn.LayerNorm( + self.hidden_size, eps=self.layernorm_epsilon) + + def _get_layer_index(self, layer_number): + return layer_number % self.num_unique_layers + + def _get_layer(self, layer_number): + return self.layers[self._get_layer_index(layer_number)] + + def forward( + self, + hidden_states, + query_hidden_state, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # data format change to avoid explicit tranposes : [b s h] --> [s b h] + hidden_states = hidden_states.transpose(0, 1).contiguous() + query_hidden_state = query_hidden_state.transpose(0, 1).contiguous() + + if get_key_value: + presents = [] + for index in range(self.num_layers): + layer = self._get_layer(index) + past = None + if layer_past is not None: + past = layer_past[index] + hidden_states = layer( + hidden_states, + attention_mask, + layer_past=past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + if get_key_value: + hidden_states, present = hidden_states + presents.append(present) + + # Use FP32 for Layernorm + # hidden_states_ = self.final_layernorm(hidden_states.float()).half() + hidden_states_ = self.final_layernorm(hidden_states) + + ################################# + # top query layer + ################################# + past = None + if layer_past is not None: + past = layer_past[self.num_layers] + hidden_states = self.topQueryLayer( + hidden_states_, + query_hidden_state, + attention_mask, + layer_past=past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + hidden_states, present = hidden_states + presents.append(present) + + # reverting data format change [s b h] --> [b s h] + output = hidden_states.transpose(0, 1).contiguous() + + if get_key_value: + output = [output, presents] + + return output + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + return self.state_dict(destination, prefix, keep_vars) + + +class Embedding(torch.nn.Module): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + """ + + def __init__( + self, + hidden_size, + vocab_size, + max_sequence_length, + ): + super(Embedding, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + + # Word embeddings. + self.word_embeddings = torch.nn.Embedding(self.vocab_size, + self.hidden_size) + self._word_embeddings_key = 'word_embeddings' + + # Position embedding. + self.position_embeddings = torch.nn.Embedding(self.max_sequence_length, + self.hidden_size) + self.position_embeddings = self.position_embeddings.half() + self._position_embeddings_key = 'position_embeddings' + + def forward(self, input_ids, position_ids): + # Embeddings. 
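+        # Token and absolute position embeddings are summed; the position
+        # table was already cast to fp16 in __init__.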
+ words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = words_embeddings + position_embeddings + + return embeddings + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._word_embeddings_key] \ + = self.word_embeddings.state_dict(destination, prefix, keep_vars) + state_dict_[self._position_embeddings_key] \ + = self.position_embeddings.state_dict( + destination, prefix, keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Word embedding. + if self._word_embeddings_key in state_dict: + state_dict_ = state_dict[self._word_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'word_embeddings' in key: + state_dict_[key.split('word_embeddings.')[1]] \ + = state_dict[key] + state_dict_['weight'] = state_dict_['weight'][:self.vocab_size] + self.word_embeddings.load_state_dict(state_dict_, strict=strict) + + # Position embedding. + if self._position_embeddings_key in state_dict: + state_dict_ = state_dict[self._position_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'position_embeddings' in key: + state_dict_[key.split('position_embeddings.')[1]] \ + = state_dict[key] + self.position_embeddings.load_state_dict(state_dict_, strict=strict) + + +class QueryEmbedding(torch.nn.Module): + """Language model embeddings. + + Arguments: + hidden_size: hidden size + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + """ + + def __init__( + self, + hidden_size, + vocab_size, + max_sequence_length, + ): + super(QueryEmbedding, self).__init__() + + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.max_sequence_length = max_sequence_length + + # Top query position embedding (serial). + self.top_query_embeddings = torch.nn.Embedding( + self.max_sequence_length, self.hidden_size) + self.top_query_embeddings = self.top_query_embeddings.half() + self._top_query_embeddings_key = 'top_query_embeddings' + + def forward(self, position_ids): + # Embeddings. + embeddings = self.top_query_embeddings(position_ids) + + return embeddings + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._top_query_embeddings_key] \ + = self.top_query_embeddings.state_dict( + destination, prefix, keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Position embedding. + if self._top_query_embeddings_key in state_dict: + state_dict_ = state_dict[self._top_query_embeddings_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'top_query_embeddings' in key: + state_dict_[key.split('top_query_embeddings.')[1]] \ + = state_dict[key] + self.top_query_embeddings.load_state_dict(state_dict_, strict=strict) + + +class TransformerLanguageModel(torch.nn.Module): + """Transformer language model. + + Arguments: + transformer_hparams: transformer hyperparameters + attention_mask_func: a function that takes `unmaksed-attention-scores` + with size [b, np, s, s] and an `attention-mask` and will apply + the masking. 
The function should return a masked score of the + same size [b, np, s, s]. + masked-attention-scores = attention_mask_func( + unmaksed-attention-scores, attention-mask) + vocab_size: vocabulary size + max_sequence_length: maximum size of sequence. This + is used for positional embedding + """ + + def __init__( + self, + hidden_size, + num_layers, + num_attention_heads, + padded_vocab_size, + max_position_embeddings, + ): + super(TransformerLanguageModel, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.num_attention_heads = num_attention_heads + self.padded_vocab_size = padded_vocab_size + self.max_position_embeddings = max_position_embeddings + + # Embeddings + self.embedding = Embedding(self.hidden_size, self.padded_vocab_size, + self.max_position_embeddings) + self._embedding_key = 'embedding' + + # Query embeddings + self.topQueryEmbedding = QueryEmbedding(self.hidden_size, + self.padded_vocab_size, + self.max_position_embeddings) + self._topQueryEmbedding_key = 'topQueryEmbedding' + + # Transformer + self.transformer = Transformer(self.hidden_size, + self.num_attention_heads, + self.num_layers) + self._transformer_key = 'transformer' + + def forward( + self, + input_ids, + position_ids, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + + # Embeddings. + embedding_output = self.embedding(input_ids, position_ids) + query_position_ids = position_ids + queryEmbedding_out = self.topQueryEmbedding(query_position_ids) + + # Transformer. + transformer_output = self.transformer( + embedding_output, + queryEmbedding_out, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + return transformer_output + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + """For easy load.""" + + state_dict_ = {} + state_dict_[self._embedding_key] \ + = self.embedding.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + state_dict_[self._topQueryEmbedding_key] \ + = self.topQueryEmbedding.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + state_dict_[self._transformer_key] \ + = self.transformer.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + # Embedding. + if self._embedding_key in state_dict: + state_dict_ = state_dict[self._embedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.embedding.load_state_dict(state_dict_, strict=strict) + + if self._topQueryEmbedding_key in state_dict: + state_dict_ = state_dict[self._topQueryEmbedding_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if '_embeddings' in key: + state_dict_[key] = state_dict[key] + self.topQueryEmbedding.load_state_dict(state_dict_, strict=strict) + + # Transformer. + if self._transformer_key in state_dict: + state_dict_ = state_dict[self._transformer_key] + else: + # for backward compatibility. + state_dict_ = {} + for key in state_dict.keys(): + if 'transformer.' 
in key: + state_dict_[key.split('transformer.')[1]] = state_dict[key] + self.transformer.load_state_dict(state_dict_, strict=strict) + + +class CodeGeeXModel(torch.nn.Module): + """CodeGeeX: A Multilingual Code Generation Model.""" + + def __init__( + self, + hidden_size, + num_layers, + num_attention_heads, + padded_vocab_size, + max_position_embeddings, + ): + super(CodeGeeXModel, self).__init__() + + self.language_model = TransformerLanguageModel( + hidden_size, num_layers, num_attention_heads, padded_vocab_size, + max_position_embeddings) + self._language_model_key = 'language_model' + + def forward( + self, + input_ids, + position_ids, + attention_mask, + layer_past=None, + get_key_value=False, + prompt_length=None, + context_length=None, + ): + # Language model. + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length) + + if get_key_value: + lm_output, presents = lm_output + + output = F.linear( + lm_output, + self.language_model.embedding.word_embeddings.weight.half()) + + if get_key_value: + output = [output, presents] + + return output + + def state_dict_for_save_checkpoint(self, + destination=None, + prefix='', + keep_vars=False): + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + destination, prefix, keep_vars) + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + if self._language_model_key in state_dict: + state_dict = state_dict[self._language_model_key] + self.language_model.load_state_dict(state_dict, strict=strict) diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py new file mode 100755 index 00000000..0e9d161b --- /dev/null +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 Zhipu.AI + +import copy +import os +import random +import time +from typing import Dict + +import numpy as np +import torch +from IPython import embed + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor, TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from .codegeex import CodeGeeXModel +from .inference import get_token_stream +from .tokenizer import CodeGeeXTokenizer + + +def model_provider(): + """Build the model.""" + + hidden_size = 5120 + num_attention_heads = 40 + num_layers = 39 + padded_vocab_size = 52224 + max_position_embeddings = 2048 + + model = CodeGeeXModel(hidden_size, num_layers, num_attention_heads, + padded_vocab_size, max_position_embeddings) + + return model + + +@MODELS.register_module(Tasks.code_translation, module_name=Models.codegeex) +class CodeGeeXForCodeTranslation(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the fast poem model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
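+                The directory is expected to contain a tokenizer/ sub-folder
+                and the ckpt_ms_translation_0817.pt checkpoint loaded below.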
+ """ + super().__init__(model_dir, *args, **kwargs) + + # loading tokenizer + print('Loading tokenizer ...') + self.tokenizer = CodeGeeXTokenizer( + tokenizer_path=model_dir + '/tokenizer', mode='codegeex-13b') + # loading model + state_dict_path = model_dir + '/ckpt_ms_translation_0817.pt' + print('Loading state dict ...') + state_dict = torch.load(state_dict_path, map_location='cpu') + state_dict = state_dict['module'] + + print('Building CodeGeeX model ...') + self.model = model_provider() + self.model.load_state_dict(state_dict) + self.model.eval() + self.model.half() + self.model.cuda() + + def forward(self, input: Dict[str, str]) -> Dict[str, str]: + micro_batch_size = 1 + seq_length = 2048 + out_seq_length = 256 + bad_ids = None + print('Generating ...') + src_lang = input['source language'] + dst_lang = input['target language'] + prompt = input['prompt'] + prompt = f'code translation\n{src_lang}:\n{prompt}\n{dst_lang}:\n' + t0 = time.perf_counter() + tokenizer = self.tokenizer + model = self.model + for prompt in [prompt]: + tokens = tokenizer.encode_code(prompt) + print(tokens) + print('Current prompt:') + print(prompt) + n_token_prompt = len(tokens) + print('N_token_prompt:', n_token_prompt) + token_stream = get_token_stream( + model, + tokenizer, + seq_length, + out_seq_length, + [copy.deepcopy(tokens) for _ in range(micro_batch_size)], + micro_batch_size=micro_batch_size, + bad_ids=bad_ids, + greedy=True, + ) + is_finished = [False for _ in range(micro_batch_size)] + for i, generated in enumerate(token_stream): + generated_tokens = generated[0] + for j in range(micro_batch_size): + if is_finished[j]: + continue + if generated_tokens[j].cpu().numpy( + )[-1] == tokenizer.eos_token_id or len( + generated_tokens[j]) >= out_seq_length: + is_finished[j] = True + generated_tokens_ = generated_tokens[j].cpu().numpy( + ).tolist() + generated_code = tokenizer.decode_code( + generated_tokens_[n_token_prompt:]) + generated_code = ''.join(generated_code) + t1 = time.perf_counter() + print('Total generation time:', t1 - t0, '# Tokens:', + len(generated_tokens_) - n_token_prompt) + print( + f'{(t1 - t0) / (len(generated_tokens_) - n_token_prompt)}s/token' + ) + print( + '================================= Generated code:' + ) + print(generated_code) + t0 = time.perf_counter() + if all(is_finished): + break + + print('Generation finished.') + return {OutputKeys.TEXT: generated_code} diff --git a/modelscope/models/nlp/codegeex/inference.py b/modelscope/models/nlp/codegeex/inference.py new file mode 100755 index 00000000..76a9458b --- /dev/null +++ b/modelscope/models/nlp/codegeex/inference.py @@ -0,0 +1,335 @@ +import copy +import os +import time +import typing +from dataclasses import dataclass + +import json +import torch +import torch.nn.functional as F + + +def get_ltor_masks_and_position_ids( + data, + eod_token, + reset_position_ids, + reset_attention_mask, +): + """Build masks and position id for left to right model.""" + + # Extract batch size and sequence length. + micro_batch_size, seq_length = data.size() + + # Attention mask (lower triangular). + if reset_attention_mask: + att_mask_batch = micro_batch_size + else: + att_mask_batch = 1 + attention_mask = torch.tril( + torch.ones((att_mask_batch, seq_length, seq_length), + device=data.device)).view(att_mask_batch, 1, seq_length, + seq_length) + + # Position ids. 
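+    # Positions run 0..seq_length-1 and are broadcast over the batch; they are
+    # cloned and restarted after each EOD token only when reset_position_ids
+    # is set.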
+ position_ids = torch.arange( + seq_length, dtype=torch.long, device=data.device) + position_ids = position_ids.unsqueeze(0).expand_as(data) + # We need to clone as the ids will be modifed based on batch index. + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, data[b] == eod_token] + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1):, :(i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1):] -= i + 1 - prev_index + prev_index = i + 1 + + # Convert attention mask to binary: + attention_mask = attention_mask < 0.5 + + return attention_mask, position_ids + + +def get_batch( + context_tokens, + micro_batch_size, + eod_token, + reset_position_ids=False, + reset_attention_mask=False, +): + """Generate batch from context tokens.""" + tokens = context_tokens.view(micro_batch_size, -1).contiguous().cuda() + # Get the attention mask and postition ids. + attention_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + eod_token, + reset_position_ids, + reset_attention_mask, + ) + + return tokens, attention_mask, position_ids + + +def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """This function has been mostly taken from huggingface conversational + ai code at + https://medium.com/huggingface/how-to-build-a-state-of-the-art- + conversational-ai-with-transfer-learning-2d818ac26313""" + + if top_k > 0: + # Remove all tokens with a probability less than the + # last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, + None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + # Cconvert to 1D + sorted_logits, sorted_indices = torch.sort( + logits, descending=True, dim=-1) + cumulative_probs = torch.cumsum( + F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token + # above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ + ..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + for i in range(sorted_indices.size(0)): + indices_to_remove = sorted_indices[i][sorted_indices_to_remove[i]] + logits[i][indices_to_remove] = filter_value + + return logits + + +def pad_batch(batch, pad_id, seq_length): + context_lengths = [] + for tokens in batch: + context_length = len(tokens) + if context_length < seq_length: + tokens.extend([pad_id] * (seq_length - context_length)) + context_lengths.append(context_length) + return batch, context_lengths + + +def forward_step( + model, + tokens, + seq_length, + position_ids, + attention_mask, + layer_past=None, + get_key_value=None, + prompt_length=None, + context_length=None, +): + # Forward pass through the model. 
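+    # When get_key_value is set the model returns (logits, present key/value
+    # cache); the cache is passed back in as layer_past on later incremental
+    # decoding steps so only the newest token has to be re-encoded.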
+ output_tensor = model( + tokens, + position_ids, + attention_mask, + layer_past=layer_past, + get_key_value=get_key_value, + prompt_length=prompt_length, + context_length=context_length, + ) + + if get_key_value: + output_tensor, layer_past = output_tensor + + if get_key_value: + return output_tensor, layer_past + + return output_tensor + + +def get_token_stream( + model, + tokenizer, + seq_length, + out_seq_length, + context_tokens, + return_scores: bool = False, + prompt_length: int = None, + micro_batch_size: int = None, + bad_ids: List = None, + temperature: float = 1.0, + topp: float = 1.0, + topk: int = 0.0, + greedy: bool = False, +): + context_tokens, context_lengths = pad_batch(context_tokens, + tokenizer.eos_token_id, + seq_length) + + context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + context_length_tensor = torch.cuda.LongTensor(context_lengths) + context_length = context_length_tensor.min().item() + tokens, attention_mask, position_ids = get_batch( + context_tokens_tensor, + micro_batch_size, + tokenizer.eos_token_id, + ) + + batch_token_iterator = sample_sequence_batch( + model, + tokenizer, + context_tokens_tensor, + context_length_tensor, + attention_mask, + position_ids, + seq_length=seq_length, + out_seq_length=out_seq_length, + return_scores=return_scores, + prompt_length=prompt_length, + bad_ids=bad_ids, + temperature=temperature, + topp=topp, + topk=topk, + greedy=greedy, + ) + + for tokens, lengths in batch_token_iterator: + context_length += 1 + if tokens is not None: + yield tokens[:, :context_length], lengths + else: + yield None, None + + +def switch(val1, val2, boolean): + boolean = boolean.type_as(val1) + return (1 - boolean) * val1 + boolean * val2 + + +def sample_sequence_batch( + model, + tokenizer, + context_tokens, + context_lengths, + attention_mask, + position_ids, + seq_length, + out_seq_length, + maxlen=None, + return_scores: bool = False, + prompt_length: int = None, + bad_ids: List = None, + temperature: float = 1.0, + topp: float = 1.0, + topk: int = 0.0, + recompute: bool = False, + greedy: bool = False, +): + model.eval() + with torch.no_grad(): + context_length = context_lengths.min().item() + eos_id = tokenizer.eos_token_id + + counter = 0 + org_context_length = context_length + + layer_past = None + batch_size = context_tokens.size(0) + is_done = torch.zeros([batch_size]).byte().cuda() + tokens = context_tokens + if maxlen is None: + maxlen = seq_length - 1 + if maxlen > (org_context_length + out_seq_length): + maxlen = org_context_length + out_seq_length + + lengths = torch.ones([batch_size]).long().cuda() * maxlen + if return_scores: + scores = torch.zeros([batch_size]).float().cuda() + + while context_length <= (maxlen): + + if recompute: + logits = model( + tokens, + position_ids, + attention_mask, + prompt_length=prompt_length, + context_length=context_length, + ) + logits = logits[:, context_length - 1, :] + else: + if counter == 0: + tokens2use = tokens[:, :context_length] + positions2use = position_ids[:, :context_length] + else: + tokens2use = tokens[:, context_length - 1].view( + batch_size, -1) + positions2use = position_ids[:, context_length - 1].view( + batch_size, -1) + logits, layer_past = model( + tokens2use, + positions2use, + attention_mask, + layer_past=layer_past, + get_key_value=True, + prompt_length=prompt_length, + context_length=context_length, + ) + logits = logits[:, -1].view(batch_size, -1).contiguous() + + if bad_ids is not None: + for bad_id in bad_ids: + logits[:, bad_id] = -10000 + if greedy: + prev 
= torch.argmax(logits, dim=-1).view(-1) + else: + logits = logits.float() + if return_scores: + orig_log_probs = torch.log_softmax(logits, dim=-1) + logits /= temperature + logits = top_k_logits(logits, top_k=topk, top_p=topp) + log_probs = F.softmax(logits, dim=-1) + prev = torch.multinomial(log_probs, num_samples=1).view(-1) + + started = context_lengths <= context_length + + new_tokens = switch(tokens[:, context_length].view(-1), prev, + started) + + if not greedy and return_scores: + indices = prev.view(-1, 1) + new_scores = orig_log_probs.gather(1, indices).view(-1) + new_scores = new_scores * started + new_scores = new_scores * is_done.bool().logical_not() + scores += new_scores + + tokens[:, context_length] = new_tokens + done_token = (prev == eos_id).byte() & started.byte() + just_finished = (done_token & ~is_done).bool() + lengths[just_finished.view(-1)] = context_length + is_done = is_done | done_token + done = torch.all(is_done) + + if return_scores: + yield tokens, (lengths, scores) + else: + yield tokens, lengths + + context_length += 1 + counter += 1 + if done: + break diff --git a/modelscope/models/nlp/codegeex/tokenizer.py b/modelscope/models/nlp/codegeex/tokenizer.py new file mode 100755 index 00000000..66958d7d --- /dev/null +++ b/modelscope/models/nlp/codegeex/tokenizer.py @@ -0,0 +1,186 @@ +import typing + +import torch +from transformers import AutoTokenizer +from transformers.models.gpt2 import GPT2TokenizerFast + + +def encode_whitespaces(text, start_extra_id: int, max_len: int): + """ Encode whitespaces to extra tokens in GPT-J. + + >>> encode_whitespaces('a\\n b\\n c', 10, 10) + 'a\\n<|extratoken_10|>b\\n<|extratoken_11|>c' + """ + + def push_acc_space(acc_len: int, text: str): + if acc_len == 0: + return text + if acc_len == 1: + return text + ' ' + assert acc_len <= max_len, f'Max whitespace run length {max_len}, but found {acc_len}' + extra_id = start_extra_id - 2 + acc_len + extra_token = f'<|extratoken_{extra_id}|>' + return text + extra_token + + acc_len = 0 + res = '' + for ch in text: + if ch == ' ': + acc_len += 1 + if acc_len == max_len: + res = push_acc_space(acc_len, res) + acc_len = 0 + else: + res = push_acc_space(acc_len, res) + acc_len = 0 + res = res + ch + + res = push_acc_space(acc_len, res) + + return res + + +def decode_whitespaces(text: str, start_extra_id: int, max_len: int): + """ Decode the whitespace-encoded strings produced by encode_whitespace. 
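+    Each <|extratoken_N|> emitted for a run of 2..max_len spaces is replaced
+    by the corresponding run of literal spaces, so decoding inverts encoding.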
+ + >>> text = 'a\\n b\\n c' + >>> s, l = 10, 10 + >>> text == decode_whitespaces(encode_whitespaces(text, s, l), s, l) + True + """ + for l in range(2, max_len + 1): # noqa + token_id = start_extra_id - 2 + l + token = f'<|extratoken_{token_id}|>' + text = text.replace(token, ' ' * l) + return text + + +class Code13BDictionary(object): + + def __init__( + self, + dict_file: str, + extra_token_ids: List[str] = None, + pad_to_vocab_size: int = -1, + ): + self._idx = dict() + self._count = dict() + self._num_symbols = 0 + self._symbols = [] + + self._add_symbol('', 0) + self._add_symbol('', 0) + self._add_symbol('', 0) + self._add_symbol('', 0) + self._load_dict(dict_file) + + if extra_token_ids is None: + extra_token_ids = [str(x) for x in range(50257, 50400) + ] # follows GPT-J settings + + for token_id in extra_token_ids: + self._add_symbol(token_id, 0) + + if pad_to_vocab_size > 0: + self._pad_to_vocab_size(pad_to_vocab_size) + + def _pad_to_vocab_size(self, vocab_size: int): + num_pad = vocab_size - len(self) + if num_pad <= 0: + return + for i in range(1, num_pad + 1): + self._add_symbol('vocab_pad_token{}'.format(i), 0) + + def _load_dict(self, dict_file: str): + with open(dict_file, 'r') as f: + for line in f: + line = line.strip() + if line == '' or line.startswith('#'): + continue + sym, count = line.split() + self._add_symbol(sym, int(count)) + + def _add_symbol(self, sym: str, count: int): + self._idx[sym] = self._num_symbols + self._count[sym] = count + self._symbols.append(sym) + self._num_symbols += 1 + + def __len__(self): + return self._num_symbols + + def index(self, sym: str): + return self._idx[sym] + + def string(self, idx: int): + return self._symbols[idx] + + def map_token(self, token: Union[int, str]): + if isinstance(token, int): + token = str(token) + return self.index(token) + + def map_tokens(self, tokens): + return [self.map_token(token) for token in tokens] + + def decode_tokens(self, tokens): + decoded = [ + '50256' if token == 50256 else self.string(token) + for token in tokens + ] + return [int(x) for x in decoded if not x.startswith('vocab_pad_token')] + + +class CodeGeeXTokenizer(object): + + def __init__( + self, + tokenizer: GPT2TokenizerFast = None, + tokenizer_path: str = 'EleutherAI/gpt-j-6B', + start_extra_id: int = 10, + max_len: int = 10, + mode='codegeex-13b', + dict_file: str = None, + ): + self.tokenizer = tokenizer if tokenizer is not None else AutoTokenizer.from_pretrained( + tokenizer_path) + if mode not in ['codegeex-13b', 'codegeex-python-13b']: + raise ValueError( + f"Invalid mode {mode}, choose from ['codegeex-13b', 'codegeex-python-13b']" + ) + self.start_extra_id = start_extra_id + self.max_len = max_len + self.mode = mode + if dict_file is not None: + self.code_dict = Code13BDictionary( + dict_file, pad_to_vocab_size=51200 + ) if self.mode == 'codegeex-python-13b' else None + else: + self.code_dict = None + self.eos_token_id = self.tokenizer.eos_token_id + + def encode_code(self, code: str): + if self.mode == 'codegeex-13b': + code = encode_whitespaces(code, self.start_extra_id, self.max_len) + input_ids = self.tokenizer( + code, is_split_into_words=False).input_ids + + elif self.mode == 'codegeex-python-13b': + code = encode_whitespaces(code, self.start_extra_id, self.max_len) + input_ids = self.code_dict.map_tokens(self.tokenizer.encode(code)) + input_ids = torch.LongTensor(input_ids).reshape(1, -1) + + return input_ids + + def decode_code(self, input_ids): + if self.mode == 'codegeex-13b': + text = self.tokenizer.decode(input_ids, 
skip_special_tokens=False) + output_code = decode_whitespaces(text, self.start_extra_id, + self.max_len) + elif self.mode == 'codegeex-python-13b': + input_ids = [self.code_dict.decode_tokens(input_ids.tolist()[0])] + text = self.tokenizer.decode(input_ids, skip_special_tokens=False) + output_code = decode_whitespaces(text, self.start_extra_id, + self.max_len) + + return output_code diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 1206ae08..3ffe7b93 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: from .word_segmentation_pipeline import WordSegmentationPipeline from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline + from .codegeex_code_translation_pipeline import CodeGeeXCodeTranslationPipeline from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ WordSegmentationThaiPipeline @@ -73,6 +74,8 @@ else: 'zero_shot_classification_pipeline': ['ZeroShotClassificationPipeline'], 'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'], + 'codegeex_code_translation_pipeline': + ['CodeGeeXCodeTranslationPipeline'], 'multilingual_word_segmentation_pipeline': [ 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py new file mode 100755 index 00000000..3c7374da --- /dev/null +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022 Zhipu.AI + +from typing import Any, Dict, Optional, Union + +from modelscope.metainfo import Pipelines +from modelscope.models.base import Model +from modelscope.models.nlp import CodeGeeXForCodeTranslation +from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import CodeGeeXPreprocessor, Preprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + group_key=Tasks.code_translation, + module_name=Pipelines.codegeex_code_translation) +class CodeGeeXCodeTranslationPipeline(Pipeline): + + def __init__(self, + model: Union[CodeGeeXForCodeTranslation, str], + preprocessor: [Preprocessor] = None, + *args, + **kwargs): + model = CodeGeeXForCodeTranslation(model) if isinstance(model, + str) else model + self.model = model + self.model.eval() + self.model.half() + self.model.cuda() + if preprocessor is None: + preprocessor = CodeGeeXPreprocessor() + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + # define the forward pass + def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: + # check input format + for para in ['prompt', 'source language', 'target language']: + if para not in inputs: + return ('please check your input format.') + return self.model(inputs) + + # format the outputs from pipeline + def postprocess(self, input, **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/preprocessors/__init__.py b/modelscope/preprocessors/__init__.py index 0db1c7e0..ce053459 100644 --- a/modelscope/preprocessors/__init__.py +++ b/modelscope/preprocessors/__init__.py @@ -23,7 +23,7 @@ if TYPE_CHECKING: SentenceEmbeddingPreprocessor, SequenceClassificationPreprocessor, TokenClassificationPreprocessor, TextErrorCorrectionPreprocessor, 
TextGenerationPreprocessor, Text2TextGenerationPreprocessor, Tokenize, - WordSegmentationBlankSetToLabelPreprocessor, + WordSegmentationBlankSetToLabelPreprocessor, CodeGeeXPreprocessor, MGLMSummarizationPreprocessor, ZeroShotClassificationPreprocessor, TextGenerationJiebaPreprocessor, SentencePiecePreprocessor, DialogIntentPredictionPreprocessor, DialogModelingPreprocessor, @@ -57,7 +57,7 @@ else: 'TextErrorCorrectionPreprocessor', 'TextGenerationPreprocessor', 'Tokenize', 'Text2TextGenerationPreprocessor', 'WordSegmentationBlankSetToLabelPreprocessor', - 'MGLMSummarizationPreprocessor', + 'MGLMSummarizationPreprocessor', 'CodeGeeXPreprocessor', 'ZeroShotClassificationPreprocessor', 'TextGenerationJiebaPreprocessor', 'SentencePiecePreprocessor', 'NERPreprocessorViet', 'NERPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 7c48fb3c..2121543a 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -30,6 +30,7 @@ if TYPE_CHECKING: from .space_T_en import ConversationalTextToSqlPreprocessor from .space_T_cn import TableQuestionAnsweringPreprocessor from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor + from .codegeex_preprocessor import CodeGeeXPreprocessor else: _import_structure = { 'nlp_base': [ @@ -64,6 +65,7 @@ else: 'TextErrorCorrectionPreprocessor', ], 'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'], + 'codegeex_preprocessor': ['CodeGeeXPreprocessor'], 'token_classification_thai_preprocessor': [ 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/codegeex_preprocessor.py b/modelscope/preprocessors/nlp/codegeex_preprocessor.py new file mode 100755 index 00000000..f5f462f6 --- /dev/null +++ b/modelscope/preprocessors/nlp/codegeex_preprocessor.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 Zhipu.AI + +import re +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +from modelscope.metainfo import Models, Preprocessors +from modelscope.preprocessors.base import Preprocessor +from modelscope.preprocessors.builder import PREPROCESSORS +from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile +from modelscope.utils.type_assert import type_assert + + +@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.codegeex) +class CodeGeeXPreprocessor(Preprocessor): + + def __init__(self, *args, **kwargs): + """preprocess the data + Args: + model_dir (str): model path + """ + super().__init__(*args, **kwargs) + + @type_assert(object, (str, tuple, Dict)) + def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: + return data diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index b1bccc4c..bf3f8fb9 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -120,6 +120,7 @@ class NLPTasks(object): fill_mask = 'fill-mask' text_summarization = 'text-summarization' question_answering = 'question-answering' + code_translation = 'code-translation' zero_shot_classification = 'zero-shot-classification' backbone = 'backbone' text_error_correction = 'text-error-correction' diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py new file mode 100644 index 00000000..d2fd5369 --- /dev/null +++ b/tests/pipelines/test_CodeGeeX_code_translation.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
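+# End-to-end check of the CodeGeeX code-translation pipeline; the weights are
+# pulled from the ModelScope hub and the pipeline moves the model to CUDA, so
+# running this test requires a GPU with room for the 13B checkpoint.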
+import os +import unittest + +from modelscope.models import Model +from modelscope.pipelines import pipeline +from modelscope.preprocessors import CodeGeeXPreprocessor +from modelscope.utils.constant import Tasks +from modelscope.utils.demo_utils import DemoCompatibilityCheck +from modelscope.utils.test_utils import test_level + + +class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): + + def setUp(self) -> None: + self.output_dir = 'unittest_output' + os.makedirs(self.output_dir, exist_ok=True) + + @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') + def test_run_with_CodeGeeX_with_name(self): + model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' + preprocessor = CodeGeeXPreprocessor() + pipe = pipeline( + task=Tasks.code_translation, + model=model, + preprocessor=preprocessor, + ) + inputs = { + 'prompt': 'for i in range(10):\n\tprint(i)\n', + 'source language': 'Python', + 'target language': 'C++' + } + result = pipe(inputs) + print(result) + + +if __name__ == '__main__': + unittest.main() From f171552ee3bbc0d334a9a360cebaa3973bf526d5 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Thu, 24 Nov 2022 10:50:38 +0800 Subject: [PATCH 2/6] updated --- modelscope/models/nlp/codegeex/__init__.py | 2 +- modelscope/models/nlp/codegeex/codegeex.py | 2 +- .../codegeex/codegeex_for_code_translation.py | 43 ++++++------------- modelscope/models/nlp/codegeex/inference.py | 41 ++---------------- modelscope/models/nlp/codegeex/tokenizer.py | 4 +- .../nlp/codegeex_code_translation_pipeline.py | 17 ++++---- modelscope/preprocessors/nlp/__init__.py | 2 - .../nlp/codegeex_preprocessor.py | 25 ----------- .../test_CodeGeeX_code_translation.py | 6 +-- 9 files changed, 29 insertions(+), 113 deletions(-) delete mode 100755 modelscope/preprocessors/nlp/codegeex_preprocessor.py diff --git a/modelscope/models/nlp/codegeex/__init__.py b/modelscope/models/nlp/codegeex/__init__.py index 6ee72f80..08add0b0 100755 --- a/modelscope/models/nlp/codegeex/__init__.py +++ b/modelscope/models/nlp/codegeex/__init__.py @@ -1,6 +1,6 @@ # Modified by Zhipu.AI # Original Copyright (c) Alibaba, Inc. and its affiliates. 
-from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/nlp/codegeex/codegeex.py b/modelscope/models/nlp/codegeex/codegeex.py index 7a1b76a3..f8d43008 100755 --- a/modelscope/models/nlp/codegeex/codegeex.py +++ b/modelscope/models/nlp/codegeex/codegeex.py @@ -1,8 +1,8 @@ +# Copyright (c) 2022 Zhipu.AI import math import torch import torch.nn.functional as F -from torch.nn.parameter import Parameter def fast_gelu(x): diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py index 0e9d161b..be3e79f0 100755 --- a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py @@ -1,20 +1,15 @@ # Copyright (c) 2022 Zhipu.AI - import copy -import os -import random -import time -from typing import Dict +from typing import Any, Dict -import numpy as np import torch -from IPython import embed from modelscope.metainfo import Models -from modelscope.models.base import Tensor, TorchModel +from modelscope.models.base import TorchModel from modelscope.models.builder import MODELS from modelscope.outputs import OutputKeys -from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger from .codegeex import CodeGeeXModel from .inference import get_token_stream from .tokenizer import CodeGeeXTokenizer @@ -45,18 +40,18 @@ class CodeGeeXForCodeTranslation(TorchModel): model_dir (str): the model path. """ super().__init__(model_dir, *args, **kwargs) - + logger = get_logger() # loading tokenizer - print('Loading tokenizer ...') + logger.info('Loading tokenizer ...') self.tokenizer = CodeGeeXTokenizer( tokenizer_path=model_dir + '/tokenizer', mode='codegeex-13b') # loading model state_dict_path = model_dir + '/ckpt_ms_translation_0817.pt' - print('Loading state dict ...') + logger.info('Loading state dict ...') state_dict = torch.load(state_dict_path, map_location='cpu') state_dict = state_dict['module'] - print('Building CodeGeeX model ...') + logger.info('Building CodeGeeX model ...') self.model = model_provider() self.model.load_state_dict(state_dict) self.model.eval() @@ -68,21 +63,16 @@ class CodeGeeXForCodeTranslation(TorchModel): seq_length = 2048 out_seq_length = 256 bad_ids = None - print('Generating ...') src_lang = input['source language'] dst_lang = input['target language'] prompt = input['prompt'] prompt = f'code translation\n{src_lang}:\n{prompt}\n{dst_lang}:\n' - t0 = time.perf_counter() + logger = get_logger() tokenizer = self.tokenizer model = self.model for prompt in [prompt]: tokens = tokenizer.encode_code(prompt) - print(tokens) - print('Current prompt:') - print(prompt) n_token_prompt = len(tokens) - print('N_token_prompt:', n_token_prompt) token_stream = get_token_stream( model, tokenizer, @@ -108,19 +98,10 @@ class CodeGeeXForCodeTranslation(TorchModel): generated_code = tokenizer.decode_code( generated_tokens_[n_token_prompt:]) generated_code = ''.join(generated_code) - t1 = time.perf_counter() - print('Total generation time:', t1 - t0, '# Tokens:', - len(generated_tokens_) - n_token_prompt) - print( - f'{(t1 - t0) / (len(generated_tokens_) - n_token_prompt)}s/token' - ) - print( - '================================= Generated code:' - ) - print(generated_code) - t0 = time.perf_counter() + logger.info('================================= Generated 
code:') + logger.info(generated_code) if all(is_finished): break - print('Generation finished.') + logger.info('Generation finished.') return {OutputKeys.TEXT: generated_code} diff --git a/modelscope/models/nlp/codegeex/inference.py b/modelscope/models/nlp/codegeex/inference.py index 76a9458b..d058f023 100755 --- a/modelscope/models/nlp/codegeex/inference.py +++ b/modelscope/models/nlp/codegeex/inference.py @@ -1,12 +1,8 @@ -import copy -import os -import time -import typing -from dataclasses import dataclass +# Copyright (c) 2022 Zhipu.AI -import json import torch import torch.nn.functional as F +from typing import List def get_ltor_masks_and_position_ids( @@ -128,38 +124,7 @@ def pad_batch(batch, pad_id, seq_length): tokens.extend([pad_id] * (seq_length - context_length)) context_lengths.append(context_length) return batch, context_lengths - - -def forward_step( - model, - tokens, - seq_length, - position_ids, - attention_mask, - layer_past=None, - get_key_value=None, - prompt_length=None, - context_length=None, -): - # Forward pass through the model. - output_tensor = model( - tokens, - position_ids, - attention_mask, - layer_past=layer_past, - get_key_value=get_key_value, - prompt_length=prompt_length, - context_length=context_length, - ) - - if get_key_value: - output_tensor, layer_past = output_tensor - - if get_key_value: - return output_tensor, layer_past - - return output_tensor - + def get_token_stream( model, diff --git a/modelscope/models/nlp/codegeex/tokenizer.py b/modelscope/models/nlp/codegeex/tokenizer.py index 66958d7d..cc507eb6 100755 --- a/modelscope/models/nlp/codegeex/tokenizer.py +++ b/modelscope/models/nlp/codegeex/tokenizer.py @@ -1,8 +1,8 @@ -import typing - +# Copyright (c) 2022 Zhipu.AI import torch from transformers import AutoTokenizer from transformers.models.gpt2 import GPT2TokenizerFast +from typing import List, Union def encode_whitespaces(text, start_extra_id: int, max_len: int): diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index 3c7374da..f2bce381 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -1,13 +1,12 @@ # Copyright (c) 2022 Zhipu.AI -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Union from modelscope.metainfo import Pipelines -from modelscope.models.base import Model from modelscope.models.nlp import CodeGeeXForCodeTranslation -from modelscope.pipelines.base import Pipeline, Tensor +from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import CodeGeeXPreprocessor, Preprocessor +from modelscope.preprocessors import Preprocessor from modelscope.utils.constant import Tasks @@ -27,16 +26,18 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): self.model.eval() self.model.half() self.model.cuda() - if preprocessor is None: - preprocessor = CodeGeeXPreprocessor() - super().__init__(model=model, preprocessor=preprocessor, **kwargs) + + super().__init__(model=model, **kwargs) + + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: + return inputs # define the forward pass def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: # check input format for para in ['prompt', 'source language', 'target language']: if para not in inputs: - return ('please check your input format.') + raise Exception('please check your input 
format.') return self.model(inputs) # format the outputs from pipeline diff --git a/modelscope/preprocessors/nlp/__init__.py b/modelscope/preprocessors/nlp/__init__.py index 2121543a..7c48fb3c 100644 --- a/modelscope/preprocessors/nlp/__init__.py +++ b/modelscope/preprocessors/nlp/__init__.py @@ -30,7 +30,6 @@ if TYPE_CHECKING: from .space_T_en import ConversationalTextToSqlPreprocessor from .space_T_cn import TableQuestionAnsweringPreprocessor from .mglm_summarization_preprocessor import MGLMSummarizationPreprocessor - from .codegeex_preprocessor import CodeGeeXPreprocessor else: _import_structure = { 'nlp_base': [ @@ -65,7 +64,6 @@ else: 'TextErrorCorrectionPreprocessor', ], 'mglm_summarization_preprocessor': ['MGLMSummarizationPreprocessor'], - 'codegeex_preprocessor': ['CodeGeeXPreprocessor'], 'token_classification_thai_preprocessor': [ 'NERPreprocessorThai', 'WordSegmentationPreprocessorThai', diff --git a/modelscope/preprocessors/nlp/codegeex_preprocessor.py b/modelscope/preprocessors/nlp/codegeex_preprocessor.py deleted file mode 100755 index f5f462f6..00000000 --- a/modelscope/preprocessors/nlp/codegeex_preprocessor.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2022 Zhipu.AI - -import re -from typing import Any, Dict, Iterable, Optional, Tuple, Union - -from modelscope.metainfo import Models, Preprocessors -from modelscope.preprocessors.base import Preprocessor -from modelscope.preprocessors.builder import PREPROCESSORS -from modelscope.utils.constant import Fields, InputFields, ModeKeys, ModelFile -from modelscope.utils.type_assert import type_assert - - -@PREPROCESSORS.register_module(Fields.nlp, module_name=Preprocessors.codegeex) -class CodeGeeXPreprocessor(Preprocessor): - - def __init__(self, *args, **kwargs): - """preprocess the data - Args: - model_dir (str): model path - """ - super().__init__(*args, **kwargs) - - @type_assert(object, (str, tuple, Dict)) - def __call__(self, data: Union[str, tuple, Dict]) -> Dict[str, Any]: - return data diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py index d2fd5369..a56ae00e 100644 --- a/tests/pipelines/test_CodeGeeX_code_translation.py +++ b/tests/pipelines/test_CodeGeeX_code_translation.py @@ -2,9 +2,7 @@ import os import unittest -from modelscope.models import Model from modelscope.pipelines import pipeline -from modelscope.preprocessors import CodeGeeXPreprocessor from modelscope.utils.constant import Tasks from modelscope.utils.demo_utils import DemoCompatibilityCheck from modelscope.utils.test_utils import test_level @@ -19,11 +17,9 @@ class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_CodeGeeX_with_name(self): model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' - preprocessor = CodeGeeXPreprocessor() pipe = pipeline( task=Tasks.code_translation, - model=model, - preprocessor=preprocessor, + model=model ) inputs = { 'prompt': 'for i in range(10):\n\tprint(i)\n', From 1ab8a1f764b33b7be174619520af2a2f8958ffbe Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Thu, 24 Nov 2022 11:20:25 +0800 Subject: [PATCH 3/6] updated --- .../models/nlp/codegeex/codegeex_for_code_translation.py | 4 +++- modelscope/models/nlp/codegeex/inference.py | 5 +++-- modelscope/models/nlp/codegeex/tokenizer.py | 3 ++- .../pipelines/nlp/codegeex_code_translation_pipeline.py | 4 ++-- tests/pipelines/test_CodeGeeX_code_translation.py | 5 +---- 5 files changed, 11 insertions(+), 10 
deletions(-) diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py index be3e79f0..fece907d 100755 --- a/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_translation.py @@ -98,7 +98,9 @@ class CodeGeeXForCodeTranslation(TorchModel): generated_code = tokenizer.decode_code( generated_tokens_[n_token_prompt:]) generated_code = ''.join(generated_code) - logger.info('================================= Generated code:') + logger.info( + '================================= Generated code:' + ) logger.info(generated_code) if all(is_finished): break diff --git a/modelscope/models/nlp/codegeex/inference.py b/modelscope/models/nlp/codegeex/inference.py index d058f023..38f14d6c 100755 --- a/modelscope/models/nlp/codegeex/inference.py +++ b/modelscope/models/nlp/codegeex/inference.py @@ -1,8 +1,9 @@ # Copyright (c) 2022 Zhipu.AI +from typing import List + import torch import torch.nn.functional as F -from typing import List def get_ltor_masks_and_position_ids( @@ -124,7 +125,7 @@ def pad_batch(batch, pad_id, seq_length): tokens.extend([pad_id] * (seq_length - context_length)) context_lengths.append(context_length) return batch, context_lengths - + def get_token_stream( model, diff --git a/modelscope/models/nlp/codegeex/tokenizer.py b/modelscope/models/nlp/codegeex/tokenizer.py index cc507eb6..a5da9a3c 100755 --- a/modelscope/models/nlp/codegeex/tokenizer.py +++ b/modelscope/models/nlp/codegeex/tokenizer.py @@ -1,8 +1,9 @@ # Copyright (c) 2022 Zhipu.AI +from typing import List, Union + import torch from transformers import AutoTokenizer from transformers.models.gpt2 import GPT2TokenizerFast -from typing import List, Union def encode_whitespaces(text, start_extra_id: int, max_len: int): diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index f2bce381..ef0f29e0 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -28,9 +28,9 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): self.model.cuda() super().__init__(model=model, **kwargs) - + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: - return inputs + return inputs # define the forward pass def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py index a56ae00e..0972c494 100644 --- a/tests/pipelines/test_CodeGeeX_code_translation.py +++ b/tests/pipelines/test_CodeGeeX_code_translation.py @@ -17,10 +17,7 @@ class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') def test_run_with_CodeGeeX_with_name(self): model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' - pipe = pipeline( - task=Tasks.code_translation, - model=model - ) + pipe = pipeline(task=Tasks.code_translation, model=model) inputs = { 'prompt': 'for i in range(10):\n\tprint(i)\n', 'source language': 'Python', From 65adde14d8b2f6e13cc44983b439e319d0a7cf66 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Fri, 25 Nov 2022 11:55:53 +0800 Subject: [PATCH 4/6] remove uttest --- .../test_CodeGeeX_code_translation.py | 31 ------------------- 1 file changed, 31 deletions(-) delete mode 100644 
tests/pipelines/test_CodeGeeX_code_translation.py diff --git a/tests/pipelines/test_CodeGeeX_code_translation.py b/tests/pipelines/test_CodeGeeX_code_translation.py deleted file mode 100644 index 0972c494..00000000 --- a/tests/pipelines/test_CodeGeeX_code_translation.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -import unittest - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks -from modelscope.utils.demo_utils import DemoCompatibilityCheck -from modelscope.utils.test_utils import test_level - - -class CodeGeeXCodeTranslationTest(unittest.TestCase, DemoCompatibilityCheck): - - def setUp(self) -> None: - self.output_dir = 'unittest_output' - os.makedirs(self.output_dir, exist_ok=True) - - @unittest.skipUnless(test_level() >= 0, 'skip test in current test level') - def test_run_with_CodeGeeX_with_name(self): - model = 'ZhipuAI/CodeGeeX-Code-Translation-13B' - pipe = pipeline(task=Tasks.code_translation, model=model) - inputs = { - 'prompt': 'for i in range(10):\n\tprint(i)\n', - 'source language': 'Python', - 'target language': 'C++' - } - result = pipe(inputs) - print(result) - - -if __name__ == '__main__': - unittest.main() From c9064caa58d7e207834478423a66bf82025e23e0 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Fri, 25 Nov 2022 16:35:19 +0800 Subject: [PATCH 5/6] add code_generation --- modelscope/metainfo.py | 2 +- modelscope/models/nlp/__init__.py | 4 +- modelscope/models/nlp/codegeex/__init__.py | 2 + .../codegeex/codegeex_for_code_generation.py | 111 ++++++++++++++++++ modelscope/pipelines/nlp/__init__.py | 3 + .../nlp/codegeex_code_generation_pipeline.py | 48 ++++++++ .../nlp/codegeex_code_translation_pipeline.py | 6 + modelscope/utils/constant.py | 1 + 8 files changed, 174 insertions(+), 3 deletions(-) create mode 100755 modelscope/models/nlp/codegeex/codegeex_for_code_generation.py create mode 100755 modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 99f4a047..c74eaeb2 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -257,6 +257,7 @@ class Pipelines(object): feature_extraction = 'feature-extraction' mglm_text_summarization = 'mglm-text-summarization' codegeex_code_translation = 'codegeex-code-translation' + codegeex_code_generation = 'codegeex-code-generation' translation_en_to_de = 'translation_en_to_de' # keep it underscore translation_en_to_ro = 'translation_en_to_ro' # keep it underscore translation_en_to_fr = 'translation_en_to_fr' # keep it underscore @@ -384,7 +385,6 @@ class Preprocessors(object): document_segmentation = 'document-segmentation' feature_extraction = 'feature-extraction' mglm_summarization = 'mglm-summarization' - codegeex = 'codegeex' sentence_piece = 'sentence-piece' # audio preprocessor diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 3f9d224c..5f8b88f9 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -36,7 +36,7 @@ if TYPE_CHECKING: ) from .T5 import T5ForConditionalGeneration from .mglm import MGLMForTextSummarization - from .codegeex import CodeGeeXForCodeTranslation + from .codegeex import CodeGeeXForCodeTranslation, CodeGeeXForCodeGeneration from .task_models import ( FeatureExtractionModel, InformationExtractionModel, @@ -109,7 +109,7 @@ else: 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], 'mglm': ['MGLMForTextSummarization'], - 'codegeex': 
['CodeGeeXForCodeTranslation'], + 'codegeex': ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/codegeex/__init__.py b/modelscope/models/nlp/codegeex/__init__.py index 08add0b0..0bcdb4bc 100755 --- a/modelscope/models/nlp/codegeex/__init__.py +++ b/modelscope/models/nlp/codegeex/__init__.py @@ -6,9 +6,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: from .codegeex_for_code_translation import CodeGeeXForCodeTranslation + from .codegeex_for_code_generation import CodeGeeXForCodeGeneration else: _import_structure = { 'codegeex_for_code_translation': ['CodeGeeXForCodeTranslation'], + 'codegeex_for_code_generation': ['CodeGeeXForCodeGeneration'], } import sys diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py new file mode 100755 index 00000000..dbe6d4a4 --- /dev/null +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 Zhipu.AI +import copy +from typing import Any, Dict + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger +from .codegeex import CodeGeeXModel +from .inference import get_token_stream +from .tokenizer import CodeGeeXTokenizer + + +def model_provider(): + """Build the model.""" + + hidden_size = 5120 + num_attention_heads = 40 + num_layers = 39 + padded_vocab_size = 52224 + max_position_embeddings = 2048 + + model = CodeGeeXModel(hidden_size, num_layers, num_attention_heads, + padded_vocab_size, max_position_embeddings) + + return model + + +@MODELS.register_module(Tasks.code_generation, module_name=Models.codegeex) +class CodeGeeXForCodeGeneration(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + """initialize the fast poem model from the `model_dir` path. + + Args: + model_dir (str): the model path. 
+ """ + super().__init__(model_dir, *args, **kwargs) + logger = get_logger() + # loading tokenizer + logger.info('Loading tokenizer ...') + self.tokenizer = CodeGeeXTokenizer( + tokenizer_path=model_dir + '/tokenizer', mode='codegeex-13b') + # loading model + state_dict_path = model_dir + '/ckpt_ms_213000_fp32_52224.pt' + logger.info('Loading state dict ...') + state_dict = torch.load(state_dict_path, map_location='cpu') + state_dict = state_dict['module'] + + logger.info('Building CodeGeeX model ...') + self.model = model_provider() + self.model.load_state_dict(state_dict) + self.model.eval() + self.model.half() + self.model.cuda() + + def forward(self, input: Dict[str, str]) -> Dict[str, str]: + micro_batch_size = 1 + seq_length = 2048 + out_seq_length = 256 + bad_ids = None + lang = input['language'] + prompt = input['prompt'] + prompt = f"# language: {lang}\n{prompt}" + logger = get_logger() + tokenizer = self.tokenizer + model = self.model + for prompt in [prompt]: + tokens = tokenizer.encode_code(prompt) + n_token_prompt = len(tokens) + token_stream = get_token_stream( + model, + tokenizer, + seq_length, + out_seq_length, + [copy.deepcopy(tokens) for _ in range(micro_batch_size)], + micro_batch_size=micro_batch_size, + bad_ids=bad_ids, + topk=1, + topp=0.9, + temperature=0.9, + greedy=True + ) + is_finished = [False for _ in range(micro_batch_size)] + for i, generated in enumerate(token_stream): + generated_tokens = generated[0] + for j in range(micro_batch_size): + if is_finished[j]: + continue + if generated_tokens[j].cpu().numpy( + )[-1] == tokenizer.eos_token_id or len( + generated_tokens[j]) >= out_seq_length: + is_finished[j] = True + generated_tokens_ = generated_tokens[j].cpu().numpy( + ).tolist() + generated_code = tokenizer.decode_code( + generated_tokens_[n_token_prompt:]) + generated_code = ''.join(generated_code) + logger.info( + '================================= Generated code:' + ) + logger.info(generated_code) + if all(is_finished): + break + + logger.info('Generation finished.') + return {OutputKeys.TEXT: generated_code} diff --git a/modelscope/pipelines/nlp/__init__.py b/modelscope/pipelines/nlp/__init__.py index 3ffe7b93..cbea8436 100644 --- a/modelscope/pipelines/nlp/__init__.py +++ b/modelscope/pipelines/nlp/__init__.py @@ -33,6 +33,7 @@ if TYPE_CHECKING: from .zero_shot_classification_pipeline import ZeroShotClassificationPipeline from .mglm_text_summarization_pipeline import MGLMTextSummarizationPipeline from .codegeex_code_translation_pipeline import CodeGeeXCodeTranslationPipeline + from .codegeex_code_generation_pipeline import CodeGeeXCodeGenerationPipeline from .multilingual_word_segmentation_pipeline import MultilingualWordSegmentationPipeline, \ WordSegmentationThaiPipeline @@ -76,6 +77,8 @@ else: 'mglm_text_summarization_pipeline': ['MGLMTextSummarizationPipeline'], 'codegeex_code_translation_pipeline': ['CodeGeeXCodeTranslationPipeline'], + 'codegeex_code_generation_pipeline': + ['CodeGeeXCodeGenerationPipeline'], 'multilingual_word_segmentation_pipeline': [ 'MultilingualWordSegmentationPipeline', 'WordSegmentationThaiPipeline' diff --git a/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py new file mode 100755 index 00000000..2eaebca3 --- /dev/null +++ b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 Zhipu.AI + +from typing import Any, Dict, Union + +from modelscope.metainfo import Pipelines +from modelscope.models.nlp 
import CodeGeeXForCodeGeneration +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import Preprocessor +from modelscope.utils.constant import Tasks + + +@PIPELINES.register_module( + group_key=Tasks.code_generation, + module_name=Pipelines.codegeex_code_generation) +class CodeGeeXCodeGenerationPipeline(Pipeline): + + def __init__(self, + model: Union[CodeGeeXForCodeGeneration, str], + preprocessor: [Preprocessor] = None, + *args, + **kwargs): + model = CodeGeeXForCodeGeneration(model) if isinstance(model, + str) else model + self.model = model + self.model.eval() + self.model.half() + self.model.cuda() + + super().__init__(model=model, **kwargs) + + def preprocess(self, inputs, **preprocess_params) -> Dict[str, Any]: + return inputs + + # define the forward pass + def forward(self, inputs: Union[Dict], **forward_params) -> Dict[str, Any]: + # check input format + for para in ['prompt', 'language']: + if para not in inputs: + raise Exception('Please check your input format.') + if inputs['language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa + raise Exception('Make sure the language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + + return self.model(inputs) + + # format the outputs from pipeline + def postprocess(self, input, **kwargs) -> Dict[str, Any]: + return input diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index ef0f29e0..61be5620 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -38,6 +38,12 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): for para in ['prompt', 'source language', 'target language']: if para not in inputs: raise Exception('please check your input format.') + if inputs['source language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa + raise Exception('Make sure the source language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + + if inputs['target language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa + raise Exception('Make sure the target language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + return self.model(inputs) # format the outputs from pipeline diff --git a/modelscope/utils/constant.py b/modelscope/utils/constant.py index bf3f8fb9..6cd7a571 100644 --- a/modelscope/utils/constant.py +++ b/modelscope/utils/constant.py @@ -121,6 +121,7 @@ class NLPTasks(object): text_summarization = 'text-summarization' question_answering 
= 'question-answering' code_translation = 'code-translation' + code_generation = 'code-generation' zero_shot_classification = 'zero-shot-classification' backbone = 'backbone' text_error_correction = 'text-error-correction' From 028551cd62ee57c081c637dc32cc6a0a6e356dd2 Mon Sep 17 00:00:00 2001 From: shuaigezhu Date: Fri, 25 Nov 2022 16:41:44 +0800 Subject: [PATCH 6/6] add code_generation files --- modelscope/models/nlp/__init__.py | 3 ++- .../codegeex/codegeex_for_code_generation.py | 5 ++--- .../nlp/codegeex_code_generation_pipeline.py | 13 ++++++++--- .../nlp/codegeex_code_translation_pipeline.py | 22 +++++++++++++++---- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/modelscope/models/nlp/__init__.py b/modelscope/models/nlp/__init__.py index 5f8b88f9..3d4f8c7d 100644 --- a/modelscope/models/nlp/__init__.py +++ b/modelscope/models/nlp/__init__.py @@ -109,7 +109,8 @@ else: 'sentence_embedding': ['SentenceEmbedding'], 'T5': ['T5ForConditionalGeneration'], 'mglm': ['MGLMForTextSummarization'], - 'codegeex': ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'], + 'codegeex': + ['CodeGeeXForCodeTranslation', 'CodeGeeXForCodeGeneration'], 'gpt_neo': ['GPTNeoModel'], 'bloom': ['BloomModel'], } diff --git a/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py index dbe6d4a4..ff191cba 100755 --- a/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py +++ b/modelscope/models/nlp/codegeex/codegeex_for_code_generation.py @@ -65,7 +65,7 @@ class CodeGeeXForCodeGeneration(TorchModel): bad_ids = None lang = input['language'] prompt = input['prompt'] - prompt = f"# language: {lang}\n{prompt}" + prompt = f'# language: {lang}\n{prompt}' logger = get_logger() tokenizer = self.tokenizer model = self.model @@ -83,8 +83,7 @@ class CodeGeeXForCodeGeneration(TorchModel): topk=1, topp=0.9, temperature=0.9, - greedy=True - ) + greedy=True) is_finished = [False for _ in range(micro_batch_size)] for i, generated in enumerate(token_stream): generated_tokens = generated[0] diff --git a/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py index 2eaebca3..f23461b1 100755 --- a/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_generation_pipeline.py @@ -21,7 +21,7 @@ class CodeGeeXCodeGenerationPipeline(Pipeline): *args, **kwargs): model = CodeGeeXForCodeGeneration(model) if isinstance(model, - str) else model + str) else model self.model = model self.model.eval() self.model.half() @@ -38,8 +38,15 @@ class CodeGeeXCodeGenerationPipeline(Pipeline): for para in ['prompt', 'language']: if para not in inputs: raise Exception('Please check your input format.') - if inputs['language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa - raise Exception('Make sure the language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + if inputs['language'] not in [ + 'C++', 'C', 'C#', 'Cuda', 'Objective-C', 'Objective-C++', + 'Python', 'Java', 'Scala', 'TeX', 'HTML', 'PHP', 'JavaScript', + 'TypeScript', 'Go', 'Shell', 'Rust', 'CSS', 'SQL', 'Kotlin', + 'Pascal', 'R', 'Fortran', 'Lean' + ]: # 
noqa + raise Exception( + 'Make sure the language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]' # noqa + ) # noqa return self.model(inputs) diff --git a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py index 61be5620..8bd5a6da 100755 --- a/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py +++ b/modelscope/pipelines/nlp/codegeex_code_translation_pipeline.py @@ -38,11 +38,25 @@ class CodeGeeXCodeTranslationPipeline(Pipeline): for para in ['prompt', 'source language', 'target language']: if para not in inputs: raise Exception('please check your input format.') - if inputs['source language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa - raise Exception('Make sure the source language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + if inputs['source language'] not in [ + 'C++', 'C', 'C#', 'Cuda', 'Objective-C', 'Objective-C++', + 'Python', 'Java', 'Scala', 'TeX', 'HTML', 'PHP', 'JavaScript', + 'TypeScript', 'Go', 'Shell', 'Rust', 'CSS', 'SQL', 'Kotlin', + 'Pascal', 'R', 'Fortran', 'Lean' + ]: + raise Exception( + 'Make sure the source language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]' # noqa + ) # noqa - if inputs['target language'] not in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]: # noqa - raise Exception('Make sure the target language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]') # noqa + if inputs['target language'] not in [ + 'C++', 'C', 'C#', 'Cuda', 'Objective-C', 'Objective-C++', + 'Python', 'Java', 'Scala', 'TeX', 'HTML', 'PHP', 'JavaScript', + 'TypeScript', 'Go', 'Shell', 'Rust', 'CSS', 'SQL', 'Kotlin', + 'Pascal', 'R', 'Fortran', 'Lean' + ]: + raise Exception( + 'Make sure the target language is in ["C++","C","C#","Cuda","Objective-C","Objective-C++","Python","Java","Scala","TeX","HTML","PHP","JavaScript","TypeScript","Go","Shell","Rust","CSS","SQL","Kotlin","Pascal","R","Fortran","Lean"]' # noqa + ) # noqa return self.model(inputs)
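
A minimal usage sketch of the two pipelines added by this series, assuming the patches apply on top of modelscope and a CUDA device is available (both pipelines move the model to GPU in half precision). The code-translation model id below is the one used by the removed unit test; the code-generation model id is a placeholder, not taken from this series, and should be replaced with the published CodeGeeX generation checkpoint.

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Code translation: translate a Python snippet to C++.
    # Input keys match the checks in CodeGeeXCodeTranslationPipeline.forward.
    translator = pipeline(
        task=Tasks.code_translation,
        model='ZhipuAI/CodeGeeX-Code-Translation-13B')
    print(translator({
        'prompt': 'for i in range(10):\n\tprint(i)\n',
        'source language': 'Python',
        'target language': 'C++',
    }))

    # Code generation: complete a prompt in a chosen language.
    # NOTE: placeholder model id -- substitute the actual CodeGeeX generation model.
    # The model itself prepends the '# language: <lang>' header, so the prompt
    # should contain only the code/comment to complete.
    generator = pipeline(
        task=Tasks.code_generation,
        model='ZhipuAI/CodeGeeX-Code-Generation-13B')
    print(generator({
        'prompt': '# write a bubble sort function\n',
        'language': 'Python',
    }))

Both pipelines return a dict keyed by OutputKeys.TEXT containing the generated code, so the printed result can also be unpacked with result[OutputKeys.TEXT].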