Mirror of https://github.com/modelscope/modelscope.git (synced 2026-02-25 04:30:48 +01:00)

Commit: Merge branch 'master' of gitlab.alibaba-inc.com:Ali-MaaS/MaaS-lib into release/1.0
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4fd6fa6b23c2fdaf876606a767d9b64b1924e1acddfc06ac42db73ba86083280
-size 119940
+oid sha256:4eae921001139d7e3c06331c9ef2213f8fc1c23512acd95751559866fb770e96
+size 121855

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4d37672a0e299a08d2daf5c7fc29bfce96bb15701fe5e5e68f068861ac2ee705
-size 119619
+oid sha256:f97d34d7450d17d0a93647129ab10d16b1f6e70c34a73b6f7687b79519ee4f71
+size 121563

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c692e0753cfe349e520511427727a8252f141fa10e85f9a61562845e8d731f9a
-size 119619
+oid sha256:a8355f27a3235209f206b5e75f4400353e5989e94cf4d71270b42ded8821d536
+size 121563

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2bce1341f4b55d536771dad6e2b280458579f46c3216474ceb8a926022ab53d0
-size 151572
+oid sha256:344ef971bdf310b76c6571d1f4994ab6abc5edc659654d71a4f75b14a30960c2
+size 152926

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6af5024a26337a440c7ea2935fce84af558dd982ee97a2f027bb922cc874292b
-size 61741
+oid sha256:f0aeb07b6c9b40a0cfa7492e839431764e9bece93c906833a07c05e83520a399
+size 63161

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bbce084781342ca7274c2e4d02ed5c5de43ba213a3b76328d5994404d6544c41
-size 61745
+oid sha256:7aa5c7a2565ccf0d2eea4baf8adbd0e020dbe36a7159b31156c53141cc9b2df2
+size 63165

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c
-size 61239
+oid sha256:cc6de82a8485fbfa008f6c2d5411cd07ba03e4a780bcb4e67efc6fba3c6ce92f
+size 63597

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1
-size 61115
+oid sha256:7d98ac11a4e9e2744a7402a5cc912da991a41938bbc5dd60f15ee5c6b3196030
+size 63349

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9e3ecc2c30d382641d561f84849b199c12bb1a9418e8099a191153f6f5275a85
-size 61589
+oid sha256:01f9b9bf6f8bbf9bb377d4cb6f399b2e5e065381f5b7332343e0db7b4fae72a5
+size 62519
@@ -19,10 +19,13 @@ class Exporter(ABC):
def from_model(cls, model: Model, **kwargs):
"""Build the Exporter instance.

@param model: A model instance. it will be used to output the generated file,
Args:
model: A Model instance. it will be used to generate the intermediate format file,
and the configuration.json in its model_dir field will be used to create the exporter instance.
@param kwargs: Extra kwargs used to create the Exporter instance.
@return: The Exporter instance
kwargs: Extra kwargs used to create the Exporter instance.

Returns:
The Exporter instance
"""
cfg = Config.from_file(
os.path.join(model.model_dir, ModelFile.CONFIGURATION))
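The hunk above only changes the docstring style, but it documents how an exporter is built: from_model reads configuration.json out of the model's model_dir. A hedged usage sketch, not taken from this patch — the model id and export directory are placeholders, and the import path for Exporter is assumed:

# Hypothetical usage sketch for Exporter.from_model (not part of the diff).
from modelscope.models import Model
from modelscope.exporters import Exporter   # assumed import path

model = Model.from_pretrained('damo/some-model')        # placeholder model id
exporter = Exporter.from_model(model)                    # reads configuration.json from model.model_dir
files = exporter.export_onnx('/tmp/export')              # returns a dict of generated-file names to paths
print(files)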
@@ -44,10 +47,13 @@ class Exporter(ABC):
In some cases, several files may be generated,
So please return a dict which contains the generated name with the file path.

@param opset: The version of the ONNX operator set to use.
@param outputs: The output dir.
@param kwargs: In this default implementation,
kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape).
@return: A dict contains the model name with the model file path.
Args:
opset: The version of the ONNX operator set to use.
outputs: The output dir.
kwargs: In this default implementation,
kwargs will be carried to generate_dummy_inputs as extra arguments (like input shape).

Returns:
A dict contains the model name with the model file path.
"""
pass
@@ -27,11 +27,14 @@ class SbertForSequenceClassificationExporter(TorchModelExporter):
**kwargs) -> Dict[str, Any]:
"""Generate dummy inputs for model exportation to onnx or other formats by tracing.

@param shape: A tuple of input shape which should have at most two dimensions.
shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor.
shape = (8, 128) batch_size=8, sequence_length=128, which will override the config of the preprocessor.
@param pair: Generate sentence pairs or single sentences for dummy inputs.
@return: Dummy inputs.
Args:
shape: A tuple of input shape which should have at most two dimensions.
shape = (1, ) batch_size=1, sequence_length will be taken from the preprocessor.
shape = (8, 128) batch_size=8, sequence_length=128, which will override the config of the preprocessor.
pair(bool, `optional`): Whether to generate sentence pairs or single sentences.

Returns:
Dummy inputs.
"""

cfg = Config.from_file(
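As an illustration of the shape semantics documented above (not part of the patch; the model id is a placeholder and the exporter class is used as defined in the hunk):

# Hypothetical sketch: explicit dummy-input shapes for the SBERT exporter.
from modelscope.models import Model

model = Model.from_pretrained('damo/some-structbert-model')          # placeholder id
exporter = SbertForSequenceClassificationExporter.from_model(model)
dummy = exporter.generate_dummy_inputs(shape=(8, 128), pair=True)    # batch_size=8, sequence_length=128
print({k: tuple(v.shape) for k, v in dummy.items()})                 # e.g. input_ids / attention_mask shapes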
@@ -13,8 +13,8 @@ from modelscope.models import TorchModel
from modelscope.pipelines.base import collate_fn
from modelscope.utils.constant import ModelFile
from modelscope.utils.logger import get_logger
from modelscope.utils.regress_test_utils import compare_arguments_nested
from modelscope.utils.tensor_utils import torch_nested_numpify
from modelscope.utils.regress_test_utils import (compare_arguments_nested,
numpify_tensor_nested)
from .base import Exporter

logger = get_logger(__name__)
@@ -28,49 +28,61 @@ class TorchModelExporter(Exporter):
and to provide implementations for generate_dummy_inputs/inputs/outputs methods.
"""

def export_onnx(self, outputs: str, opset=11, **kwargs):
def export_onnx(self, output_dir: str, opset=13, **kwargs):
"""Export the model as onnx format files.

In some cases, several files may be generated,
So please return a dict which contains the generated name with the file path.

@param opset: The version of the ONNX operator set to use.
@param outputs: The output dir.
@param kwargs: In this default implementation,
you can pass the arguments needed by _torch_export_onnx, other unrecognized args
will be carried to generate_dummy_inputs as extra arguments (such as input shape).
@return: A dict containing the model key - model file path pairs.
Args:
opset: The version of the ONNX operator set to use.
output_dir: The output dir.
kwargs:
model: A model instance which will replace the exporting of self.model.
In this default implementation,
you can pass the arguments needed by _torch_export_onnx, other unrecognized args
will be carried to generate_dummy_inputs as extra arguments (such as input shape).

Returns:
A dict containing the model key - model file path pairs.
"""
model = self.model
model = self.model if 'model' not in kwargs else kwargs.pop('model')
if not isinstance(model, nn.Module) and hasattr(model, 'model'):
model = model.model
onnx_file = os.path.join(outputs, ModelFile.ONNX_MODEL_FILE)
onnx_file = os.path.join(output_dir, ModelFile.ONNX_MODEL_FILE)
self._torch_export_onnx(model, onnx_file, opset=opset, **kwargs)
return {'model': onnx_file}

def export_torch_script(self, outputs: str, **kwargs):
def export_torch_script(self, output_dir: str, **kwargs):
"""Export the model as torch script files.

In some cases, several files may be generated,
So please return a dict which contains the generated name with the file path.

@param outputs: The output dir.
@param kwargs: In this default implementation,
Args:
output_dir: The output dir.
kwargs:
model: A model instance which will replace the exporting of self.model.
In this default implementation,
you can pass the arguments needed by _torch_export_torch_script, other unrecognized args
will be carried to generate_dummy_inputs as extra arguments (like input shape).
@return: A dict contains the model name with the model file path.

Returns:
A dict contains the model name with the model file path.
"""
model = self.model
model = self.model if 'model' not in kwargs else kwargs.pop('model')
if not isinstance(model, nn.Module) and hasattr(model, 'model'):
model = model.model
ts_file = os.path.join(outputs, ModelFile.TS_MODEL_FILE)
ts_file = os.path.join(output_dir, ModelFile.TS_MODEL_FILE)
# generate ts by tracing
self._torch_export_torch_script(model, ts_file, **kwargs)
return {'model': ts_file}

def generate_dummy_inputs(self, **kwargs) -> Dict[str, Any]:
"""Generate dummy inputs for model exportation to onnx or other formats by tracing.
@return: Dummy inputs.

Returns:
Dummy inputs.
"""
return None
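The new kwargs allow callers to swap in the module to export and the traced inputs instead of relying on self.model and generate_dummy_inputs. A hedged sketch of that override path — the model id, directory, and input structure are placeholders, and the TorchModelExporter import path is assumed:

# Hypothetical sketch: using the override kwargs added above (model=..., dummy_inputs=...).
import torch
from modelscope.exporters import TorchModelExporter   # assumed import path
from modelscope.models import Model

model = Model.from_pretrained('damo/some-model')       # placeholder model id
exporter = TorchModelExporter.from_model(model)
dummy = {'input_ids': torch.ones(1, 16, dtype=torch.long)}   # assumed input structure

exporter.export_onnx('/tmp/export', opset=13, model=model.model)          # export the unwrapped nn.Module
exporter.export_torch_script('/tmp/export', dummy_inputs=dummy, strict=False)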
@@ -93,7 +105,7 @@ class TorchModelExporter(Exporter):
def _torch_export_onnx(self,
model: nn.Module,
output: str,
opset: int = 11,
opset: int = 13,
device: str = 'cpu',
validation: bool = True,
rtol: float = None,
@@ -101,18 +113,27 @@ class TorchModelExporter(Exporter):
**kwargs):
"""Export the model to an onnx format file.

@param model: A torch.nn.Module instance to export.
@param output: The output file.
@param opset: The version of the ONNX operator set to use.
@param device: The device used to forward.
@param validation: Whether validate the export file.
@param rtol: The rtol used to regress the outputs.
@param atol: The atol used to regress the outputs.
Args:
model: A torch.nn.Module instance to export.
output: The output file.
opset: The version of the ONNX operator set to use.
device: The device used to forward.
validation: Whether validate the export file.
rtol: The rtol used to regress the outputs.
atol: The atol used to regress the outputs.
kwargs:
dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs().
inputs: An inputs structure which will replace the calling of self.inputs.
outputs: An outputs structure which will replace the calling of self.outputs.
"""

dummy_inputs = self.generate_dummy_inputs(**kwargs)
inputs = self.inputs
outputs = self.outputs
dummy_inputs = self.generate_dummy_inputs(
**kwargs) if 'dummy_inputs' not in kwargs else kwargs.pop(
'dummy_inputs')
inputs = self.inputs if 'inputs' not in kwargs else kwargs.pop(
'inputs')
outputs = self.outputs if 'outputs' not in kwargs else kwargs.pop(
'outputs')
if dummy_inputs is None or inputs is None or outputs is None:
raise NotImplementedError(
'Model property dummy_inputs,inputs,outputs must be set.')
@@ -125,7 +146,7 @@ class TorchModelExporter(Exporter):

if isinstance(dummy_inputs, Mapping):
dummy_inputs = dict(dummy_inputs)
onnx_outputs = list(self.outputs.keys())
onnx_outputs = list(outputs.keys())

with replace_call():
onnx_export(
@@ -160,11 +181,13 @@ class TorchModelExporter(Exporter):
outputs_origin = model.forward(
*_decide_input_format(model, dummy_inputs))
if isinstance(outputs_origin, Mapping):
outputs_origin = torch_nested_numpify(
outputs_origin = numpify_tensor_nested(
list(outputs_origin.values()))
elif isinstance(outputs_origin, (tuple, list)):
outputs_origin = numpify_tensor_nested(outputs_origin)
outputs = ort_session.run(
onnx_outputs,
torch_nested_numpify(dummy_inputs),
numpify_tensor_nested(dummy_inputs),
)

tols = {}
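The validation path above runs the original module and the exported ONNX graph on the same dummy inputs and compares the numpified outputs within rtol/atol. A minimal standalone sketch of that idea using onnxruntime and numpy directly (names and tolerances are placeholders, not taken from the patch):

# Hypothetical sketch of the validation step: compare eager outputs with onnxruntime outputs.
import numpy as np
import onnxruntime as ort
import torch

def validate_onnx(module, onnx_file, dummy_inputs, rtol=1e-4, atol=1e-5):
    module.eval()
    with torch.no_grad():
        eager = module(**dummy_inputs)                       # assumed dict-style forward
    sess = ort.InferenceSession(onnx_file)
    feeds = {k: v.numpy() for k, v in dummy_inputs.items()}
    onnx_out = sess.run(None, feeds)
    eager_list = [v.numpy() for v in (eager.values() if isinstance(eager, dict) else eager)]
    return all(np.allclose(a, b, rtol=rtol, atol=atol) for a, b in zip(eager_list, onnx_out))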
@@ -184,19 +207,26 @@ class TorchModelExporter(Exporter):
validation: bool = True,
rtol: float = None,
atol: float = None,
strict: bool = True,
**kwargs):
"""Export the model to a torch script file.

@param model: A torch.nn.Module instance to export.
@param output: The output file.
@param device: The device used to forward.
@param validation: Whether validate the export file.
@param rtol: The rtol used to regress the outputs.
@param atol: The atol used to regress the outputs.
Args:
model: A torch.nn.Module instance to export.
output: The output file.
device: The device used to forward.
validation: Whether validate the export file.
rtol: The rtol used to regress the outputs.
atol: The atol used to regress the outputs.
strict: strict mode in torch script tracing.
kwargs:
dummy_inputs: A dummy inputs which will replace the calling of self.generate_dummy_inputs().
"""

model.eval()
dummy_inputs = self.generate_dummy_inputs(**kwargs)
dummy_param = 'dummy_inputs' not in kwargs
dummy_inputs = self.generate_dummy_inputs(
**kwargs) if dummy_param else kwargs.pop('dummy_inputs')
if dummy_inputs is None:
raise NotImplementedError(
'Model property dummy_inputs must be set.')

@@ -207,7 +237,7 @@ class TorchModelExporter(Exporter):
model.eval()
with replace_call():
traced_model = torch.jit.trace(
model, dummy_inputs, strict=False)
model, dummy_inputs, strict=strict)
torch.jit.save(traced_model, output)

if validation:

@@ -216,9 +246,9 @@ class TorchModelExporter(Exporter):
model.eval()
ts_model.eval()
outputs = ts_model.forward(*dummy_inputs)
outputs = torch_nested_numpify(outputs)
outputs = numpify_tensor_nested(outputs)
outputs_origin = model.forward(*dummy_inputs)
outputs_origin = torch_nested_numpify(outputs_origin)
outputs_origin = numpify_tensor_nested(outputs_origin)
tols = {}
if rtol is not None:
tols['rtol'] = rtol
@@ -240,7 +270,6 @@ def replace_call():
problems. Here we recover the call method to the default implementation of torch.nn.Module, and change it
back after the tracing was done.
"""

TorchModel.call_origin, TorchModel.__call__ = TorchModel.__call__, TorchModel._call_impl
yield
TorchModel.__call__ = TorchModel.call_origin
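replace_call temporarily swaps TorchModel.__call__ for the stock nn.Module dispatch so tracing sees plain tensor calls, then restores it. A generic sketch of the same swap-and-restore pattern, independent of modelscope (names are illustrative); wrapping the yield in try/finally makes the restore robust even when the traced body raises:

# Hypothetical sketch of the swap-and-restore pattern used by replace_call().
from contextlib import contextmanager

@contextmanager
def patched_attr(obj, name, replacement):
    original = getattr(obj, name)
    setattr(obj, name, replacement)
    try:
        yield
    finally:
        setattr(obj, name, original)   # restore even if the body raises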
@@ -12,7 +12,6 @@ from http.cookiejar import CookieJar
from os.path import expanduser
from typing import List, Optional, Tuple, Union

import attrs
import requests

from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,

@@ -22,14 +21,9 @@ from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
API_RESPONSE_FIELD_USERNAME,
DEFAULT_CREDENTIALS_PATH, Licenses,
ModelVisibility)
from modelscope.hub.deploy import (DeleteServiceParameters,
DeployServiceParameters,
GetServiceParameters, ListServiceParameters,
ServiceParameters, ServiceResourceConfig,
Vendor)
from modelscope.hub.errors import (InvalidParameter, NotExistError,
NotLoginException, NotSupportError,
RequestError, datahub_raise_on_error,
NotLoginException, RequestError,
datahub_raise_on_error,
handle_http_post_error,
handle_http_response, is_ok, raise_on_error)
from modelscope.hub.git import GitCommandWrapper
@@ -312,169 +306,6 @@ class HubApi:
r.raise_for_status()
return None

def deploy_model(self, model_id: str, revision: str, instance_name: str,
resource: ServiceResourceConfig,
provider: ServiceParameters):
"""Deploy model to cloud, current we only support PAI EAS, this is asynchronous
call , please check instance status through the console or query the instance status.
At the same time, this call may take a long time.

Args:
model_id (str): The deployed model id
revision (str): The model revision
instance_name (str): The deployed model instance name.
resource (DeployResource): The resource information.
provider (CreateParameter): The cloud service provider parameter

Raises:
NotLoginException: To use this api, you need login first.
NotSupportError: Not supported platform.
RequestError: The server return error.

Returns:
InstanceInfo: The instance information.
"""
cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise NotLoginException(
'Token does not exist, please login first.')
if provider.vendor != Vendor.EAS:
raise NotSupportError(
'Not support vendor: %s ,only support EAS current.' %
(provider.vendor))
create_params = DeployServiceParameters(
instance_name=instance_name,
model_id=model_id,
revision=revision,
resource=resource,
provider=provider)
path = f'{self.endpoint}/api/v1/deployer/endpoint'
body = attrs.asdict(create_params)
r = requests.post(
path,
json=body,
cookies=cookies,
)
handle_http_response(r, logger, cookies, 'create_eas_instance')
if r.status_code >= HTTPStatus.OK and r.status_code < HTTPStatus.MULTIPLE_CHOICES:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None

def list_deployed_model_instances(self,
provider: ServiceParameters,
skip: int = 0,
limit: int = 100):
"""List deployed model instances.

Args:
provider (ListServiceParameter): The cloud service provider parameter,
for eas, need access_key_id and access_key_secret.
skip: start of the list, current not support.
limit: maximum number of instances return, current not support
Raises:
NotLoginException: To use this api, you need login first.
RequestError: The request is failed from server.

Returns:
List: List of instance information
"""
cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise NotLoginException(
'Token does not exist, please login first.')
params = ListServiceParameters(
provider=provider, skip=skip, limit=limit)
path = '%s/api/v1/deployer/endpoint?%s' % (self.endpoint,
params.to_query_str())
r = requests.get(path, cookies=cookies)
handle_http_response(r, logger, cookies, 'list_deployed_model')
if r.status_code == HTTPStatus.OK:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None

def get_deployed_model_instance(self, instance_name: str,
provider: ServiceParameters):
"""Query the specified instance information.

Args:
instance_name (str): The deployed instance name.
provider (GetParameter): The cloud provider information, for eas
need region(eg: ch-hangzhou), access_key_id and access_key_secret.

Raises:
NotLoginException: To use this api, you need login first.
RequestError: The request is failed from server.

Returns:
Dict: The request instance information
"""
cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise NotLoginException(
'Token does not exist, please login first.')
params = GetServiceParameters(provider=provider)
path = '%s/api/v1/deployer/endpoint/%s?%s' % (
self.endpoint, instance_name, params.to_query_str())
r = requests.get(path, cookies=cookies)
handle_http_response(r, logger, cookies, 'get_deployed_model')
if r.status_code == HTTPStatus.OK:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None

def delete_deployed_model_instance(self, instance_name: str,
provider: ServiceParameters):
"""Delete deployed model, this api send delete command and return, it will take
some to delete, please check through the cloud console.

Args:
instance_name (str): The instance name you want to delete.
provider (DeleteParameter): The cloud provider information, for eas
need region(eg: ch-hangzhou), access_key_id and access_key_secret.

Raises:
NotLoginException: To call this api, you need login first.
RequestError: The request is failed.

Returns:
Dict: The deleted instance information.
"""
cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise NotLoginException(
'Token does not exist, please login first.')
params = DeleteServiceParameters(provider=provider)
path = '%s/api/v1/deployer/endpoint/%s?%s' % (
self.endpoint, instance_name, params.to_query_str())
r = requests.delete(path, cookies=cookies)
handle_http_response(r, logger, cookies, 'delete_deployed_model')
if r.status_code == HTTPStatus.OK:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None

def _check_cookie(self,
use_cookies: Union[bool,
CookieJar] = False) -> CookieJar:
@@ -1,11 +1,25 @@
import urllib
from abc import ABC, abstractmethod
from typing import Optional, Union
from abc import ABC
from http import HTTPStatus
from typing import Optional

import attrs
import json
from attr import fields
import requests
from attrs import asdict, define, field, validators

from modelscope.hub.api import ModelScopeConfig
from modelscope.hub.constants import (API_RESPONSE_FIELD_DATA,
API_RESPONSE_FIELD_MESSAGE)
from modelscope.hub.errors import (NotLoginException, NotSupportError,
RequestError, handle_http_response, is_ok)
from modelscope.hub.utils.utils import get_endpoint
from modelscope.utils.logger import get_logger

# yapf: enable

logger = get_logger()


class Accelerator(object):
CPU = 'cpu'
@@ -76,12 +90,12 @@ class ServiceResourceConfig(object):


@define
class ServiceParameters(ABC):
class ServiceProviderParameters(ABC):
pass


@define
class EASDeployParameters(ServiceParameters):
class EASDeployParameters(ServiceProviderParameters):
"""Parameters for EAS Deployment.

Args:

@@ -97,29 +111,10 @@ class EASDeployParameters(ServiceParameters):
resource_group: Optional[str] = None
vendor: str = field(
default=Vendor.EAS, validator=validators.in_([Vendor.EAS]))
"""
def __init__(self,
instance_name: str,
access_key_id: str,
access_key_secret: str,
region = EASRegion.beijing,
instance_type: str = EASCpuInstances.small,
accelerator: str = Accelerator.CPU,
resource_group: Optional[str] = None,
scaling: Optional[str] = None):
self.instance_name=instance_name
self.access_key_id=self.access_key_id
self.access_key_secret = access_key_secret
self.region = region
self.instance_type = instance_type
self.accelerator = accelerator
self.resource_group = resource_group
self.scaling = scaling
"""


@define
class EASListParameters(ServiceParameters):
class EASListParameters(ServiceProviderParameters):
"""EAS instance list parameters.

Args:

@@ -152,7 +147,7 @@ class DeployServiceParameters(object):
model_id: str
revision: str
resource: ServiceResourceConfig
provider: ServiceParameters
provider: ServiceProviderParameters


class AttrsToQueryString(ABC):
@@ -174,16 +169,173 @@ class AttrsToQueryString(ABC):

@define
class ListServiceParameters(AttrsToQueryString):
provider: ServiceParameters
provider: ServiceProviderParameters
skip: int = 0
limit: int = 100


@define
class GetServiceParameters(AttrsToQueryString):
provider: ServiceParameters
provider: ServiceProviderParameters


@define
class DeleteServiceParameters(AttrsToQueryString):
provider: ServiceParameters
provider: ServiceProviderParameters


class ServiceDeployer(object):

def __init__(self, endpoint=None):
self.endpoint = endpoint if endpoint is not None else get_endpoint()
self.cookies = ModelScopeConfig.get_cookies()
if self.cookies is None:
raise NotLoginException(
'Token does not exist, please login with HubApi first.')

# deploy_model
def create(self, model_id: str, revision: str, instance_name: str,
resource: ServiceResourceConfig,
provider: ServiceProviderParameters):
"""Deploy model to cloud, current we only support PAI EAS, this is an async API ,
and the deployment could take a while to finish remotely. Please check deploy instance
status separately via checking the status.

Args:
model_id (str): The deployed model id
revision (str): The model revision
instance_name (str): The deployed model instance name.
resource (ServiceResourceConfig): The service resource information.
provider (ServiceProviderParameters): The service provider parameter

Raises:
NotLoginException: To use this api, you need login first.
NotSupportError: Not supported platform.
RequestError: The server return error.

Returns:
ServiceInstanceInfo: The information of the deployed service instance.
"""
if provider.vendor != Vendor.EAS:
raise NotSupportError(
'Not support vendor: %s ,only support EAS current.' %
(provider.vendor))
create_params = DeployServiceParameters(
instance_name=instance_name,
model_id=model_id,
revision=revision,
resource=resource,
provider=provider)
path = f'{self.endpoint}/api/v1/deployer/endpoint'
body = attrs.asdict(create_params)
r = requests.post(
path,
json=body,
cookies=self.cookies,
)
handle_http_response(r, logger, self.cookies, 'create_service')
if r.status_code >= HTTPStatus.OK and r.status_code < HTTPStatus.MULTIPLE_CHOICES:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None

def get(self, instance_name: str, provider: ServiceProviderParameters):
"""Query the specified instance information.

Args:
instance_name (str): The deployed instance name.
provider (ServiceProviderParameters): The cloud provider information, for eas
need region(eg: ch-hangzhou), access_key_id and access_key_secret.

Raises:
NotLoginException: To use this api, you need login first.
RequestError: The request is failed from server.

Returns:
Dict: The information of the requested service instance.
"""
params = GetServiceParameters(provider=provider)
path = '%s/api/v1/deployer/endpoint/%s?%s' % (
self.endpoint, instance_name, params.to_query_str())
r = requests.get(path, cookies=self.cookies)
handle_http_response(r, logger, self.cookies, 'get_service')
if r.status_code == HTTPStatus.OK:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None

def delete(self, instance_name: str, provider: ServiceProviderParameters):
"""Delete deployed model, this api send delete command and return, it will take
some to delete, please check through the cloud console.

Args:
instance_name (str): The instance name you want to delete.
provider (ServiceProviderParameters): The cloud provider information, for eas
need region(eg: ch-hangzhou), access_key_id and access_key_secret.

Raises:
NotLoginException: To call this api, you need login first.
RequestError: The request is failed.

Returns:
Dict: The deleted instance information.
"""
params = DeleteServiceParameters(provider=provider)
path = '%s/api/v1/deployer/endpoint/%s?%s' % (
self.endpoint, instance_name, params.to_query_str())
r = requests.delete(path, cookies=self.cookies)
handle_http_response(r, logger, self.cookies, 'delete_service')
if r.status_code == HTTPStatus.OK:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None

def list(self,
provider: ServiceProviderParameters,
skip: int = 0,
limit: int = 100):
"""List deployed model instances.

Args:
provider (ServiceProviderParameters): The cloud service provider parameter,
for eas, need access_key_id and access_key_secret.
skip: start of the list, current not support.
limit: maximum number of instances return, current not support
Raises:
NotLoginException: To use this api, you need login first.
RequestError: The request is failed from server.

Returns:
List: List of instance information
"""

params = ListServiceParameters(
provider=provider, skip=skip, limit=limit)
path = '%s/api/v1/deployer/endpoint?%s' % (self.endpoint,
params.to_query_str())
r = requests.get(path, cookies=self.cookies)
handle_http_response(r, logger, self.cookies, 'list_service_instances')
if r.status_code == HTTPStatus.OK:
if is_ok(r.json()):
data = r.json()[API_RESPONSE_FIELD_DATA]
return data
else:
raise RequestError(r.json()[API_RESPONSE_FIELD_MESSAGE])
else:
r.raise_for_status()
return None
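The new ServiceDeployer class wraps the deploy endpoints that this commit removes from HubApi. A heavily hedged usage sketch: the credentials, region, model id, and the ServiceResourceConfig field names below are placeholders and assumptions, not taken from this patch.

# Hypothetical sketch: deploying a model to EAS through the new ServiceDeployer.
from modelscope.hub.deploy import (EASDeployParameters, ServiceDeployer,
                                   ServiceResourceConfig)

deployer = ServiceDeployer()                                   # requires a prior HubApi login (cookies)
provider = EASDeployParameters(access_key_id='<ak_id>',        # placeholder credentials
                               access_key_secret='<ak_secret>')
resource = ServiceResourceConfig(instance_type='ecs.c6.large', instance_count=1)   # assumed fields
info = deployer.create(model_id='damo/some-model', revision='v1.0.0',
                       instance_name='my-service', resource=resource, provider=provider)
print(deployer.get('my-service', provider=provider))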
@@ -69,7 +69,6 @@ class Models(object):
space_modeling = 'space-modeling'
space_T_en = 'space-T-en'
space_T_cn = 'space-T-cn'

tcrf = 'transformer-crf'
transformer_softmax = 'transformer-softmax'
lcrf = 'lstm-crf'

@@ -81,6 +80,7 @@ class Models(object):
bert_for_ds = 'bert-for-document-segmentation'
ponet = 'ponet'
T5 = 'T5'
bloom = 'bloom'

# audio models
sambert_hifigan = 'sambert-hifigan'

@@ -282,6 +282,7 @@ class Trainers(object):

# multi-modal trainers
clip_multi_modal_embedding = 'clip-multi-modal-embedding'
ofa = 'ofa'

# cv trainers
image_instance_segmentation = 'image-instance-segmentation'

@@ -376,6 +377,9 @@ class Metrics(object):
accuracy = 'accuracy'
audio_noise_metric = 'audio-noise-metric'

# text gen
BLEU = 'bleu'

# metrics for image denoise task
image_denoise_metric = 'image-denoise-metric'

@@ -396,6 +400,8 @@ class Metrics(object):
movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
# metric for inpainting task
image_inpainting_metric = 'image-inpainting-metric'
# metric for ocr
NED = 'ned'


class Optimizers(object):
@@ -17,6 +17,8 @@ if TYPE_CHECKING:
from .token_classification_metric import TokenClassificationMetric
from .video_summarization_metric import VideoSummarizationMetric
from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric
from .accuracy_metric import AccuracyMetric
from .bleu_metric import BleuMetric
from .image_inpainting_metric import ImageInpaintingMetric

else:

@@ -36,6 +38,8 @@ else:
'video_summarization_metric': ['VideoSummarizationMetric'],
'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
'image_inpainting_metric': ['ImageInpaintingMetric'],
'accuracy_metric': ['AccuracyMetric'],
'bleu_metric': ['BleuMetric'],
}

import sys
modelscope/metrics/accuracy_metric.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from typing import Dict

import numpy as np

from modelscope.metainfo import Metrics
from modelscope.outputs import OutputKeys
from modelscope.utils.registry import default_group
from .base import Metric
from .builder import METRICS, MetricKeys


@METRICS.register_module(group_key=default_group, module_name=Metrics.accuracy)
class AccuracyMetric(Metric):
"""The metric computation class for classification classes.

This metric class calculates accuracy for the whole input batches.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.preds = []
self.labels = []

def add(self, outputs: Dict, inputs: Dict):
label_name = OutputKeys.LABEL if OutputKeys.LABEL in inputs else OutputKeys.LABELS
ground_truths = inputs[label_name]
eval_results = outputs[label_name]
assert type(ground_truths) == type(eval_results)
if isinstance(ground_truths, list):
self.preds.extend(eval_results)
self.labels.extend(ground_truths)
elif isinstance(ground_truths, np.ndarray):
self.preds.extend(eval_results.tolist())
self.labels.extend(ground_truths.tolist())
else:
raise TypeError('only support list or np.ndarray')

def evaluate(self):
assert len(self.preds) == len(self.labels)
return {
MetricKeys.ACCURACY: (np.asarray([
pred == ref for pred, ref in zip(self.preds, self.labels)
])).mean().item()
}
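A hedged usage sketch of the new metric outside a trainer loop; the 'labels' key assumes OutputKeys.LABELS resolves to that string, which is not shown in this diff:

# Hypothetical sketch: feeding batches to AccuracyMetric directly.
metric = AccuracyMetric()
metric.add(outputs={'labels': [1, 0, 1]}, inputs={'labels': [1, 1, 1]})
metric.add(outputs={'labels': [0]}, inputs={'labels': [0]})
print(metric.evaluate())   # {MetricKeys.ACCURACY: 0.75}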
@@ -10,9 +10,6 @@ class Metric(ABC):
complex metrics for a specific task with or without other Metric subclasses.
"""

def __init__(self, trainer=None, *args, **kwargs):
self.trainer = trainer

@abstractmethod
def add(self, outputs: Dict, inputs: Dict):
""" Append logits and labels within an eval loop.
modelscope/metrics/bleu_metric.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from itertools import zip_longest
from typing import Dict

import sacrebleu

from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from .base import Metric
from .builder import METRICS, MetricKeys

EVAL_BLEU_ORDER = 4


@METRICS.register_module(group_key=default_group, module_name=Metrics.BLEU)
class BleuMetric(Metric):
"""The metric computation class for BLEU in text generation tasks.

This metric class calculates corpus-level BLEU over all accumulated batches.
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.eval_tokenized_bleu = kwargs.get('eval_tokenized_bleu', False)
self.hyp_name = kwargs.get('hyp_name', 'hyp')
self.ref_name = kwargs.get('ref_name', 'ref')
self.refs = list()
self.hyps = list()

def add(self, outputs: Dict, inputs: Dict):
self.refs.extend(inputs[self.ref_name])
self.hyps.extend(outputs[self.hyp_name])

def evaluate(self):
if self.eval_tokenized_bleu:
bleu = sacrebleu.corpus_bleu(
self.hyps, list(zip_longest(*self.refs)), tokenize='none')
else:
bleu = sacrebleu.corpus_bleu(self.hyps,
list(zip_longest(*self.refs)))
return {
MetricKeys.BLEU_4: bleu.score,
}
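A hedged usage sketch; the input layout (one reference list per hypothesis, transposed by zip_longest before being handed to sacrebleu) is inferred from the implementation above, not stated elsewhere in the patch:

# Hypothetical sketch: corpus BLEU over accumulated hypotheses and references.
metric = BleuMetric()
metric.add(outputs={'hyp': ['the cat sat on the mat']},
           inputs={'ref': [['the cat sat on the mat']]})   # one reference list per hypothesis
print(metric.evaluate())   # {'bleu-4': ...}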
@@ -23,6 +23,7 @@ class MetricKeys(object):
BLEU_4 = 'bleu-4'
ROUGE_1 = 'rouge-1'
ROUGE_L = 'rouge-l'
NED = 'ned' # ocr metric


task_default_metrics = {
modelscope/metrics/ciderD/__init__.py (new executable file, 1 line)
@@ -0,0 +1 @@
__author__ = 'tylin'

modelscope/metrics/ciderD/ciderD.py (new executable file, 57 lines)
@@ -0,0 +1,57 @@
# Filename: ciderD.py
#
# Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric
# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
#
# Creation Date: Sun Feb 8 14:16:54 2015
#
# Authors: Ramakrishna Vedantam <vrama91@vt.edu> and Tsung-Yi Lin <tl483@cornell.edu>
from __future__ import absolute_import, division, print_function

from .ciderD_scorer import CiderScorer


class CiderD:
"""
Main Class to compute the CIDEr metric

"""

def __init__(self, n=4, sigma=6.0, df='corpus'):
# set cider to sum over 1 to 4-grams
self._n = n
# set the standard deviation parameter for gaussian penalty
self._sigma = sigma
# set where to compute document frequencies from
self._df = df
self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df)

def compute_score(self, gts, res):
"""
Main function to compute CIDEr score
:param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
ref_for_image (dict) : dictionary with key <image> and value <tokenized reference sentence>
:return: cider (float) : computed CIDEr score for the corpus
""" # noqa

# clear all the previous hypos and refs
tmp_cider_scorer = self.cider_scorer.copy_empty()
tmp_cider_scorer.clear()
for res_id in res:

hypo = res_id['caption']
ref = gts[res_id['image_id']]

# Sanity check.
assert (type(hypo) is list)
assert (len(hypo) == 1)
assert (type(ref) is list)
assert (len(ref) > 0)
tmp_cider_scorer += (hypo[0], ref)

(score, scores) = tmp_cider_scorer.compute_score()

return score, scores

def method(self):
return 'CIDEr-D'
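A hedged usage sketch of the gts/res format implied by compute_score above (image ids, captions, and references are illustrative):

# Hypothetical sketch: scoring one candidate caption against its references with CiderD.
scorer = CiderD(df='corpus')
gts = {0: ['a dog runs across the grass', 'a dog is running on grass']}   # tokenized references per image id
res = [{'image_id': 0, 'caption': ['a dog runs on the grass']}]           # single-candidate list per entry
score, per_image = scorer.compute_score(gts, res)
print(score, per_image)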
modelscope/metrics/ciderD/ciderD_scorer.py (new executable file, 233 lines)
@@ -0,0 +1,233 @@
#!/usr/bin/env python
# Tsung-Yi Lin <tl483@cornell.edu>
# Ramakrishna Vedantam <vrama91@vt.edu>
from __future__ import absolute_import, division, print_function
import copy
import math
import os
import pdb
from collections import defaultdict

import numpy as np
import six
from six.moves import cPickle


def precook(s, n=4, out=False):
"""
Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well.
:param s: string : sentence to be converted into ngrams
:param n: int : number of ngrams for which representation is calculated
:return: term frequency vector for occurring ngrams
"""
words = s.split()
counts = defaultdict(int)
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i:i + k])
counts[ngram] += 1
return counts


def cook_refs(refs, n=4): # lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.
:param refs: list of string : reference sentences for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (list of dict)
'''
return [precook(ref, n) for ref in refs]


def cook_test(test, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.
:param test: list of string : hypothesis sentence for some image
:param n: int : number of ngrams for which (ngram) representation is calculated
:return: result (dict)
'''
return precook(test, n, True)


class CiderScorer(object):
"""CIDEr scorer.
"""

def copy(self):
''' copy the refs.'''
new = CiderScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
return new

def copy_empty(self):
new = CiderScorer(df_mode='corpus', n=self.n, sigma=self.sigma)
new.df_mode = self.df_mode
new.ref_len = self.ref_len
new.document_frequency = self.document_frequency
return new

def __init__(self, df_mode='corpus', test=None, refs=None, n=4, sigma=6.0):
''' singular instance '''
self.n = n
self.sigma = sigma
self.crefs = []
self.ctest = []
self.df_mode = df_mode
self.ref_len = None
if self.df_mode != 'corpus':
pkl_file = cPickle.load(
open(df_mode, 'rb'),
**(dict(encoding='latin1') if six.PY3 else {}))
self.ref_len = np.log(float(pkl_file['ref_len']))
self.document_frequency = pkl_file['document_frequency']
else:
self.document_frequency = None
self.cook_append(test, refs)

def clear(self):
self.crefs = []
self.ctest = []

def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''

if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
self.ctest.append(cook_test(test)) # N.B.: -1
else:
self.ctest.append(
None) # lens of crefs and ctest have to match

def size(self):
assert len(self.crefs) == len(
self.ctest), 'refs/test mismatch! %d<>%d' % (len(
self.crefs), len(self.ctest))
return len(self.crefs)

def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''

if type(other) is tuple:
# avoid creating new CiderScorer instances
self.cook_append(other[0], other[1])
else:
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)

return self

def compute_doc_freq(self):
"""
Compute term frequency for reference data.
This will be used to compute idf (inverse document frequency later)
The term frequency is stored in the object
:return: None
"""
for refs in self.crefs:
# refs, k ref captions of one image
for ngram in set([
ngram for ref in refs for (ngram, count) in ref.items()
]): # noqa
self.document_frequency[ngram] += 1

def compute_cider(self):

def counts2vec(cnts):
"""
Function maps counts of ngram to vector of tfidf weights.
The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
The n-th entry of array denotes length of n-grams.
:param cnts:
:return: vec (array of dict), norm (array of float), length (int)
"""
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram, term_freq) in cnts.items():
# give word count 1 if it doesn't appear in reference corpus
df = np.log(max(1.0, self.document_frequency[ngram]))
# ngram index
n = len(ngram) - 1
# tf (term_freq) * idf (precomputed idf) for n-grams
vec[n][ngram] = float(term_freq) * (self.ref_len - df)
# compute norm for the vector. the norm will be used for computing similarity
norm[n] += pow(vec[n][ngram], 2)

if n == 1:
length += term_freq
norm = [np.sqrt(n) for n in norm]
return vec, norm, length

def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
'''
Compute the cosine similarity of two vectors.
:param vec_hyp: array of dictionary for vector corresponding to hypothesis
:param vec_ref: array of dictionary for vector corresponding to reference
:param norm_hyp: array of float for vector corresponding to hypothesis
:param norm_ref: array of float for vector corresponding to reference
:param length_hyp: int containing length of hypothesis
:param length_ref: int containing length of reference
:return: array of score for each n-grams cosine similarity
'''
delta = float(length_hyp - length_ref)
# measure cosine similarity
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
# ngram
for (ngram, count) in vec_hyp[n].items():
# vrama91 : added clipping
val[n] += min(vec_hyp[n][ngram],
vec_ref[n][ngram]) * vec_ref[n][ngram]

if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
val[n] /= (norm_hyp[n] * norm_ref[n])

assert (not math.isnan(val[n]))
# vrama91: added a length based gaussian penalty
val[n] *= np.e**(-(delta**2) / (2 * self.sigma**2))
return val

# compute log reference length
if self.df_mode == 'corpus':
self.ref_len = np.log(float(len(self.crefs)))
# elif self.df_mode == "coco-val-df":
# if coco option selected, use length of coco-val set
# self.ref_len = np.log(float(40504))

scores = []
for test, refs in zip(self.ctest, self.crefs):
# compute vector for test captions
vec, norm, length = counts2vec(test)
# compute vector for ref captions
score = np.array([0.0 for _ in range(self.n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(ref)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
# change by vrama91 - mean of ngram scores, instead of sum
score_avg = np.mean(score)
# divide by number of references
score_avg /= len(refs)
# multiply score by 10
score_avg *= 10.0
# append score of an image to the score list
scores.append(score_avg)
return scores

def compute_score(self, option=None, verbose=0):
# compute idf
if self.df_mode == 'corpus':
self.document_frequency = defaultdict(float)
self.compute_doc_freq()
# assert to check document frequency
assert (len(self.ctest) >= max(self.document_frequency.values()))
# import json for now and write the corresponding files
# compute cider score
score = self.compute_cider()
# debug
# print score
return np.mean(np.array(score)), np.array(score)
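To make the n-gram bookkeeping above concrete, here is a standalone restatement of precook with a tiny input (it mirrors the function in the new file, with the unused out flag dropped):

# Hypothetical sketch: what precook() produces for a short tokenized sentence.
from collections import defaultdict

def precook(s, n=4):
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):                      # 1-grams up to n-grams
        for i in range(len(words) - k + 1):
            counts[tuple(words[i:i + k])] += 1
    return counts

print(dict(precook('a dog runs', n=2)))
# {('a',): 1, ('dog',): 1, ('runs',): 1, ('a', 'dog'): 1, ('dog', 'runs'): 1}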
@@ -34,17 +34,24 @@ class TokenClassificationMetric(Metric):
self.labels.append(
torch_nested_numpify(torch_nested_detach(ground_truths)))

def __init__(self, return_entity_level_metrics=False, *args, **kwargs):
def __init__(self,
return_entity_level_metrics=False,
label2id=None,
*args,
**kwargs):
super().__init__(*args, **kwargs)
self.return_entity_level_metrics = return_entity_level_metrics
self.preds = []
self.labels = []
self.label2id = label2id

def evaluate(self):
self.id2label = {
id: label
for label, id in self.trainer.label2id.items()
}
label2id = self.label2id
if label2id is None:
assert hasattr(self, 'trainer')
label2id = self.trainer.label2id

self.id2label = {id: label for label, id in label2id.items()}
self.preds = np.concatenate(self.preds, axis=0)
self.labels = np.concatenate(self.labels, axis=0)
predictions = np.argmax(self.preds, axis=-1)
@@ -5,11 +5,11 @@ from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Union

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.models.builder import build_model
from modelscope.utils.checkpoint import save_pretrained
from modelscope.models.builder import MODELS, build_model
from modelscope.utils.checkpoint import save_checkpoint, save_pretrained
from modelscope.utils.config import Config
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.device import device_placement, verify_device
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile, Tasks
from modelscope.utils.device import verify_device
from modelscope.utils.logger import get_logger

logger = get_logger()

@@ -66,7 +66,6 @@ class Model(ABC):
revision: Optional[str] = DEFAULT_MODEL_REVISION,
cfg_dict: Config = None,
device: str = None,
*model_args,
**kwargs):
""" Instantiate a model from local directory or remote model repo. Note
that when loading from remote, the model revision can be specified.

@@ -90,11 +89,11 @@ class Model(ABC):
cfg = Config.from_file(
osp.join(local_model_dir, ModelFile.CONFIGURATION))
task_name = cfg.task
if 'task' in kwargs:
task_name = kwargs.pop('task')
model_cfg = cfg.model

if hasattr(model_cfg, 'model_type') and not hasattr(model_cfg, 'type'):
model_cfg.type = model_cfg.model_type

model_cfg.model_dir = local_model_dir
for k, v in kwargs.items():
model_cfg[k] = v

@@ -109,15 +108,19 @@ class Model(ABC):
# dynamically add pipeline info to model for pipeline inference
if hasattr(cfg, 'pipeline'):
model.pipeline = cfg.pipeline

if not hasattr(model, 'cfg'):
model.cfg = cfg
return model

def save_pretrained(self,
target_folder: Union[str, os.PathLike],
save_checkpoint_names: Union[str, List[str]] = None,
save_function: Callable = None,
save_function: Callable = save_checkpoint,
config: Optional[dict] = None,
**kwargs):
"""save the pretrained model, its configuration and other related files to a directory, so that it can be re-loaded
"""save the pretrained model, its configuration and other related files to a directory,
so that it can be re-loaded

Args:
target_folder (Union[str, os.PathLike]):

@@ -133,5 +136,10 @@ class Model(ABC):
The config for the configuration.json, might not be identical with model.config

"""
if config is None and hasattr(self, 'cfg'):
config = self.cfg
assert config is not None, 'Cannot save the model because the model config is empty.'
if isinstance(config, Config):
config = config.to_dict()
save_pretrained(self, target_folder, save_checkpoint_names,
save_function, config, **kwargs)
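With the changes above, save_pretrained falls back to the model's own cfg when no config is passed and defaults its save_function to save_checkpoint, and from_pretrained accepts a task override kwarg. A hedged round-trip sketch (the model id and directory are placeholders):

# Hypothetical sketch: round-tripping a model through save_pretrained with the new defaults.
from modelscope.models import Model

model = Model.from_pretrained('damo/some-model', task='text-classification')   # placeholder id; task override from the hunk above
model.save_pretrained('/tmp/my_model')    # config falls back to model.cfg, save_function defaults to save_checkpoint
reloaded = Model.from_pretrained('/tmp/my_model')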
@@ -1,10 +1,12 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from modelscope.utils.config import ConfigDict
from modelscope.utils.constant import Tasks
from modelscope.utils.registry import TYPE_NAME, Registry, build_from_cfg

MODELS = Registry('models')
BACKBONES = Registry('backbones')
BACKBONES._modules = MODELS._modules
HEADS = Registry('heads')


@@ -23,30 +25,27 @@ def build_model(cfg: ConfigDict,
cfg, MODELS, group_key=task_name, default_args=default_args)


def build_backbone(cfg: ConfigDict,
field: str = None,
default_args: dict = None):
def build_backbone(cfg: ConfigDict, default_args: dict = None):
""" build backbone given backbone config dict

Args:
cfg (:obj:`ConfigDict`): config dict for backbone object.
field (str, optional): field, such as CV, NLP's backbone
default_args (dict, optional): Default initialization arguments.
"""
return build_from_cfg(
cfg, BACKBONES, group_key=field, default_args=default_args)
cfg, BACKBONES, group_key=Tasks.backbone, default_args=default_args)


def build_head(cfg: ConfigDict,
group_key: str = None,
task_name: str = None,
default_args: dict = None):
""" build head given config dict

Args:
cfg (:obj:`ConfigDict`): config dict for head object.
task_name (str, optional): task name, refer to
:obj:`Tasks` for more details
default_args (dict, optional): Default initialization arguments.
"""
if group_key is None:
group_key = cfg[TYPE_NAME]
return build_from_cfg(
cfg, HEADS, group_key=group_key, default_args=default_args)
cfg, HEADS, group_key=task_name, default_args=default_args)
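The hunk above moves head lookup from a group derived from cfg['type'] to the task name, and pins backbones to the Tasks.backbone group. A hedged sketch of how a config dict resolves through the registry under the new signature; the head type string is a placeholder and must match something actually registered:

# Hypothetical sketch: building a registered head by task group, mirroring build_head above.
from modelscope.models.builder import build_head
from modelscope.utils.config import ConfigDict

cfg = ConfigDict({'type': 'some-classification-head', 'num_labels': 2})   # placeholder head type and args
head = build_head(cfg, task_name='text-classification')                   # looked up in HEADS under the task group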
@@ -1,9 +1,13 @@
|
||||
# The implementation is adopted from the CLIP4Clip implementation,
|
||||
# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip
|
||||
|
||||
import os
|
||||
import random
|
||||
import uuid
|
||||
from os.path import exists
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import Any, Dict
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
@@ -11,6 +15,7 @@ import torch
|
||||
from decord import VideoReader, cpu
|
||||
from PIL import Image
|
||||
|
||||
from modelscope.hub.file_download import http_get_file
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
@@ -68,12 +73,16 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
|
||||
self.model.to(self.device)
|
||||
|
||||
def _get_text(self, caption, tokenizer, enable_zh=False):
|
||||
if len(caption) == 3:
|
||||
_caption_text, s, e = caption
|
||||
elif len(caption) == 4:
|
||||
_caption_text, s, e, pos = caption
|
||||
else:
|
||||
NotImplementedError
|
||||
|
||||
if type(caption) is str:
|
||||
_caption_text, s, e = caption, None, None
|
||||
elif type(caption) is tuple:
|
||||
if len(caption) == 3:
|
||||
_caption_text, s, e = caption
|
||||
elif len(caption) == 4:
|
||||
_caption_text, s, e, pos = caption
|
||||
else:
|
||||
NotImplementedError
|
||||
|
||||
if isinstance(_caption_text, list):
|
||||
caption_text = random.choice(_caption_text)
|
||||
@@ -137,11 +146,25 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
|
||||
elif start_time == end_time:
|
||||
end_time = end_time + 1
|
||||
|
||||
if exists(video_path):
|
||||
url_parsed = urlparse(video_path)
|
||||
if url_parsed.scheme in ('file', '') and exists(
|
||||
url_parsed.path): # Possibly a local file
|
||||
vreader = VideoReader(video_path, ctx=cpu(0))
|
||||
else:
|
||||
logger.error('non video input, output is wrong!!!')
|
||||
return video, video_mask
|
||||
try:
|
||||
with TemporaryDirectory() as temporary_cache_dir:
|
||||
random_str = uuid.uuid4().hex
|
||||
http_get_file(
|
||||
url=video_path,
|
||||
local_dir=temporary_cache_dir,
|
||||
file_name=random_str,
|
||||
cookies=None)
|
||||
temp_file_path = os.path.join(temporary_cache_dir,
|
||||
random_str)
|
||||
vreader = VideoReader(temp_file_path, ctx=cpu(0))
|
||||
except Exception as ex:
|
||||
logger.error('non video input, output is {}!!!'.format(ex))
|
||||
return video, video_mask
|
||||
|
||||
fps = vreader.get_avg_fps()
|
||||
f_start = 0 if start_time is None else int(start_time * fps)
|
||||
|
||||
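The hunk above replaces the bare exists() check with a URL-scheme test: local file paths are opened directly with decord, anything else is first downloaded into a temporary directory via http_get_file. A minimal sketch of that resolution step in isolation; resolve_video_source is an invented helper, only the urlparse/http_get_file usage mirrors the code above:

```python
import os
import uuid
from os.path import exists
from urllib.parse import urlparse


def resolve_video_source(video_path: str, cache_dir: str, download_fn) -> str:
    """Return a local file path, downloading video_path into cache_dir if it is a URL."""
    parsed = urlparse(video_path)
    if parsed.scheme in ('file', '') and exists(parsed.path):
        return video_path  # already a local file
    file_name = uuid.uuid4().hex
    # download_fn would be modelscope.hub.file_download.http_get_file here
    download_fn(url=video_path, local_dir=cache_dir, file_name=file_name, cookies=None)
    return os.path.join(cache_dir, file_name)
```

With download_fn=http_get_file and a TemporaryDirectory as cache_dir this reproduces the remote branch; the real method also wraps the download in try/except and returns early with a logged error on failure.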
@@ -148,7 +148,7 @@ class BeamSearch(Search):
scores_buf = top_prediction[0]
indices_buf = top_prediction[1]
# Project back into relative indices and beams
beams_buf = indices_buf // vocab_size
beams_buf = torch.div(indices_buf, vocab_size, rounding_mode='floor')
indices_buf = indices_buf.fmod(vocab_size)

# At this point, beams_buf and indices_buf are single-dim and contain relative indices
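The one-line change above swaps Python floor division on tensors for torch.div(..., rounding_mode='floor'), which gives the same beam indices for these non-negative index tensors without the deprecation warning newer PyTorch releases emit for the // form. A self-contained check (assumes torch >= 1.8, where rounding_mode exists):

```python
import torch

vocab_size = 10
indices_buf = torch.tensor([3, 12, 25, 7])  # flattened (beam * vocab) indices

beams_old = indices_buf // vocab_size                                   # old form
beams_new = torch.div(indices_buf, vocab_size, rounding_mode='floor')   # new form

assert torch.equal(beams_old, beams_new)                      # tensor([0, 1, 2, 0])
assert indices_buf.fmod(vocab_size).tolist() == [3, 2, 5, 7]  # within-beam token ids
```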
@@ -385,12 +385,7 @@ class SequenceGenerator(nn.Module):
|
||||
attn = torch.empty(bsz * beam_size,
|
||||
avg_attn_scores.size(1),
|
||||
max_len + 2).to(scores)
|
||||
# print("+++++++ debug attention shape +++++++")
|
||||
# print("attn", attn.shape)
|
||||
# print("avg_attn_scores", avg_attn_scores.shape)
|
||||
attn[:, :, step + 1].copy_(avg_attn_scores)
|
||||
# print("attn[:, :, step + 1]", attn[:, :, step + 1].shape)
|
||||
# print("attn", attn.shape)
|
||||
|
||||
scores = scores.type_as(lprobs)
|
||||
eos_bbsz_idx = torch.empty(0).to(
|
||||
@@ -404,8 +399,28 @@ class SequenceGenerator(nn.Module):
|
||||
self.search.set_src_lengths(src_lengths)
|
||||
|
||||
if self.repeat_ngram_blocker is not None:
|
||||
lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz,
|
||||
beam_size, step)
|
||||
# process prefix_tokens
|
||||
p_toks_len = prefix_tokens.ne(self.pad).sum(
|
||||
dim=1) if prefix_tokens is not None else None
|
||||
if p_toks_len is not None:
|
||||
p_toks_len_beam = p_toks_len.unsqueeze(-1).repeat(
|
||||
1, beam_size).view(-1)
|
||||
no_repeat_ngram_size = self.repeat_ngram_blocker.no_repeat_ngram_size
|
||||
out_prefix = p_toks_len_beam < (
|
||||
step + no_repeat_ngram_size - 1)
|
||||
else:
|
||||
out_prefix = torch.ones(bsz * beam_size).bool()
|
||||
ngram_blocker_tokens = tokens[out_prefix]
|
||||
ngram_blocker_lprobs = lprobs[out_prefix]
|
||||
ngram_blocker_bsz = torch.div(
|
||||
out_prefix.sum(), beam_size, rounding_mode='trunc')
|
||||
|
||||
lprobs[out_prefix] = self.repeat_ngram_blocker(
|
||||
tokens=ngram_blocker_tokens,
|
||||
lprobs=ngram_blocker_lprobs,
|
||||
bsz=ngram_blocker_bsz,
|
||||
beam_size=beam_size,
|
||||
step=step)
|
||||
|
||||
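The added block above applies the repeat-ngram blocker only to beams whose forced prefix has already been emitted: a boolean mask selects those rows of lprobs, the blocker rewrites them, and the result is written back in place. A toy version of that select-transform-write-back pattern; penalize_rows stands in for the real repeat_ngram_blocker:

```python
import torch

lprobs = torch.zeros(4, 6)                             # (bsz * beam_size, vocab)
out_prefix = torch.tensor([True, False, True, True])   # beams already past their prefix


def penalize_rows(rows: torch.Tensor) -> torch.Tensor:
    """Stand-in for repeat_ngram_blocker: ban token 0 on the selected rows."""
    rows = rows.clone()
    rows[:, 0] = -float('inf')
    return rows


lprobs[out_prefix] = penalize_rows(lprobs[out_prefix])  # rows 0, 2, 3 updated; row 1 untouched
assert torch.isinf(lprobs[0, 0]) and lprobs[1, 0] == 0
```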
# Shape: (batch, cand_size)
|
||||
cand_scores, cand_indices, cand_beams = self.search.step(
|
||||
@@ -415,7 +430,6 @@ class SequenceGenerator(nn.Module):
|
||||
tokens[:, :step + 1],
|
||||
original_batch_idxs,
|
||||
)
|
||||
|
||||
# cand_bbsz_idx contains beam indices for the top candidate
|
||||
# hypotheses, with a range of values: [0, bsz*beam_size),
|
||||
# and dimensions: [bsz, cand_size]
|
||||
@@ -671,7 +685,7 @@ class SequenceGenerator(nn.Module):
|
||||
cum_unfin.append(prev)
|
||||
cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx)
|
||||
|
||||
unfin_idx = bbsz_idx // beam_size
|
||||
unfin_idx = torch.div(bbsz_idx, beam_size, rounding_mode='floor')
|
||||
sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx)
|
||||
|
||||
# Create a set of "{sent}{unfin_idx}", where
|
||||
|
||||
@@ -19,6 +19,7 @@ from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from packaging import version
|
||||
from torch import Tensor, nn
|
||||
from torch.nn import functional as F
|
||||
from transformers.activations import ACT2FN
|
||||
@@ -40,6 +41,8 @@ logger = logging.get_logger(__name__)
|
||||
_CHECKPOINT_FOR_DOC = 'ofa-base'
|
||||
_CONFIG_FOR_DOC = 'OFAConfig'
|
||||
_TOKENIZER_FOR_DOC = 'OFATokenizer'
|
||||
TORCH_VERSION = version.parse(torch.__version__)
|
||||
TORCH_MESH_GRID_WARNING_VERSION = version.parse('1.9.1')
|
||||
|
||||
DEFAULT_MAX_SOURCE_POSITIONS = 1024
|
||||
DEFAULT_MAX_TARGET_POSITIONS = 1024
|
||||
@@ -51,6 +54,7 @@ OFA_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
||||
'ofa-medium',
|
||||
'ofa-base',
|
||||
'ofa-large',
|
||||
'ofa-huge',
|
||||
]
|
||||
|
||||
try:
|
||||
@@ -114,7 +118,11 @@ def make_image_bucket_position(bucket_size, num_relative_distance):
"""
coords_h = torch.arange(bucket_size)
coords_w = torch.arange(bucket_size)
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
if TORCH_VERSION > TORCH_MESH_GRID_WARNING_VERSION:
coords = torch.stack(
torch.meshgrid([coords_h, coords_w], indexing='ij')) # 2, Wh, Ww
else:
coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - \
coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
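The guard above only passes indexing='ij' to torch.meshgrid on versions where the keyword exists (it appeared around PyTorch 1.10; older releases already use 'ij' ordering implicitly), so both branches produce the same (2, Wh, Ww) grid. The same guard as a standalone snippet:

```python
import torch
from packaging import version

bucket_size = 4
coords_h = torch.arange(bucket_size)
coords_w = torch.arange(bucket_size)

if version.parse(torch.__version__) > version.parse('1.9.1'):
    coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij'))
else:
    coords = torch.stack(torch.meshgrid([coords_h, coords_w]))

assert coords.shape == (2, bucket_size, bucket_size)
```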
@@ -8,7 +8,7 @@ OFA_TASK_KEY_MAPPING = {
|
||||
Tasks.text_summarization: OutputKeys.TEXT,
|
||||
Tasks.visual_question_answering: OutputKeys.TEXT,
|
||||
Tasks.visual_grounding: OutputKeys.BOXES,
|
||||
Tasks.text_classification: (OutputKeys.SCORES, OutputKeys.LABELS),
|
||||
Tasks.text_classification: OutputKeys.LABELS,
|
||||
Tasks.image_classification: OutputKeys.LABELS,
|
||||
Tasks.visual_entailment: (OutputKeys.SCORES, OutputKeys.LABELS),
|
||||
Tasks.visual_entailment: OutputKeys.LABELS,
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import math
|
||||
import os
|
||||
import string
|
||||
from functools import partial
|
||||
from os import path as osp
|
||||
from typing import Any, Dict
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import json
|
||||
import torch.cuda
|
||||
@@ -10,7 +12,6 @@ import torch.nn.functional as F
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import TorchModel
|
||||
from modelscope.models.base import Tensor
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.preprocessors.ofa.utils.collate import collate_tokens
|
||||
@@ -66,10 +67,9 @@ class OfaForAllTasks(TorchModel):
|
||||
self.gen_type = self.cfg.model.get('gen_type', 'generation')
|
||||
assert self.gen_type in ['generation', 'traverse'], \
|
||||
'model.gen_type must be in ["generation", "traverse"]'
|
||||
self._device = torch.device('cuda') if torch.cuda.is_available() \
|
||||
else torch.device('cpu')
|
||||
self.eos_item = torch.LongTensor([self.tokenizer.eos_token_id
|
||||
]).to(self._device)
|
||||
self.bos_item = torch.LongTensor([self.tokenizer.bos_token_id])
|
||||
self.pad_item = torch.LongTensor([self.tokenizer.pad_token_id])
|
||||
self.eos_item = torch.LongTensor([self.tokenizer.eos_token_id])
|
||||
self.index2ans = {}
|
||||
self.ans2label_dict = {}
|
||||
self.load_ans2label()
|
||||
@@ -90,7 +90,8 @@ class OfaForAllTasks(TorchModel):
|
||||
self.val_masks_l = []
|
||||
self.build_trie()
|
||||
sg_args['constraint_trie'] = self.constraint_trie
|
||||
self.model.to(self._device)
|
||||
else:
|
||||
self.constraint_trie = None
|
||||
self.generator = sg.SequenceGenerator(**sg_args)
|
||||
inference_d = {
|
||||
'generation': self._text_gen_inference,
|
||||
@@ -108,8 +109,16 @@ class OfaForAllTasks(TorchModel):
|
||||
}
|
||||
|
||||
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
|
||||
input = move_to_device(input, self.model.device)
|
||||
if self.model.training:
|
||||
return self.model(**input['net_input'])
|
||||
else:
|
||||
return self.inference(input)
|
||||
|
||||
def inference(self, input: Dict[str, Any]) -> Dict[str, Any]:
|
||||
ret = self.task_inference_mapping[self.cfg.task](input)
|
||||
ret['samples'] = input['samples']
|
||||
if 'samples' in input:
|
||||
ret['samples'] = input['samples']
|
||||
for key in [
|
||||
OutputKeys.CAPTION, OutputKeys.TEXT, OutputKeys.BOXES,
|
||||
OutputKeys.LABELS, OutputKeys.SCORES
|
||||
@@ -118,21 +127,33 @@ class OfaForAllTasks(TorchModel):
|
||||
ret[key] = None
|
||||
return ret
|
||||
|
||||
def postprocess(self, input: Dict[str, Tensor],
|
||||
**kwargs) -> Dict[str, Tensor]:
|
||||
if self.cfg.task == Tasks.image_captioning:
|
||||
caption = [
|
||||
cap.translate(self.transtab).strip()
|
||||
for cap in input[OutputKeys.CAPTION]
|
||||
]
|
||||
input[OutputKeys.CAPTION] = caption
|
||||
def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
|
||||
if not self.model.training and self.cfg.task == Tasks.image_captioning:
|
||||
caption = input[OutputKeys.CAPTION]
|
||||
result_l = list()
|
||||
for cap in caption:
|
||||
result_l.append(cap.translate(self.transtab).strip())
|
||||
input[OutputKeys.CAPTION] = result_l
|
||||
return input
|
||||
|
||||
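postprocess above strips punctuation from generated captions with str.translate. The translation table transtab is built elsewhere in the class, presumably from string.punctuation (string is imported at the top of the file); a standalone sketch of that cleanup:

```python
import string

transtab = str.maketrans({ch: None for ch in string.punctuation})  # assumed construction
caption = 'a brown dog running in the park .'
print(caption.translate(transtab).strip())  # -> 'a brown dog running in the park'
```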
def _text_gen_inference(self, input):
|
||||
input = move_to_device(input, self._device)
|
||||
gen_output = self.generator.generate([self.model], input)
|
||||
gen = [gen_output[i][0]['tokens'] for i in range(len(gen_output))]
|
||||
result = self.tokenizer.batch_decode(gen, skip_special_tokens=True)
|
||||
gen_outputs = self.generator.generate([self.model],
|
||||
input,
|
||||
prefix_tokens=input.get(
|
||||
'prefix_tokens', None))
|
||||
gen_l = list()
|
||||
for idx, gen_out in enumerate(gen_outputs):
|
||||
if len(gen_out) > 0:
|
||||
decode_tokens = gen_out[0]['tokens']
|
||||
if 'prefix_tokens' in input:
|
||||
prefix_len = input['prefix_tokens'][idx].ne(
|
||||
self.pad_item.to(self.model.device)).sum()
|
||||
decode_tokens = decode_tokens[prefix_len:]
|
||||
gen_l.append(decode_tokens)
|
||||
else:
|
||||
gen_l.append('')
|
||||
result = self.tokenizer.batch_decode(gen_l, skip_special_tokens=True)
|
||||
result = [item.strip() for item in result]
|
||||
# text generation tasks have no score
|
||||
ret = {OFA_TASK_KEY_MAPPING[self.cfg.task]: result}
|
||||
if self.cfg.task.endswith('classification'):
|
||||
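The rewritten _text_gen_inference above forwards prefix_tokens into the generator and then drops the forced prefix from every hypothesis before decoding; the prefix length is recovered by counting non-padding positions. A small illustration with made-up ids:

```python
import torch

pad_id = 1
prefix_tokens = torch.tensor([5, 6, pad_id, pad_id])  # forced prefix, right-padded
decode_tokens = torch.tensor([5, 6, 42, 43, 44])      # one generated hypothesis

prefix_len = prefix_tokens.ne(pad_id).sum()           # -> 2
decode_tokens = decode_tokens[prefix_len:]
assert decode_tokens.tolist() == [42, 43, 44]         # only the newly generated tail is decoded
```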
@@ -140,7 +161,6 @@ class OfaForAllTasks(TorchModel):
|
||||
return ret
|
||||
|
||||
def _visual_grounding_inference(self, input):
|
||||
input = move_to_device(input, self._device)
|
||||
gen_output = self.generator.generate([self.model], input)
|
||||
tokens = [gen_output[i][0]['tokens'] for i in range(len(gen_output))]
|
||||
region_coord_l = list()
|
||||
@@ -160,7 +180,6 @@ class OfaForAllTasks(TorchModel):
|
||||
}
|
||||
|
||||
def _traverse_inference(self, input):
|
||||
input = move_to_device(input, self._device)
|
||||
encoder_input = dict()
|
||||
for key in input['net_input'].keys():
|
||||
encoder_input[key] = input['net_input'][key]
|
||||
@@ -170,13 +189,14 @@ class OfaForAllTasks(TorchModel):
|
||||
valid_size = len(val_ans)
|
||||
valid_tgt_items = [
|
||||
torch.cat([
|
||||
torch.tensor(decoder_prompt[1:]), valid_answer,
|
||||
torch.tensor(decoder_prompt[1:]).to('cpu'), valid_answer,
|
||||
self.eos_item
|
||||
]) for decoder_prompt in input['decoder_prompts']
|
||||
for valid_answer in val_ans
|
||||
]
|
||||
valid_prev_items = [
|
||||
torch.cat([torch.tensor(decoder_prompt), valid_answer])
|
||||
torch.cat(
|
||||
[torch.tensor(decoder_prompt).to('cpu'), valid_answer])
|
||||
for decoder_prompt in input['decoder_prompts']
|
||||
for valid_answer in val_ans
|
||||
]
|
||||
@@ -184,19 +204,19 @@ class OfaForAllTasks(TorchModel):
|
||||
torch.cat([
|
||||
torch.zeros(
|
||||
len(decoder_prompt) - 1,
|
||||
valid_constraint_mask.size(1)).bool().to(self._device),
|
||||
valid_constraint_mask.size(1)).bool(),
|
||||
valid_constraint_mask], dim=0) # yapf: disable
|
||||
for decoder_prompt in input['decoder_prompts'] # yapf: disable
|
||||
for valid_constraint_mask in val_masks] # yapf: disable
|
||||
valid_tgt = collate_tokens(
|
||||
valid_tgt_items,
|
||||
pad_idx=self.tokenizer.pad_token_id).to(self._device)
|
||||
pad_idx=self.tokenizer.pad_token_id).to(self.model.device)
|
||||
valid_prev_output = collate_tokens(
|
||||
valid_prev_items,
|
||||
pad_idx=self.tokenizer.pad_token_id).to(self._device)
|
||||
pad_idx=self.tokenizer.pad_token_id).to(self.model.device)
|
||||
val_masks = collate_tokens(
|
||||
valid_constraint_mask_items,
|
||||
pad_idx=self.tokenizer.pad_token_id).to(self._device)
|
||||
pad_idx=self.tokenizer.pad_token_id).to(self.model.device)
|
||||
new_encoder_out = {
|
||||
'last_hidden_state':
|
||||
encoder_out['last_hidden_state'].repeat_interleave(
|
||||
@@ -271,10 +291,23 @@ class OfaForAllTasks(TorchModel):
|
||||
self.val_masks_l += [
|
||||
constraint_mask_list[i:i + self.val_batch_size]
|
||||
]
|
||||
self.val_ans_l = move_to_device(self.val_ans_l, self._device)
|
||||
self.val_masks_l = move_to_device(self.val_masks_l, self._device)
|
||||
|
||||
def load_ans2label(self):
|
||||
if self.cfg.model.get('answer2label', None):
|
||||
filename = osp.join(self.model_dir, self.cfg.model.answer2label)
|
||||
self.ans2label_dict = json.load(open(filename))
|
||||
ans2label_file = osp.join(self.model_dir,
|
||||
self.cfg.model.answer2label)
|
||||
with open(ans2label_file, 'r') as reader:
|
||||
self.ans2label_dict = json.load(reader)
|
||||
|
||||
def save_pretrained(self,
|
||||
target_folder: Union[str, os.PathLike],
|
||||
save_checkpoint_names: Union[str, List[str]] = None,
|
||||
save_function: Callable = None,
|
||||
config: Optional[dict] = None,
|
||||
**kwargs):
|
||||
super(OfaForAllTasks, self). \
|
||||
save_pretrained(target_folder=target_folder,
|
||||
save_checkpoint_names=save_checkpoint_names,
|
||||
save_function=partial(save_function, with_meta=False),
|
||||
config=config,
|
||||
**kwargs)
|
||||
|
||||
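The overridden save_pretrained above pins with_meta=False onto whatever save_function the caller provides by wrapping it with functools.partial before delegating to the base implementation. The same binding pattern in isolation; save_checkpoint is a dummy stand-in for the real checkpoint writer:

```python
from functools import partial


def save_checkpoint(model, path, with_meta=True):
    """Dummy stand-in for the save_function argument of save_pretrained."""
    return {'model': model, 'path': path, 'with_meta': with_meta}


save_no_meta = partial(save_checkpoint, with_meta=False)
assert save_no_meta('weights', '/tmp/ckpt.pth')['with_meta'] is False
```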
@@ -1,13 +1,17 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .t5_for_text_generation import T5ForConditionalGeneration
|
||||
from .backbone import T5Model
|
||||
from .text2text_generation import T5ForConditionalGeneration
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
't5_for_text_generation': ['T5ForConditionalGeneration'],
|
||||
'backbone': ['T5Model'],
|
||||
'text2text_generation': ['T5ForConditionalGeneration'],
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
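The __init__ hunk above keeps the heavy T5 submodules out of import time: the real imports only run under TYPE_CHECKING, and _import_structure tells modelscope's LazyImportModule which submodule provides each symbol. A simplified stand-in for what such a lazy mapping does (the real LazyImportModule may behave differently):

```python
# Simplified illustration only; not modelscope's actual LazyImportModule.
import importlib

_import_structure = {
    'backbone': ['T5Model'],
    'text2text_generation': ['T5ForConditionalGeneration'],
}
_symbol_to_module = {
    symbol: module
    for module, symbols in _import_structure.items()
    for symbol in symbols
}


def __getattr__(name):  # PEP 562: called when the attribute is first accessed
    if name in _symbol_to_module:
        module = importlib.import_module('.' + _symbol_to_module[name], __package__)
        return getattr(module, name)
    raise AttributeError(name)
```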
@@ -1,3 +1,4 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -21,12 +22,8 @@ from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from torch.utils.checkpoint import checkpoint
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.modeling_outputs import (
|
||||
BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions,
|
||||
Seq2SeqLMOutput, Seq2SeqModelOutput)
|
||||
from transformers.modeling_utils import (PreTrainedModel,
|
||||
find_pruneable_heads_and_indices,
|
||||
prune_linear_layer)
|
||||
@@ -36,30 +33,20 @@ from transformers.utils import (DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings,
|
||||
from transformers.utils.model_parallel_utils import (assert_device_map,
|
||||
get_device_map)
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base import Model, Tensor, TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import (BaseModelOutput,
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
Seq2SeqModelOutput)
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .configuration_t5 import T5Config
|
||||
from .configuration import T5Config
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_CONFIG_FOR_DOC = 'T5Config'
|
||||
_TOKENIZER_FOR_DOC = 'T5Tokenizer'
|
||||
_CHECKPOINT_FOR_DOC = 't5-small'
|
||||
|
||||
####################################################
|
||||
# This dict contains ids and associated url
|
||||
# for the pretrained weights provided with the models
|
||||
####################################################
|
||||
T5_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
||||
't5-small',
|
||||
't5-base',
|
||||
't5-large',
|
||||
't5-3b',
|
||||
't5-11b',
|
||||
# See all T5 models at https://huggingface.co/models?filter=t5
|
||||
]
|
||||
|
||||
|
||||
####################################################
|
||||
###################################################
|
||||
# This is a conversion method from TF 1.0 to PyTorch
|
||||
# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
|
||||
####################################################
|
||||
@@ -173,65 +160,6 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
|
||||
return model
|
||||
|
||||
|
||||
####################################################
|
||||
# PyTorch Models are constructed by sub-classing
|
||||
# - torch.nn.Module for the layers and
|
||||
# - PreTrainedModel for the models (it-self a sub-class of nn.Module)
|
||||
####################################################
|
||||
PARALLELIZE_DOCSTRING = r"""
|
||||
This is an experimental feature and is subject to change at a moment's notice.
|
||||
|
||||
Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
|
||||
it will evenly distribute blocks across all devices.
|
||||
|
||||
Args:
|
||||
device_map (`Dict[int, list]`, optional, defaults to None):
|
||||
A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
|
||||
automatically mapped to the first device (for esoteric reasons). That means that the first device should
|
||||
have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
|
||||
following number of attention modules:
|
||||
|
||||
- t5-small: 6
|
||||
- t5-base: 12
|
||||
- t5-large: 24
|
||||
- t5-3b: 24
|
||||
- t5-11b: 24
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
# Here is an example of a device map on a machine with 4 GPUs
|
||||
# using t5-3b, which has a total of 24 attention modules:
|
||||
model = T5ForConditionalGeneration.from_pretrained("t5-3b")
|
||||
device_map = {
|
||||
0: [0, 1, 2],
|
||||
1: [3, 4, 5, 6, 7, 8, 9],
|
||||
2: [10, 11, 12, 13, 14, 15, 16],
|
||||
3: [17, 18, 19, 20, 21, 22, 23],
|
||||
}
|
||||
model.parallelize(device_map)
|
||||
```
|
||||
"""
|
||||
DEPARALLELIZE_DOCSTRING = r"""
|
||||
Moves the model to cpu from a model parallel state.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
# On a 4 GPU machine with t5-3b:
|
||||
model = T5ForConditionalGeneration.from_pretrained("t5-3b")
|
||||
device_map = {
|
||||
0: [0, 1, 2],
|
||||
1: [3, 4, 5, 6, 7, 8, 9],
|
||||
2: [10, 11, 12, 13, 14, 15, 16],
|
||||
3: [17, 18, 19, 20, 21, 22, 23],
|
||||
}
|
||||
model.parallelize(device_map) # Splits the model across several devices
|
||||
model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
class T5LayerNorm(nn.Module):
|
||||
|
||||
def __init__(self, hidden_size, eps=1e-6):
|
||||
@@ -261,23 +189,6 @@ class T5LayerNorm(nn.Module):
|
||||
return self.weight * hidden_states
|
||||
|
||||
|
||||
try:
|
||||
from apex.normalization import FusedRMSNorm
|
||||
|
||||
T5LayerNorm = FusedRMSNorm # noqa
|
||||
|
||||
logger.info(
|
||||
'Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm'
|
||||
)
|
||||
except ImportError:
|
||||
# using the normal T5LayerNorm
|
||||
pass
|
||||
except Exception:
|
||||
logger.warning(
|
||||
'discovered apex but it failed to load, falling back to T5LayerNorm')
|
||||
pass
|
||||
|
||||
|
||||
class T5DenseReluDense(nn.Module):
|
||||
|
||||
def __init__(self, config: T5Config):
|
||||
@@ -791,7 +702,7 @@ class T5Block(nn.Module):
|
||||
return outputs
|
||||
|
||||
|
||||
class T5PreTrainedModel(PreTrainedModel):
|
||||
class T5PreTrainedModel(TorchModel, PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface
|
||||
for downloading and loading pretrained models.
|
||||
@@ -803,6 +714,10 @@ class T5PreTrainedModel(PreTrainedModel):
|
||||
is_parallelizable = True
|
||||
supports_gradient_checkpointing = True
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config.name_or_path, **kwargs)
|
||||
super(Model, self).__init__(config)
|
||||
|
||||
@property
|
||||
def dummy_inputs(self):
|
||||
input_ids = torch.tensor(DUMMY_INPUTS)
|
||||
@@ -819,8 +734,7 @@ class T5PreTrainedModel(PreTrainedModel):
|
||||
factor = self.config.initializer_factor # Used for testing weights initialization
|
||||
if isinstance(module, T5LayerNorm):
|
||||
module.weight.data.fill_(factor * 1.0)
|
||||
elif isinstance(module,
|
||||
(T5Model, T5ForConditionalGeneration, T5EncoderModel)):
|
||||
elif isinstance(module, T5Model):
|
||||
# Mesh TensorFlow embeddings initialization See
|
||||
# https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
|
||||
module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
|
||||
@@ -902,6 +816,36 @@ class T5PreTrainedModel(PreTrainedModel):
|
||||
|
||||
return shifted_input_ids
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
"""Instantiate the model.
|
||||
|
||||
Args:
|
||||
kwargs: Input args.
|
||||
model_dir: The model dir used to load the checkpoint and the
|
||||
label information. num_labels: An optional arg to tell the
|
||||
model how many classes to initialize.
|
||||
Method will call utils.parse_label_mapping
|
||||
if num_labels not supplied. If num_labels is
|
||||
not found, the model will use the default
|
||||
setting (2 classes).
|
||||
|
||||
Returns:
|
||||
The loaded model, which is initialized by
|
||||
transformers.PreTrainedModel.from_pretrained
|
||||
"""
|
||||
|
||||
model_dir = kwargs.get('model_dir', None)
|
||||
if model_dir is None:
|
||||
config = T5Config(**kwargs)
|
||||
model = cls(config)
|
||||
else:
|
||||
model_kwargs = {}
|
||||
model = super(Model, cls).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir, **model_kwargs)
|
||||
model.model_dir = model_dir
|
||||
return model
|
||||
|
||||
|
||||
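With the new _instantiate classmethod, a T5 backbone is either built from a bare T5Config (no model_dir) or loaded through transformers' from_pretrained when a checkpoint directory is supplied. A hedged usage sketch; the import path follows the file layout in this diff and the checkpoint directory is a placeholder:

```python
from modelscope.models.nlp.T5.backbone import T5Model

# model_dir given -> falls through to PreTrainedModel.from_pretrained internally
model = T5Model._instantiate(model_dir='/path/to/t5/checkpoint')  # placeholder path
print(model.model_dir)
```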
class T5Stack(T5PreTrainedModel):
|
||||
|
||||
@@ -926,8 +870,42 @@ class T5Stack(T5PreTrainedModel):
|
||||
self.device_map = None
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
@add_start_docstrings(PARALLELIZE_DOCSTRING)
|
||||
def parallelize(self, device_map=None):
|
||||
r"""
|
||||
This is an experimental feature and is subject to change at a
moment's notice.
|
||||
|
||||
Uses a device map to distribute attention modules of the model
|
||||
across several devices. If no device map is given, it will evenly
|
||||
distribute blocks across all devices.
|
||||
|
||||
Args:
|
||||
device_map (`Dict[int, list]`, optional, defaults to None):
|
||||
A dictionary that maps attention modules to devices. Note
|
||||
that the embedding module and LMHead are always
|
||||
automatically mapped to the first device (for esoteric
|
||||
reasons). That means that the first device should have fewer
|
||||
attention modules mapped to it than other devices. For
|
||||
reference, the t5 models have the following number of
|
||||
attention modules:
|
||||
|
||||
- t5-small: 6
|
||||
- t5-base: 12
|
||||
- t5-large: 24
|
||||
- t5-3b: 24
|
||||
- t5-11b: 24
|
||||
|
||||
Example:
|
||||
|
||||
```python # Here is an example of a device map on a machine with 4
|
||||
GPUs # using t5-3b, which has a total of 24 attention modules: model
|
||||
= T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = {
|
||||
0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
|
||||
15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
|
||||
} model.parallelize(device_map) ``` all of the parallelize methods
|
||||
in this file are the same
|
||||
|
||||
"""
|
||||
# Check validity of device_map
|
||||
self.device_map = (
|
||||
get_device_map(len(self.block), range(torch.cuda.device_count()))
|
||||
@@ -948,8 +926,22 @@ class T5Stack(T5PreTrainedModel):
|
||||
# Set final layer norm to last device
|
||||
self.final_layer_norm = self.final_layer_norm.to(self.last_device)
|
||||
|
||||
@add_start_docstrings(PARALLELIZE_DOCSTRING)
|
||||
def deparallelize(self):
|
||||
r"""
|
||||
Moves the model to cpu from a model parallel state.
|
||||
|
||||
Example:
|
||||
|
||||
```python # On a 4 GPU machine with t5-3b: model =
|
||||
T5ForConditionalGeneration.from_pretrained("t5-3b") device_map = {
|
||||
0: [0, 1, 2], 1: [3, 4, 5, 6, 7, 8, 9], 2: [10, 11, 12, 13, 14,
|
||||
15, 16], 3: [17, 18, 19, 20, 21, 22, 23],
|
||||
} model.parallelize(device_map) # Splits the model across several
|
||||
devices model.deparallelize() # Put the model back on cpu and
|
||||
cleans memory by calling torch.cuda.empty_cache() ```
|
||||
|
||||
all of the deparallelize methods in this file are the same
|
||||
"""
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
self.first_device = 'cpu'
|
||||
@@ -1199,7 +1191,20 @@ class T5Stack(T5PreTrainedModel):
|
||||
)
|
||||
|
||||
|
||||
T5_START_DOCSTRING = r"""
|
||||
# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
|
||||
__HEAD_MASK_WARNING_MSG = """
|
||||
The input argument `head_mask` was split into two arguments `head_mask` and
|
||||
`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
|
||||
but this feature is deprecated and will be removed in future versions. If you do
|
||||
not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
|
||||
torch.ones(num_layers, num_heads)`.
|
||||
"""
|
||||
|
||||
|
||||
@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.T5)
|
||||
class T5Model(T5PreTrainedModel):
|
||||
"""The bare T5 Model transformer outputting raw hidden-states without any
|
||||
specific head on top.
|
||||
|
||||
The T5 model was proposed in [Exploring the Limits of Transfer Learning with
|
||||
a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by
|
||||
@@ -1224,10 +1229,99 @@ T5_START_DOCSTRING = r"""
|
||||
with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model
|
||||
weights.
|
||||
"""
|
||||
"""
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'encoder\.embed_tokens\.weight',
|
||||
r'decoder\.embed_tokens\.weight',
|
||||
]
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
|
||||
]
|
||||
|
||||
T5_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
def __init__(self, config: T5Config):
|
||||
super().__init__(config)
|
||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||
|
||||
encoder_config = copy.deepcopy(config)
|
||||
encoder_config.is_decoder = False
|
||||
encoder_config.use_cache = False
|
||||
encoder_config.is_encoder_decoder = False
|
||||
self.encoder = T5Stack(encoder_config, self.shared)
|
||||
|
||||
decoder_config = copy.deepcopy(config)
|
||||
decoder_config.is_decoder = True
|
||||
decoder_config.is_encoder_decoder = False
|
||||
decoder_config.num_layers = config.num_decoder_layers
|
||||
self.decoder = T5Stack(decoder_config, self.shared)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
# Model parallel
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
|
||||
def parallelize(self, device_map=None):
|
||||
self.device_map = (
|
||||
get_device_map(
|
||||
len(self.encoder.block), range(torch.cuda.device_count()))
|
||||
if device_map is None else device_map)
|
||||
assert_device_map(self.device_map, len(self.encoder.block))
|
||||
self.encoder.parallelize(self.device_map)
|
||||
self.decoder.parallelize(self.device_map)
|
||||
self.model_parallel = True
|
||||
|
||||
def deparallelize(self):
|
||||
self.encoder.deparallelize()
|
||||
self.decoder.deparallelize()
|
||||
self.encoder = self.encoder.to('cpu')
|
||||
self.decoder = self.decoder.to('cpu')
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared = new_embeddings
|
||||
self.encoder.set_input_embeddings(new_embeddings)
|
||||
self.decoder.set_input_embeddings(new_embeddings)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.decoder
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of
|
||||
heads to prune in this layer} See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
decoder_input_ids: Optional[torch.LongTensor] = None,
|
||||
decoder_attention_mask: Optional[torch.BoolTensor] = None,
|
||||
head_mask: Optional[torch.FloatTensor] = None,
|
||||
decoder_head_mask: Optional[torch.FloatTensor] = None,
|
||||
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
||||
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
||||
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
decoder_inputs_embeds: Optional[torch.Tensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
|
||||
r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. T5 is a model
|
||||
with relative position embeddings so you should be able to pad the
|
||||
@@ -1343,166 +1437,6 @@ T5_INPUTS_DOCSTRING = r"""
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain
|
||||
tuple.
|
||||
"""
|
||||
|
||||
T5_ENCODER_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. T5 is a model
|
||||
with relative position embeddings so you should be able to pad the
|
||||
inputs on both the right and the left.
|
||||
|
||||
Indices can be obtained using [`T5Tokenizer`]. See
|
||||
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
|
||||
for detail.
|
||||
|
||||
To know more on how to prepare `input_ids` for pretraining take a
|
||||
look a [T5 Training](./t5#training).
|
||||
attention_mask (`torch.FloatTensor` of shape `(batch_size,
|
||||
sequence_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask
|
||||
values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
|
||||
num_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask
|
||||
values selected in `[0, 1]`:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
|
||||
sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to
|
||||
directly pass an embedded representation. This is useful if you want
|
||||
more control over how to convert `input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention
|
||||
layers. See `attentions` under returned tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See
|
||||
`hidden_states` under returned tensors for more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain
|
||||
tuple.
|
||||
"""
|
||||
|
||||
# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
|
||||
__HEAD_MASK_WARNING_MSG = """
|
||||
The input argument `head_mask` was split into two arguments `head_mask` and
|
||||
`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
|
||||
but this feature is deprecated and will be removed in future versions. If you do
|
||||
not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
|
||||
torch.ones(num_layers, num_heads)`.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
'The bare T5 Model transformer outputting raw hidden-states without any specific head on top.',
|
||||
T5_START_DOCSTRING,
|
||||
)
|
||||
class T5Model(T5PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'encoder\.embed_tokens\.weight',
|
||||
r'decoder\.embed_tokens\.weight',
|
||||
]
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
|
||||
]
|
||||
|
||||
def __init__(self, config: T5Config):
|
||||
super().__init__(config)
|
||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||
|
||||
encoder_config = copy.deepcopy(config)
|
||||
encoder_config.is_decoder = False
|
||||
encoder_config.use_cache = False
|
||||
encoder_config.is_encoder_decoder = False
|
||||
self.encoder = T5Stack(encoder_config, self.shared)
|
||||
|
||||
decoder_config = copy.deepcopy(config)
|
||||
decoder_config.is_decoder = True
|
||||
decoder_config.is_encoder_decoder = False
|
||||
decoder_config.num_layers = config.num_decoder_layers
|
||||
self.decoder = T5Stack(decoder_config, self.shared)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
# Model parallel
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
|
||||
@add_start_docstrings(PARALLELIZE_DOCSTRING)
|
||||
def parallelize(self, device_map=None):
|
||||
self.device_map = (
|
||||
get_device_map(
|
||||
len(self.encoder.block), range(torch.cuda.device_count()))
|
||||
if device_map is None else device_map)
|
||||
assert_device_map(self.device_map, len(self.encoder.block))
|
||||
self.encoder.parallelize(self.device_map)
|
||||
self.decoder.parallelize(self.device_map)
|
||||
self.model_parallel = True
|
||||
|
||||
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
|
||||
def deparallelize(self):
|
||||
self.encoder.deparallelize()
|
||||
self.decoder.deparallelize()
|
||||
self.encoder = self.encoder.to('cpu')
|
||||
self.decoder = self.decoder.to('cpu')
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared = new_embeddings
|
||||
self.encoder.set_input_embeddings(new_embeddings)
|
||||
self.decoder.set_input_embeddings(new_embeddings)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.decoder
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of
|
||||
heads to prune in this layer} See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
@add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(
|
||||
output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
decoder_input_ids: Optional[torch.LongTensor] = None,
|
||||
decoder_attention_mask: Optional[torch.BoolTensor] = None,
|
||||
head_mask: Optional[torch.FloatTensor] = None,
|
||||
decoder_head_mask: Optional[torch.FloatTensor] = None,
|
||||
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
||||
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
||||
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
decoder_inputs_embeds: Optional[torch.Tensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
|
||||
r"""
|
||||
Returns:
|
||||
|
||||
Example:
|
||||
@@ -1595,409 +1529,3 @@ class T5Model(T5PreTrainedModel):
|
||||
encoder_hidden_states=encoder_outputs.hidden_states,
|
||||
encoder_attentions=encoder_outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@add_start_docstrings("""T5 Model with a `language modeling` head on top.""",
|
||||
T5_START_DOCSTRING)
|
||||
class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'encoder\.embed_tokens\.weight',
|
||||
r'decoder\.embed_tokens\.weight',
|
||||
r'lm_head\.weight',
|
||||
]
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
|
||||
]
|
||||
|
||||
def __init__(self, config: T5Config):
|
||||
super().__init__(config)
|
||||
self.model_dim = config.d_model
|
||||
|
||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||
|
||||
encoder_config = copy.deepcopy(config)
|
||||
encoder_config.is_decoder = False
|
||||
encoder_config.use_cache = False
|
||||
encoder_config.is_encoder_decoder = False
|
||||
self.encoder = T5Stack(encoder_config, self.shared)
|
||||
|
||||
decoder_config = copy.deepcopy(config)
|
||||
decoder_config.is_decoder = True
|
||||
decoder_config.is_encoder_decoder = False
|
||||
decoder_config.num_layers = config.num_decoder_layers
|
||||
self.decoder = T5Stack(decoder_config, self.shared)
|
||||
|
||||
self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
# Model parallel
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
|
||||
@add_start_docstrings(PARALLELIZE_DOCSTRING)
|
||||
def parallelize(self, device_map=None):
|
||||
self.device_map = (
|
||||
get_device_map(
|
||||
len(self.encoder.block), range(torch.cuda.device_count()))
|
||||
if device_map is None else device_map)
|
||||
assert_device_map(self.device_map, len(self.encoder.block))
|
||||
self.encoder.parallelize(self.device_map)
|
||||
self.decoder.parallelize(self.device_map)
|
||||
self.lm_head = self.lm_head.to(self.decoder.first_device)
|
||||
self.model_parallel = True
|
||||
|
||||
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
|
||||
def deparallelize(self):
|
||||
self.encoder.deparallelize()
|
||||
self.decoder.deparallelize()
|
||||
self.encoder = self.encoder.to('cpu')
|
||||
self.decoder = self.decoder.to('cpu')
|
||||
self.lm_head = self.lm_head.to('cpu')
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared = new_embeddings
|
||||
self.encoder.set_input_embeddings(new_embeddings)
|
||||
self.decoder.set_input_embeddings(new_embeddings)
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.lm_head = new_embeddings
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.lm_head
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def get_decoder(self):
|
||||
return self.decoder
|
||||
|
||||
@add_start_docstrings_to_model_forward(T5_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(
|
||||
output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
decoder_input_ids: Optional[torch.LongTensor] = None,
|
||||
decoder_attention_mask: Optional[torch.BoolTensor] = None,
|
||||
head_mask: Optional[torch.FloatTensor] = None,
|
||||
decoder_head_mask: Optional[torch.FloatTensor] = None,
|
||||
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
||||
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
|
||||
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
|
||||
labels set to `-100` are ignored (masked), the loss is only computed
|
||||
for labels in `[0, ..., config.vocab_size]`
|
||||
|
||||
Returns:
|
||||
|
||||
Examples:
|
||||
|
||||
```python >>> from transformers import T5Tokenizer,
|
||||
T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
|
||||
>>> # training
|
||||
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
|
||||
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
|
||||
>>> outputs = model(input_ids=input_ids, labels=labels)
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
|
||||
>>> # inference
|
||||
>>> input_ids = tokenizer(
|
||||
... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
|
||||
>>> ).input_ids # Batch size 1
|
||||
>>> outputs = model.generate(input_ids)
|
||||
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
>>> # studies have shown that owning a dog is good for you.
|
||||
```"""
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
|
||||
if head_mask is not None and decoder_head_mask is None:
|
||||
if self.config.num_layers == self.config.num_decoder_layers:
|
||||
warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
|
||||
decoder_head_mask = head_mask
|
||||
|
||||
# Encode if needed (training, first prediction pass)
|
||||
if encoder_outputs is None:
|
||||
# Convert encoder inputs in embeddings if needed
|
||||
encoder_outputs = self.encoder(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
encoder_outputs = BaseModelOutput(
|
||||
last_hidden_state=encoder_outputs[0],
|
||||
hidden_states=encoder_outputs[1]
|
||||
if len(encoder_outputs) > 1 else None,
|
||||
attentions=encoder_outputs[2]
|
||||
if len(encoder_outputs) > 2 else None,
|
||||
)
|
||||
|
||||
hidden_states = encoder_outputs[0]
|
||||
|
||||
if self.model_parallel:
|
||||
torch.cuda.set_device(self.decoder.first_device)
|
||||
|
||||
if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
|
||||
# get decoder inputs from shifting lm labels to the right
|
||||
decoder_input_ids = self._shift_right(labels)
|
||||
|
||||
# Set device for model parallelism
|
||||
if self.model_parallel:
|
||||
torch.cuda.set_device(self.decoder.first_device)
|
||||
hidden_states = hidden_states.to(self.decoder.first_device)
|
||||
if decoder_input_ids is not None:
|
||||
decoder_input_ids = decoder_input_ids.to(
|
||||
self.decoder.first_device)
|
||||
if attention_mask is not None:
|
||||
attention_mask = attention_mask.to(self.decoder.first_device)
|
||||
if decoder_attention_mask is not None:
|
||||
decoder_attention_mask = decoder_attention_mask.to(
|
||||
self.decoder.first_device)
|
||||
|
||||
# Decode
|
||||
decoder_outputs = self.decoder(
|
||||
input_ids=decoder_input_ids,
|
||||
attention_mask=decoder_attention_mask,
|
||||
inputs_embeds=decoder_inputs_embeds,
|
||||
past_key_values=past_key_values,
|
||||
encoder_hidden_states=hidden_states,
|
||||
encoder_attention_mask=attention_mask,
|
||||
head_mask=decoder_head_mask,
|
||||
cross_attn_head_mask=cross_attn_head_mask,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = decoder_outputs[0]
|
||||
|
||||
# Set device for model parallelism
|
||||
if self.model_parallel:
|
||||
torch.cuda.set_device(self.encoder.first_device)
|
||||
self.lm_head = self.lm_head.to(self.encoder.first_device)
|
||||
sequence_output = sequence_output.to(self.lm_head.weight.device)
|
||||
|
||||
if self.config.tie_word_embeddings:
|
||||
# Rescale output before projecting on vocab See
|
||||
# https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
|
||||
sequence_output = sequence_output * (self.model_dim**-0.5)
|
||||
|
||||
lm_logits = self.lm_head(sequence_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-100)
|
||||
loss = loss_fct(
|
||||
lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
|
||||
# TODO(thom): Add z_loss
|
||||
# https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
|
||||
|
||||
if not return_dict:
|
||||
output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return Seq2SeqLMOutput(
|
||||
loss=loss,
|
||||
logits=lm_logits,
|
||||
past_key_values=decoder_outputs.past_key_values,
|
||||
decoder_hidden_states=decoder_outputs.hidden_states,
|
||||
decoder_attentions=decoder_outputs.attentions,
|
||||
cross_attentions=decoder_outputs.cross_attentions,
|
||||
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
|
||||
encoder_hidden_states=encoder_outputs.hidden_states,
|
||||
encoder_attentions=encoder_outputs.attentions,
|
||||
)
|
||||
|
||||
def prepare_inputs_for_generation(self,
|
||||
input_ids,
|
||||
past=None,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
decoder_head_mask=None,
|
||||
cross_attn_head_mask=None,
|
||||
use_cache=None,
|
||||
encoder_outputs=None,
|
||||
**kwargs):
|
||||
|
||||
# cut decoder_input_ids if past is used
|
||||
if past is not None:
|
||||
input_ids = input_ids[:, -1:]
|
||||
|
||||
return {
|
||||
'decoder_input_ids': input_ids,
|
||||
'past_key_values': past,
|
||||
'encoder_outputs': encoder_outputs,
|
||||
'attention_mask': attention_mask,
|
||||
'head_mask': head_mask,
|
||||
'decoder_head_mask': decoder_head_mask,
|
||||
'cross_attn_head_mask': cross_attn_head_mask,
|
||||
'use_cache': use_cache,
|
||||
}
|
||||
|
||||
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
|
||||
return self._shift_right(labels)
|
||||
|
||||
def _reorder_cache(self, past, beam_idx):
|
||||
# if decoder past is not included in output
|
||||
# speedy decoding is disabled and no need to reorder
|
||||
if past is None:
|
||||
logger.warning(
|
||||
'You might want to consider setting `use_cache=True` to speed up decoding'
|
||||
)
|
||||
return past
|
||||
|
||||
reordered_decoder_past = ()
|
||||
for layer_past_states in past:
|
||||
# get the correct batch idx from layer past batch dim
|
||||
# batch dim of `past` is at 2nd position
|
||||
reordered_layer_past_states = ()
|
||||
for layer_past_state in layer_past_states:
|
||||
# need to set correct `past` for each of the four key / value states
|
||||
reordered_layer_past_states = reordered_layer_past_states + (
|
||||
layer_past_state.index_select(
|
||||
0, beam_idx.to(layer_past_state.device)), )
|
||||
|
||||
assert reordered_layer_past_states[0].shape == layer_past_states[
|
||||
0].shape
|
||||
assert len(reordered_layer_past_states) == len(layer_past_states)
|
||||
|
||||
reordered_decoder_past = reordered_decoder_past + (
|
||||
reordered_layer_past_states, )
|
||||
return reordered_decoder_past
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare T5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
|
||||
T5_START_DOCSTRING,
|
||||
)
|
||||
class T5EncoderModel(T5PreTrainedModel):
|
||||
authorized_missing_keys = [
|
||||
r'encoder\.embed_tokens\.weight',
|
||||
]
|
||||
|
||||
def __init__(self, config: T5Config):
|
||||
super().__init__(config)
|
||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||
|
||||
encoder_config = copy.deepcopy(config)
|
||||
encoder_config.use_cache = False
|
||||
encoder_config.is_encoder_decoder = False
|
||||
self.encoder = T5Stack(encoder_config, self.shared)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
# Model parallel
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
|
||||
@add_start_docstrings(PARALLELIZE_DOCSTRING)
|
||||
def parallelize(self, device_map=None):
|
||||
self.device_map = (
|
||||
get_device_map(
|
||||
len(self.encoder.block), range(torch.cuda.device_count()))
|
||||
if device_map is None else device_map)
|
||||
assert_device_map(self.device_map, len(self.encoder.block))
|
||||
self.encoder.parallelize(self.device_map)
|
||||
self.model_parallel = True
|
||||
|
||||
@add_start_docstrings(DEPARALLELIZE_DOCSTRING)
|
||||
def deparallelize(self):
|
||||
self.encoder.deparallelize()
|
||||
self.encoder = self.encoder.to('cpu')
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared = new_embeddings
|
||||
self.encoder.set_input_embeddings(new_embeddings)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of
|
||||
heads to prune in this layer} See base class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
@add_start_docstrings_to_model_forward(T5_ENCODER_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(
|
||||
output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.FloatTensor] = None,
|
||||
head_mask: Optional[torch.FloatTensor] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
|
||||
r"""
|
||||
Returns:
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import T5Tokenizer, T5EncoderModel
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5EncoderModel.from_pretrained("t5-small")
|
||||
>>> input_ids = tokenizer(
|
||||
... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
|
||||
>>> ).input_ids # Batch size 1
|
||||
>>> outputs = model(input_ids=input_ids)
|
||||
>>> last_hidden_states = outputs.last_hidden_state
|
||||
```"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
encoder_outputs = self.encoder(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
return encoder_outputs
|
||||
@@ -1,3 +1,4 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Copyright 2020, The T5 Authors and HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -1,56 +0,0 @@
from typing import Optional, Tuple

import torch

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks
from .modeling_t5 import T5Config
from .modeling_t5 import T5ForConditionalGeneration as T5ForGeneration


@MODELS.register_module(
    group_key=Tasks.text2text_generation,
    module_name=Models.T5,
)
class T5ForConditionalGeneration(TorchModel):

    def __init__(self, model_dir=None, *args, **kwargs):
        """Initialize the text generation model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
            model_cls (Optional[Any], optional): model loader, if None, use the
                default loader to load model weights, by default None.
        """
        super().__init__(model_dir, *args, **kwargs)
        self.model = T5ForGeneration.from_pretrained(model_dir)
        self.generate = self.model.generate
        self.config = self.model.config

    def forward(self,
                input_ids: Optional[torch.LongTensor] = None,
                attention_mask: Optional[torch.FloatTensor] = None,
                decoder_input_ids: Optional[torch.LongTensor] = None,
                decoder_attention_mask: Optional[torch.BoolTensor] = None,
                head_mask: Optional[torch.FloatTensor] = None,
                decoder_head_mask: Optional[torch.FloatTensor] = None,
                cross_attn_head_mask: Optional[torch.Tensor] = None,
                encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
                past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                use_cache: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                **kwargs):
        # self.model.forward is a bound method, so the wrapped module's `self`
        # must not be passed again as the first positional argument.
        return self.model.forward(
            input_ids, attention_mask, decoder_input_ids,
            decoder_attention_mask, head_mask, decoder_head_mask,
            cross_attn_head_mask, encoder_outputs, past_key_values,
            inputs_embeds, decoder_inputs_embeds, labels, use_cache,
            output_attentions, output_hidden_states, return_dict, **kwargs)
modelscope/models/nlp/T5/text2text_generation.py (new file, 455 lines added)
@@ -0,0 +1,455 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import warnings
from typing import Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.utils.model_parallel_utils import (assert_device_map,
                                                      get_device_map)

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.outputs import BaseModelOutput, Seq2SeqLMOutput
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from .backbone import T5PreTrainedModel, T5Stack
from .configuration import T5Config

logger = get_logger(__name__)

# Warning message for FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and
`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`,
but this feature is deprecated and will be removed in future versions. If you do
not want to use any `decoder_head_mask` now, please set `decoder_head_mask =
torch.ones(num_layers, num_heads)`.
"""


@MODELS.register_module(
    group_key=Tasks.text2text_generation,
    module_name=Models.T5,
)
class T5ForConditionalGeneration(T5PreTrainedModel):
    _keys_to_ignore_on_load_missing = [
        r'encoder\.embed_tokens\.weight',
        r'decoder\.embed_tokens\.weight',
        r'lm_head\.weight',
    ]
    _keys_to_ignore_on_load_unexpected = [
        r'decoder\.block\.0\.layer\.1\.EncDecAttention\.relative_attention_bias\.weight',
    ]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    def parallelize(self, device_map=None):
        self.device_map = (
            get_device_map(
                len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None else device_map)
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    def deparallelize(self):
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to('cpu')
        self.decoder = self.decoder.to('cpu')
        self.lm_head = self.lm_head.to('cpu')
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()
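`parallelize` shards the encoder/decoder blocks across the visible GPUs, either automatically or from an explicit `device_map`. A minimal sketch, assuming `model` is an instance of the class above and two GPUs are available; the block indices are illustrative and must match the checkpoint's `num_layers`:

```python
# Illustrative device_map for a 6-block encoder/decoder (e.g. t5-small).
device_map = {
    0: [0, 1, 2],   # blocks placed on cuda:0
    1: [3, 4, 5],   # blocks placed on cuda:1
}
model.parallelize(device_map)   # or model.parallelize() for an even automatic split
# ... run forward passes / generation ...
model.deparallelize()           # move everything back to CPU and clear the CUDA cache
```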
    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def forward(self,
                input_ids: Optional[torch.LongTensor] = None,
                attention_mask: Optional[torch.FloatTensor] = None,
                decoder_input_ids: Optional[torch.LongTensor] = None,
                decoder_attention_mask: Optional[torch.BoolTensor] = None,
                head_mask: Optional[torch.FloatTensor] = None,
                decoder_head_mask: Optional[torch.FloatTensor] = None,
                cross_attn_head_mask: Optional[torch.Tensor] = None,
                encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
                past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                use_cache: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                **kwargs) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. T5 is a model
|
||||
with relative position embeddings so you should be able to pad the
|
||||
inputs on both the right and the left.
|
||||
|
||||
Indices can be obtained using [`T5Tokenizer`]. See
|
||||
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
|
||||
for detail.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
|
||||
To know more on how to prepare `input_ids` for pretraining take a
|
||||
look a [T5 Training](./t5#training).
|
||||
attention_mask (`torch.FloatTensor` of shape `(batch_size,
|
||||
sequence_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask
|
||||
values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
decoder_input_ids (`torch.LongTensor` of shape `(batch_size,
|
||||
target_sequence_length)`, *optional*):
|
||||
Indices of decoder input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using [`T5Tokenizer`]. See
|
||||
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
|
||||
for details.
|
||||
|
||||
[What are decoder input IDs?](../glossary#decoder-input-ids)
|
||||
|
||||
T5 uses the `pad_token_id` as the starting token for
|
||||
`decoder_input_ids` generation. If `past_key_values` is used,
|
||||
optionally only the last `decoder_input_ids` have to be input (see
|
||||
`past_key_values`).
|
||||
|
||||
To know more on how to prepare `decoder_input_ids` for pretraining
|
||||
take a look at [T5 Training](./t5#training).
|
||||
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,
|
||||
target_sequence_length)`, *optional*):
|
||||
Default behavior: generate a tensor that ignores pad tokens in
|
||||
`decoder_input_ids`. Causal mask will also be used by default.
|
||||
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
|
||||
num_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the self-attention modules in the
|
||||
encoder. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or
|
||||
`(num_layers, num_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the self-attention modules in the
|
||||
decoder. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or
|
||||
`(num_layers, num_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the cross-attention modules in
|
||||
the decoder. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
|
||||
Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*,
|
||||
`optional`: *attentions*) `last_hidden_state` of shape `(batch_size,
|
||||
sequence_length, hidden_size)` is a sequence of hidden states at the
|
||||
output of the last layer of the encoder. Used in the cross-attention
|
||||
of the decoder.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))` of length
|
||||
`config.n_layers` with each tuple having 4 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
||||
Contains precomputed key and value hidden states of the attention
|
||||
blocks. Can be used to speed up decoding.
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only
|
||||
the last `decoder_input_ids` (those that don't have their past key
|
||||
value states given to this model) of shape `(batch_size, 1)` instead
|
||||
of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
|
||||
sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to
|
||||
directly pass an embedded representation. This is useful if you want
|
||||
more control over how to convert `input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
|
||||
target_sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `decoder_input_ids` you can choose to
|
||||
directly pass an embedded representation. If `past_key_values` is
|
||||
used, optionally only the last `decoder_inputs_embeds` have to be
|
||||
input (see `past_key_values`). This is useful if you want more
|
||||
control over how to convert `decoder_input_ids` indices into
|
||||
associated vectors than the model's internal embedding lookup
|
||||
matrix.
|
||||
|
||||
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset,
|
||||
`decoder_inputs_embeds` takes the value of `inputs_embeds`.
|
||||
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned
|
||||
and can be used to speed up decoding (see `past_key_values`).
|
||||
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention
|
||||
layers. See `attentions` under returned tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See
|
||||
`hidden_states` under returned tensors for more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain
|
||||
tuple.
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in `[-100, 0, ..., config.vocab_size - 1]`. All
|
||||
labels set to `-100` are ignored (masked), the loss is only computed
|
||||
for labels in `[0, ..., config.vocab_size]`
|
||||
|
||||
Returns:
|
||||
|
||||
Examples:
|
||||
|
||||
```python >>> from transformers import T5Tokenizer,
|
||||
T5ForConditionalGeneration
|
||||
|
||||
>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
|
||||
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
|
||||
|
||||
>>> # training
|
||||
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
|
||||
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
|
||||
>>> outputs = model(input_ids=input_ids, labels=labels)
|
||||
>>> loss = outputs.loss
|
||||
>>> logits = outputs.logits
|
||||
|
||||
>>> # inference
|
||||
>>> input_ids = tokenizer(
|
||||
... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
|
||||
>>> ).input_ids # Batch size 1
|
||||
>>> outputs = model.generate(input_ids)
|
||||
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
>>> # studies have shown that owning a dog is good for you.
|
||||
```"""
|
||||
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1]
                if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2]
                if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(
                    self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(
                    self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
            # TODO(thom): Add z_loss
            # https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

        if not return_dict:
            output = (lm_logits, ) + decoder_outputs[1:] + encoder_outputs
            return ((loss, ) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
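Since the loss above is built with `CrossEntropyLoss(ignore_index=-100)`, padded target positions can be excluded from training simply by setting them to `-100`. A minimal sketch, assuming the `tokenizer`, `model`, and `input_ids` from the docstring example above; the target text and padding length are illustrative:

```python
labels = tokenizer("Vielen Dank.", return_tensors="pt",
                   padding="max_length", max_length=8).input_ids
labels[labels == tokenizer.pad_token_id] = -100  # pad positions do not contribute to the loss
outputs = model(input_ids=input_ids, labels=labels)
print(outputs.loss)
```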
    def prepare_inputs_for_generation(self,
                                      input_ids,
                                      past=None,
                                      attention_mask=None,
                                      head_mask=None,
                                      decoder_head_mask=None,
                                      cross_attn_head_mask=None,
                                      use_cache=None,
                                      encoder_outputs=None,
                                      **kwargs):

        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]

        return {
            'decoder_input_ids': input_ids,
            'past_key_values': past,
            'encoder_outputs': encoder_outputs,
            'attention_mask': attention_mask,
            'head_mask': head_mask,
            'decoder_head_mask': decoder_head_mask,
            'cross_attn_head_mask': cross_attn_head_mask,
            'use_cache': use_cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)
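`prepare_decoder_input_ids_from_labels` delegates to `_shift_right`, inherited from `T5PreTrainedModel` in `backbone.py`: the labels are shifted one position to the right and the decoder start token (the pad id, `0`, for the standard T5 vocabularies) is prepended. A hedged standalone illustration; the in-repo implementation additionally replaces any `-100` entries with the pad id:

```python
import torch

def shift_right(labels: torch.Tensor, decoder_start_token_id: int = 0) -> torch.Tensor:
    shifted = labels.new_zeros(labels.shape)
    shifted[:, 1:] = labels[:, :-1].clone()   # drop the last position
    shifted[:, 0] = decoder_start_token_id    # prepend the decoder start token
    return shifted

labels = torch.tensor([[100, 200, 300, 1]])   # 1 = </s>
print(shift_right(labels))                    # tensor([[  0, 100, 200, 300]])
```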
    def _reorder_cache(self, past, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past is None:
            logger.warning(
                'You might want to consider setting `use_cache=True` to speed up decoding'
            )
            return past

        reordered_decoder_past = ()
        for layer_past_states in past:
            # get the correct batch idx from layer past batch dim
            # batch dim of `past` is at 2nd position
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set correct `past` for each of the four key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(
                        0, beam_idx.to(layer_past_state.device)), )

            assert reordered_layer_past_states[0].shape == layer_past_states[
                0].shape
            assert len(reordered_layer_past_states) == len(layer_past_states)

            reordered_decoder_past = reordered_decoder_past + (
                reordered_layer_past_states, )
        return reordered_decoder_past
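Because the class is registered under `Tasks.text2text_generation` / `Models.T5`, it can be built through the ModelScope registry rather than imported directly. A minimal sketch; the model id is a placeholder and must be replaced with an actual T5-style checkpoint from the hub:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# 'your-namespace/your-t5-model' is a hypothetical id, not a real checkpoint.
pipe = pipeline(Tasks.text2text_generation, model='your-namespace/your-t5-model')
print(pipe('translate English to German: The house is wonderful.'))
```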
@@ -4,80 +4,99 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .backbones import SbertModel
    from .bart_for_text_error_correction import BartForTextErrorCorrection
    from .bert_for_document_segmentation import BertForDocumentSegmentation
    from .csanmt_for_translation import CsanmtForTranslation
    from .bart import BartForTextErrorCorrection
    from .csanmt import CsanmtForTranslation
    from .heads import SequenceClassificationHead
    from .gpt3 import GPT3ForTextGeneration
    from .masked_language import (StructBertForMaskedLM, VecoForMaskedLM,
                                  BertForMaskedLM, DebertaV2ForMaskedLM)
    from .ponet_for_masked_language import PoNetForMaskedLM
    from .nncrf_for_named_entity_recognition import (
        TransformerCRFForNamedEntityRecognition,
        LSTMCRFForNamedEntityRecognition)
    from .palm_v2 import PalmForTextGeneration
    from .sbert_for_faq_question_answering import SbertForFaqQuestionAnswering
    from .star_text_to_sql import StarForTextToSql
    from .sequence_classification import (VecoForSequenceClassification,
                                          SbertForSequenceClassification,
                                          BertForSequenceClassification)
    from .space import SpaceForDialogIntent
    from .space import SpaceForDialogModeling
    from .space import SpaceForDialogStateTracking
    from .table_question_answering import TableQuestionAnswering
    from .task_models import (FeatureExtractionModel,
                              InformationExtractionModel,
                              SequenceClassificationModel,
                              SingleBackboneTaskModelBase,
                              TokenClassificationModel,
                              TaskModelForTextGeneration)
    from .token_classification import SbertForTokenClassification
    from .sentence_embedding import SentenceEmbedding
    from .text_ranking import TextRanking
    from .T5 import T5ForConditionalGeneration
    from .space_T_en import StarForTextToSql
    from .space_T_cn import TableQuestionAnswering
    from .space import SpaceForDialogIntent, SpaceForDialogModeling, SpaceForDST
    from .ponet import PoNetForMaskedLM, PoNetModel, PoNetConfig
    from .structbert import (
        SbertForFaqQuestionAnswering,
        SbertForMaskedLM,
        SbertForSequenceClassification,
        SbertForTokenClassification,
        SbertTokenizer,
        SbertTokenizerFast,
    )
    from .bert import (
        BertForMaskedLM,
        BertForTextRanking,
        BertForSentenceEmbedding,
        BertForSequenceClassification,
        BertForTokenClassification,
        BertForDocumentSegmentation,
        BertModel,
        BertConfig,
    )
    from .veco import VecoModel, VecoConfig, VecoForTokenClassification, \
        VecoForSequenceClassification, VecoForMaskedLM, VecoTokenizer, VecoTokenizerFast
    from .deberta_v2 import DebertaV2ForMaskedLM, DebertaV2Model
    from .task_models import (
        FeatureExtractionModel,
        InformationExtractionModel,
        LSTMCRFForNamedEntityRecognition,
        SequenceClassificationModel,
        SingleBackboneTaskModelBase,
        TaskModelForTextGeneration,
        TokenClassificationModel,
        TransformerCRFForNamedEntityRecognition,
    )

    from .T5 import T5ForConditionalGeneration
    from .gpt_neo import GPTNeoModel
else:
    _import_structure = {
        'backbones': ['SbertModel'],
        'bart_for_text_error_correction': ['BartForTextErrorCorrection'],
        'bert_for_document_segmentation': ['BertForDocumentSegmentation'],
        'csanmt_for_translation': ['CsanmtForTranslation'],
        'bart': ['BartForTextErrorCorrection'],
        'csanmt': ['CsanmtForTranslation'],
        'heads': ['SequenceClassificationHead'],
        'gpt3': ['GPT3ForTextGeneration'],
        'masked_language': [
            'StructBertForMaskedLM', 'VecoForMaskedLM', 'BertForMaskedLM',
            'DebertaV2ForMaskedLM'
        'structbert': [
            'SbertForFaqQuestionAnswering',
            'SbertForMaskedLM',
            'SbertForSequenceClassification',
            'SbertForTokenClassification',
            'SbertTokenizer',
            'SbertTokenizerFast',
        ],
        'nncrf_for_named_entity_recognition': [
            'TransformerCRFForNamedEntityRecognition',
            'LSTMCRFForNamedEntityRecognition'
        'veco': [
            'VecoModel', 'VecoConfig', 'VecoForTokenClassification',
            'VecoForSequenceClassification', 'VecoForMaskedLM',
            'VecoTokenizer', 'VecoTokenizerFast'
        ],
        'ponet_for_masked_language': ['PoNetForMaskedLM'],
        'bert': [
            'BertForMaskedLM',
            'BertForTextRanking',
            'BertForSentenceEmbedding',
            'BertForSequenceClassification',
            'BertForTokenClassification',
            'BertForDocumentSegmentation',
            'BertModel',
            'BertConfig',
        ],
        'ponet': ['PoNetForMaskedLM', 'PoNetModel', 'PoNetConfig'],
        'palm_v2': ['PalmForTextGeneration'],
        'sbert_for_faq_question_answering': ['SbertForFaqQuestionAnswering'],
        'star_text_to_sql': ['StarForTextToSql'],
        'sequence_classification': [
            'VecoForSequenceClassification', 'SbertForSequenceClassification',
            'BertForSequenceClassification'
        ],
        'space': [
            'SpaceForDialogIntent', 'SpaceForDialogModeling',
            'SpaceForDialogStateTracking'
        ],
        'deberta_v2': ['DebertaV2ForMaskedLM', 'DebertaV2Model'],
        'space_T_en': ['StarForTextToSql'],
        'space_T_cn': ['TableQuestionAnswering'],
        'space':
        ['SpaceForDialogIntent', 'SpaceForDialogModeling', 'SpaceForDST'],
        'task_models': [
            'FeatureExtractionModel',
            'InformationExtractionModel',
            'LSTMCRFForNamedEntityRecognition',
            'SequenceClassificationModel',
            'SingleBackboneTaskModelBase',
            'TokenClassificationModel',
            'TaskModelForTextGeneration',
            'TokenClassificationModel',
            'TransformerCRFForNamedEntityRecognition',
        ],
        'token_classification': ['SbertForTokenClassification'],
        'table_question_answering': ['TableQuestionAnswering'],
        'sentence_embedding': ['SentenceEmbedding'],
        'text_ranking': ['TextRanking'],
        'T5': ['T5ForConditionalGeneration'],
        'gpt_neo': ['GPTNeoModel'],
    }

    import sys
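With the reorganized `_import_structure`, user-facing imports stay unchanged; `LazyImportModule` only loads the heavy submodules when a name is first accessed. For example:

```python
# Resolved lazily through _import_structure; torch-heavy submodules load on first use.
from modelscope.models.nlp import T5ForConditionalGeneration
from modelscope.models.nlp import BertModel, BertConfig
```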
@@ -1,7 +0,0 @@
from modelscope.metainfo import Models
from modelscope.models.builder import BACKBONES
from modelscope.models.nlp.bert import BertModel
from modelscope.utils.constant import Fields

BACKBONES.register_module(
    group_key=Fields.nlp, module_name=Models.bert, module_cls=BertModel)
@@ -1,52 +0,0 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import BACKBONES
from modelscope.models.nlp.structbert import SbertConfig
from modelscope.models.nlp.structbert import SbertModel as SbertModelTransform
from modelscope.utils.constant import Fields
from modelscope.utils.logger import get_logger

logger = get_logger(__name__)


@BACKBONES.register_module(Fields.nlp, module_name=Models.structbert)
class SbertModel(TorchModel, SbertModelTransform):

    def __init__(self, model_dir=None, add_pooling_layer=True, **config):
        """
        Args:
            model_dir (str, optional): The model checkpoint directory. Defaults to None.
            add_pooling_layer (bool, optional): whether to pool the output of the hidden layer. Defaults to True.
        """
        config = SbertConfig(**config)
        super().__init__(model_dir)
        self.config = config
        SbertModelTransform.__init__(self, config, add_pooling_layer)

    def extract_sequence_outputs(self, outputs):
        return outputs['last_hidden_state']

    def extract_pooled_outputs(self, outputs):
        return outputs['pooler_output']

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
                past_key_values=None,
                use_cache=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
                **kwargs):
        return SbertModelTransform.forward(
            self, input_ids, attention_mask, token_type_ids, position_ids,
            head_mask, inputs_embeds, encoder_hidden_states,
            encoder_attention_mask, past_key_values, use_cache,
            output_attentions, output_hidden_states, return_dict, **kwargs)
modelscope/models/nlp/bart/__init__.py (new file, 2 lines added)
@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .text_error_correction import BartForTextErrorCorrection
@@ -4,43 +4,33 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .modeling_bert import (
        BertForMaskedLM,
        BertForMultipleChoice,
        BertForNextSentencePrediction,
        BertForPreTraining,
        BertForQuestionAnswering,
        BertForSequenceClassification,
        BertForTokenClassification,
    from .backbone import (
        BertLayer,
        BertLMHeadModel,
        BertModel,
        BertPreTrainedModel,
        load_tf_weights_in_bert,
    )

    from .configuration_bert import BertConfig, BertOnnxConfig

    from .configuration import BertConfig
    from .fill_mask import BertForMaskedLM
    from .text_ranking import BertForTextRanking
    from .sentence_embedding import BertForSentenceEmbedding
    from .text_classification import BertForSequenceClassification
    from .token_classification import BertForTokenClassification
    from .document_segmentation import BertForDocumentSegmentation
else:
    _import_structure = {
        'configuration_bert': ['BertConfig', 'BertOnnxConfig'],
        'backbone': [
            'BertModel',
            'BertPreTrainedModel',
        ],
        'configuration': ['BertConfig'],
        'fill_mask': ['BertForMaskedLM'],
        'text_ranking': ['BertForTextRanking'],
        'sentence_embedding': ['BertForSentenceEmbedding'],
        'text_classification': ['BertForSequenceClassification'],
        'token_classification': ['BertForTokenClassification'],
        'document_segmentation': ['BertForDocumentSegmentation'],
    }

    _import_structure['modeling_bert'] = [
        'BertForMaskedLM',
        'BertForMultipleChoice',
        'BertForNextSentencePrediction',
        'BertForPreTraining',
        'BertForQuestionAnswering',
        'BertForSequenceClassification',
        'BertForTokenClassification',
        'BertLayer',
        'BertLMHeadModel',
        'BertModel',
        'BertPreTrainedModel',
        'load_tf_weights_in_bert',
    ]

    import sys

    sys.modules[__name__] = LazyImportModule(
modelscope/models/nlp/bert/backbone.py (new executable file, 952 lines added)
@@ -0,0 +1,952 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch BERT model. """
|
||||
|
||||
import math
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from packaging import version
|
||||
from torch import nn
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.modeling_utils import (PreTrainedModel,
|
||||
apply_chunking_to_forward,
|
||||
find_pruneable_heads_and_indices,
|
||||
prune_linear_layer)
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import Model, TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import (BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions)
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.hub import parse_label_mapping
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .configuration import BertConfig
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
_CONFIG_FOR_DOC = 'BertConfig'
|
||||
|
||||
|
||||
class BertEmbeddings(nn.Module):
|
||||
"""Construct the embeddings from word, position and token_type embeddings."""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.word_embeddings = nn.Embedding(
|
||||
config.vocab_size,
|
||||
config.hidden_size,
|
||||
padding_idx=config.pad_token_id)
|
||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings,
|
||||
config.hidden_size)
|
||||
self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
|
||||
config.hidden_size)
|
||||
|
||||
# self.LayerNorm is not snake-cased to stick with TensorFlow model
|
||||
# variable name and be able to load any TensorFlow checkpoint file
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
# position_ids (1, len position emb) is contiguous in memory and
|
||||
# exported when serialized
|
||||
self.position_embedding_type = getattr(config,
|
||||
'position_embedding_type',
|
||||
'absolute')
|
||||
self.register_buffer(
|
||||
'position_ids',
|
||||
torch.arange(config.max_position_embeddings).expand((1, -1)))
|
||||
if version.parse(torch.__version__) > version.parse('1.6.0'):
|
||||
self.register_buffer(
|
||||
'token_type_ids',
|
||||
torch.zeros(self.position_ids.size(), dtype=torch.long),
|
||||
persistent=False,
|
||||
)
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
inputs_embeds=None,
|
||||
past_key_values_length=0):
|
||||
if input_ids is not None:
|
||||
input_shape = input_ids.size()
|
||||
else:
|
||||
input_shape = inputs_embeds.size()[:-1]
|
||||
|
||||
seq_length = input_shape[1]
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = self.position_ids[:,
|
||||
past_key_values_length:seq_length
|
||||
+ past_key_values_length]
|
||||
|
||||
# Setting the token_type_ids to the registered buffer in constructor
|
||||
# where it is all zeros, which usually occurs when its auto-generated,
|
||||
# registered buffer helps users when tracing the model without passing
|
||||
# token_type_ids, solves issue #5664
|
||||
if token_type_ids is None:
|
||||
if hasattr(self, 'token_type_ids'):
|
||||
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
|
||||
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
|
||||
input_shape[0], seq_length)
|
||||
token_type_ids = buffered_token_type_ids_expanded
|
||||
else:
|
||||
token_type_ids = torch.zeros(
|
||||
input_shape,
|
||||
dtype=torch.long,
|
||||
device=self.position_ids.device)
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.word_embeddings(input_ids)
|
||||
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
||||
|
||||
embeddings = inputs_embeds + token_type_embeddings
|
||||
if self.position_embedding_type == 'absolute':
|
||||
position_embeddings = self.position_embeddings(position_ids)
|
||||
embeddings += position_embeddings
|
||||
embeddings = self.LayerNorm(embeddings)
|
||||
embeddings = self.dropout(embeddings)
|
||||
return embeddings
|
||||
|
||||
|
||||
class BertSelfAttention(nn.Module):
|
||||
|
||||
def __init__(self, config, position_embedding_type=None):
|
||||
super().__init__()
|
||||
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
|
||||
config, 'embedding_size'):
|
||||
raise ValueError(
|
||||
f'The hidden size ({config.hidden_size}) is not a multiple of the number of attention '
|
||||
f'heads ({config.num_attention_heads})')
|
||||
|
||||
self.num_attention_heads = config.num_attention_heads
|
||||
self.attention_head_size = int(config.hidden_size
|
||||
/ config.num_attention_heads)
|
||||
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
||||
|
||||
self.query = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
self.key = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
|
||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||
self.position_embedding_type = position_embedding_type or getattr(
|
||||
config, 'position_embedding_type', 'absolute')
|
||||
if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
|
||||
self.max_position_embeddings = config.max_position_embeddings
|
||||
self.distance_embedding = nn.Embedding(
|
||||
2 * config.max_position_embeddings - 1,
|
||||
self.attention_head_size)
|
||||
|
||||
self.is_decoder = config.is_decoder
|
||||
|
||||
def transpose_for_scores(self, x):
|
||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads,
|
||||
self.attention_head_size)
|
||||
x = x.view(*new_x_shape)
|
||||
return x.permute(0, 2, 1, 3)
|
||||
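`transpose_for_scores` splits the projected hidden states into per-head slices, going from `(batch, seq, hidden)` to `(batch, num_heads, seq, head_size)` so that each head attends independently. A standalone sketch of the reshape; the sizes are illustrative:

```python
import torch

batch, seq, num_heads, head_size = 2, 5, 12, 64
hidden = num_heads * head_size                 # 768 for bert-base

x = torch.randn(batch, seq, hidden)
x = x.view(batch, seq, num_heads, head_size)   # split the hidden dim into heads
x = x.permute(0, 2, 1, 3)                      # -> (batch, num_heads, seq, head_size)
assert x.shape == (2, 12, 5, 64)
```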
|
||||
def forward(
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
past_key_value=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
mixed_query_layer = self.query(hidden_states)
|
||||
|
||||
# If this is instantiated as a cross-attention module, the keys
|
||||
# and values come from an encoder; the attention mask needs to be
|
||||
# such that the encoder's padding tokens are not attended to.
|
||||
is_cross_attention = encoder_hidden_states is not None
|
||||
|
||||
if is_cross_attention and past_key_value is not None:
|
||||
# reuse k,v, cross_attentions
|
||||
key_layer = past_key_value[0]
|
||||
value_layer = past_key_value[1]
|
||||
attention_mask = encoder_attention_mask
|
||||
elif is_cross_attention:
|
||||
key_layer = self.transpose_for_scores(
|
||||
self.key(encoder_hidden_states))
|
||||
value_layer = self.transpose_for_scores(
|
||||
self.value(encoder_hidden_states))
|
||||
attention_mask = encoder_attention_mask
|
||||
elif past_key_value is not None:
|
||||
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
||||
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
||||
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
|
||||
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
|
||||
else:
|
||||
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
||||
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
||||
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer)
|
||||
|
||||
if self.is_decoder:
|
||||
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all
|
||||
# cross attention key/value_states. Further calls to cross_attention
|
||||
# layer can then reuse all cross-attention key/value_states (first
|
||||
# "if" case) if uni-directional self-attention (decoder) save
|
||||
# Tuple(torch.Tensor, torch.Tensor) of all previous decoder
|
||||
# key/value_states. Further calls to uni-directional self-attention
|
||||
# can concat previous decoder key/value_states to current projected
|
||||
# key/value_states (third "elif" case) if encoder bi-directional
|
||||
# self-attention `past_key_value` is always `None`
|
||||
past_key_value = (key_layer, value_layer)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
attention_scores = torch.matmul(query_layer,
|
||||
key_layer.transpose(-1, -2))
|
||||
|
||||
if self.position_embedding_type == 'relative_key' or self.position_embedding_type == 'relative_key_query':
|
||||
seq_length = hidden_states.size()[1]
|
||||
position_ids_l = torch.arange(
|
||||
seq_length, dtype=torch.long,
|
||||
device=hidden_states.device).view(-1, 1)
|
||||
position_ids_r = torch.arange(
|
||||
seq_length, dtype=torch.long,
|
||||
device=hidden_states.device).view(1, -1)
|
||||
distance = position_ids_l - position_ids_r
|
||||
positional_embedding = self.distance_embedding(
|
||||
distance + self.max_position_embeddings - 1)
|
||||
positional_embedding = positional_embedding.to(
|
||||
dtype=query_layer.dtype) # fp16 compatibility
|
||||
|
||||
if self.position_embedding_type == 'relative_key':
|
||||
relative_position_scores = torch.einsum(
|
||||
'bhld,lrd->bhlr', query_layer, positional_embedding)
|
||||
attention_scores = attention_scores + relative_position_scores
|
||||
elif self.position_embedding_type == 'relative_key_query':
|
||||
relative_position_scores_query = torch.einsum(
|
||||
'bhld,lrd->bhlr', query_layer, positional_embedding)
|
||||
relative_position_scores_key = torch.einsum(
|
||||
'bhrd,lrd->bhlr', key_layer, positional_embedding)
|
||||
attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
|
||||
|
||||
attention_scores = attention_scores / math.sqrt(
|
||||
self.attention_head_size)
|
||||
if attention_mask is not None:
|
||||
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
||||
attention_scores = attention_scores + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = self.dropout(attention_probs)
|
||||
|
||||
# Mask heads if we want to
|
||||
if head_mask is not None:
|
||||
attention_probs = attention_probs * head_mask
|
||||
|
||||
context_layer = torch.matmul(attention_probs, value_layer)
|
||||
|
||||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
||||
new_context_layer_shape = context_layer.size()[:-2] + (
|
||||
self.all_head_size, )
|
||||
context_layer = context_layer.view(*new_context_layer_shape)
|
||||
|
||||
outputs = (context_layer,
|
||||
attention_probs) if output_attentions else (context_layer, )
|
||||
|
||||
if self.is_decoder:
|
||||
outputs = outputs + (past_key_value, )
|
||||
return outputs
|
||||
|
||||
|
||||
class BertSelfOutput(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def forward(self, hidden_states, input_tensor):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BertAttention(nn.Module):
|
||||
|
||||
def __init__(self, config, position_embedding_type=None):
|
||||
super().__init__()
|
||||
self.self = BertSelfAttention(
|
||||
config, position_embedding_type=position_embedding_type)
|
||||
self.output = BertSelfOutput(config)
|
||||
self.pruned_heads = set()
|
||||
|
||||
def prune_heads(self, heads):
|
||||
if len(heads) == 0:
|
||||
return
|
||||
heads, index = find_pruneable_heads_and_indices(
|
||||
heads, self.self.num_attention_heads,
|
||||
self.self.attention_head_size, self.pruned_heads)
|
||||
|
||||
# Prune linear layers
|
||||
self.self.query = prune_linear_layer(self.self.query, index)
|
||||
self.self.key = prune_linear_layer(self.self.key, index)
|
||||
self.self.value = prune_linear_layer(self.self.value, index)
|
||||
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
||||
|
||||
# Update hyper params and store pruned heads
|
||||
self.self.num_attention_heads = self.self.num_attention_heads - len(
|
||||
heads)
|
||||
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
|
||||
self.pruned_heads = self.pruned_heads.union(heads)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
past_key_value=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
self_outputs = self.self(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
attention_output = self.output(self_outputs[0], hidden_states)
|
||||
outputs = (attention_output,
|
||||
) + self_outputs[1:] # add attentions if we output them
|
||||
return outputs
|
||||
|
||||
|
||||
class BertIntermediate(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.intermediate_act_fn = config.hidden_act
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.intermediate_act_fn(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BertOutput(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def forward(self, hidden_states, input_tensor):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BertLayer(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
||||
self.seq_len_dim = 1
|
||||
self.attention = BertAttention(config)
|
||||
self.is_decoder = config.is_decoder
|
||||
self.add_cross_attention = config.add_cross_attention
|
||||
if self.add_cross_attention:
|
||||
if not self.is_decoder:
|
||||
raise ValueError(
|
||||
f'{self} should be used as a decoder model if cross attention is added'
|
||||
)
|
||||
self.crossattention = BertAttention(
|
||||
config, position_embedding_type='absolute')
|
||||
self.intermediate = BertIntermediate(config)
|
||||
self.output = BertOutput(config)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
past_key_value=None,
|
||||
output_attentions=False,
|
||||
):
|
||||
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
|
||||
self_attn_past_key_value = past_key_value[:
|
||||
2] if past_key_value is not None else None
|
||||
self_attention_outputs = self.attention(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
head_mask,
|
||||
output_attentions=output_attentions,
|
||||
past_key_value=self_attn_past_key_value,
|
||||
)
|
||||
attention_output = self_attention_outputs[0]
|
||||
|
||||
# if decoder, the last output is tuple of self-attn cache
|
||||
if self.is_decoder:
|
||||
outputs = self_attention_outputs[1:-1]
|
||||
present_key_value = self_attention_outputs[-1]
|
||||
else:
|
||||
outputs = self_attention_outputs[
|
||||
1:] # add self attentions if we output attention weights
|
||||
|
||||
cross_attn_present_key_value = None
|
||||
if self.is_decoder and encoder_hidden_states is not None:
|
||||
if not hasattr(self, 'crossattention'):
|
||||
raise ValueError(
|
||||
f'If `encoder_hidden_states` are passed, {self} has to be instantiated '
|
||||
f'with cross-attention layers by setting `config.add_cross_attention=True`'
|
||||
)
|
||||
|
||||
# cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
|
||||
cross_attn_past_key_value = past_key_value[
|
||||
-2:] if past_key_value is not None else None
|
||||
cross_attention_outputs = self.crossattention(
|
||||
attention_output,
|
||||
attention_mask,
|
||||
head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
cross_attn_past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
attention_output = cross_attention_outputs[0]
|
||||
outputs = outputs + cross_attention_outputs[
|
||||
1:-1] # add cross attentions if we output attention weights
|
||||
|
||||
# add cross-attn cache to positions 3,4 of present_key_value tuple
|
||||
cross_attn_present_key_value = cross_attention_outputs[-1]
|
||||
present_key_value = present_key_value + cross_attn_present_key_value
|
||||
|
||||
layer_output = apply_chunking_to_forward(self.feed_forward_chunk,
|
||||
self.chunk_size_feed_forward,
|
||||
self.seq_len_dim,
|
||||
attention_output)
|
||||
outputs = (layer_output, ) + outputs
|
||||
|
||||
# if decoder, return the attn key/values as the last output
|
||||
if self.is_decoder:
|
||||
outputs = outputs + (present_key_value, )
|
||||
|
||||
return outputs
|
||||
|
||||
def feed_forward_chunk(self, attention_output):
|
||||
intermediate_output = self.intermediate(attention_output)
|
||||
layer_output = self.output(intermediate_output, attention_output)
|
||||
return layer_output
|
||||
|
||||
|
||||
class BertEncoder(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.layer = nn.ModuleList(
|
||||
[BertLayer(config) for _ in range(config.num_hidden_layers)])
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
past_key_values=None,
|
||||
use_cache=None,
|
||||
output_attentions=False,
|
||||
output_hidden_states=False,
|
||||
return_dict=True,
|
||||
):
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_self_attentions = () if output_attentions else None
|
||||
all_cross_attentions = (
|
||||
) if output_attentions and self.config.add_cross_attention else None
|
||||
|
||||
next_decoder_cache = () if use_cache else None
|
||||
for i, layer_module in enumerate(self.layer):
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states, )
|
||||
|
||||
layer_head_mask = head_mask[i] if head_mask is not None else None
|
||||
past_key_value = past_key_values[
|
||||
i] if past_key_values is not None else None
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
|
||||
if use_cache:
|
||||
logger.warning(
|
||||
'`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
|
||||
)
|
||||
use_cache = False
|
||||
|
||||
def create_custom_forward(module):
|
||||
|
||||
def custom_forward(*inputs):
|
||||
return module(*inputs, past_key_value,
|
||||
output_attentions)
|
||||
|
||||
return custom_forward
|
||||
|
||||
layer_outputs = torch.utils.checkpoint.checkpoint(
|
||||
create_custom_forward(layer_module),
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
)
|
||||
else:
|
||||
layer_outputs = layer_module(
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
layer_head_mask,
|
||||
encoder_hidden_states,
|
||||
encoder_attention_mask,
|
||||
past_key_value,
|
||||
output_attentions,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
if use_cache:
|
||||
next_decoder_cache += (layer_outputs[-1], )
|
||||
if output_attentions:
|
||||
all_self_attentions = all_self_attentions + (
|
||||
layer_outputs[1], )
|
||||
if self.config.add_cross_attention:
|
||||
all_cross_attentions = all_cross_attentions + (
|
||||
layer_outputs[2], )
|
||||
|
||||
if output_hidden_states:
|
||||
all_hidden_states = all_hidden_states + (hidden_states, )
|
||||
|
||||
if not return_dict:
|
||||
return tuple(v for v in [
|
||||
hidden_states,
|
||||
next_decoder_cache,
|
||||
all_hidden_states,
|
||||
all_self_attentions,
|
||||
all_cross_attentions,
|
||||
] if v is not None)
|
||||
return BaseModelOutputWithPastAndCrossAttentions(
|
||||
last_hidden_state=hidden_states,
|
||||
past_key_values=next_decoder_cache,
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attentions,
|
||||
cross_attentions=all_cross_attentions,
|
||||
)
|
||||
|
||||
|
||||
class BertPooler(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.activation = nn.Tanh()
|
||||
|
||||
def forward(self, hidden_states):
|
||||
# We "pool" the model by simply taking the hidden state corresponding
|
||||
# to the first token.
|
||||
first_token_tensor = hidden_states[:, 0]
|
||||
pooled_output = self.dense(first_token_tensor)
|
||||
pooled_output = self.activation(pooled_output)
|
||||
return pooled_output
|
||||
|
||||
|
||||
class BertPreTrainedModel(TorchModel, PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface
|
||||
for downloading and loading pretrained models.
|
||||
"""
|
||||
|
||||
config_class = BertConfig
|
||||
base_model_prefix = 'bert'
|
||||
supports_gradient_checkpointing = True
|
||||
_keys_to_ignore_on_load_missing = [r'position_ids']
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config.name_or_path, **kwargs)
|
||||
super(Model, self).__init__(config)
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
if isinstance(module, nn.Linear):
|
||||
# Slightly different from the TF version which uses truncated_normal for initialization
|
||||
# cf https://github.com/pytorch/pytorch/pull/5617
|
||||
module.weight.data.normal_(
|
||||
mean=0.0, std=self.config.initializer_range)
|
||||
if module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, nn.Embedding):
|
||||
module.weight.data.normal_(
|
||||
mean=0.0, std=self.config.initializer_range)
|
||||
if module.padding_idx is not None:
|
||||
module.weight.data[module.padding_idx].zero_()
|
||||
elif isinstance(module, nn.LayerNorm):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
def _set_gradient_checkpointing(self, module, value=False):
|
||||
if isinstance(module, BertEncoder):
|
||||
module.gradient_checkpointing = value
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
"""Instantiate the model.
|
||||
|
||||
Args:
|
||||
kwargs: Input args.
|
||||
model_dir: The model dir used to load the checkpoint and the label information.
|
||||
num_labels: An optional arg to tell the model how many classes to initialize.
|
||||
Method will call utils.parse_label_mapping if num_labels not supplied.
|
||||
If num_labels is not found, the model will use the default setting (2 classes).
|
||||
|
||||
Returns:
|
||||
The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
|
||||
"""
|
||||
|
||||
model_dir = kwargs.get('model_dir', None)
|
||||
if model_dir is None:
|
||||
config = BertConfig(**kwargs)
|
||||
model = cls(config)
|
||||
else:
|
||||
model_kwargs = {}
|
||||
label2id = kwargs.get('label2id', parse_label_mapping(model_dir))
|
||||
id2label = kwargs.get(
|
||||
'id2label', None if label2id is None else
|
||||
{id: label
|
||||
for label, id in label2id.items()})
|
||||
if id2label is not None and label2id is None:
|
||||
label2id = {label: id for id, label in id2label.items()}
|
||||
|
||||
num_labels = kwargs.get(
|
||||
'num_labels', None if label2id is None else len(label2id))
|
||||
if num_labels is not None:
|
||||
model_kwargs['num_labels'] = num_labels
|
||||
if label2id is not None:
|
||||
model_kwargs['label2id'] = label2id
|
||||
if id2label is not None:
|
||||
model_kwargs['id2label'] = id2label
|
||||
model = super(Model, cls).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir, **model_kwargs)
|
||||
model.model_dir = model_dir
|
||||
return model
|
||||
|
||||
|
||||
@MODELS.register_module(group_key=Tasks.backbone, module_name=Models.bert)
|
||||
class BertModel(BertPreTrainedModel):
|
||||
"""The Bert Model transformer outputting raw hidden-states without any
|
||||
specific head on top.
|
||||
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass
documentation for the generic methods the library implements for all its
models (such as downloading or saving, resizing the input embeddings, pruning
heads, etc.)
|
||||
|
||||
This model is also a PyTorch
|
||||
[torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
|
||||
subclass. Use it as a regular PyTorch Module and refer to the PyTorch
|
||||
documentation for all matter related to general usage and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`BertConfig`]): Model configuration class with all the
|
||||
parameters of the model.
|
||||
Initializing with a config file does not load the weights associated
|
||||
with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model
|
||||
weights.
|
||||
|
||||
The model can behave as an encoder (with only self-attention) as well as a
|
||||
decoder, in which case a layer of cross-attention is added between the
|
||||
self-attention layers, following the architecture described in [Attention is
|
||||
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam
|
||||
Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
|
||||
Kaiser and Illia Polosukhin.
|
||||
|
||||
To behave as a decoder the model needs to be initialized with the
`is_decoder` argument of the configuration set to `True`. To be used in a
Seq2Seq model, the model needs to be initialized with both the `is_decoder`
argument and `add_cross_attention` set to `True`; an `encoder_hidden_states`
is then expected as an input to the forward pass.
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
super().__init__(config)
|
||||
self.embeddings = BertEmbeddings(config)
|
||||
self.encoder = BertEncoder(config)
|
||||
|
||||
self.pooler = BertPooler(config) if add_pooling_layer else None
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, model_dir=None, add_pooling_layer=True, **config):
|
||||
config = BertConfig(**config)
|
||||
model = cls(config, add_pooling_layer)
|
||||
return model
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.embeddings.word_embeddings
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.embeddings.word_embeddings = value
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
||||
class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
past_key_values=None,
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
**kwargs):
|
||||
r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using [`BertTokenizer`]. See
|
||||
[`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
|
||||
for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask
|
||||
values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Segment token indices to indicate first and second portions of the
|
||||
inputs. Indices are selected in `[0, 1]`:
|
||||
|
||||
- 0 corresponds to a *sentence A* token,
|
||||
- 1 corresponds to a *sentence B* token.
|
||||
|
||||
[What are token type IDs?](../glossary#token-type-ids)
|
||||
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position
|
||||
embeddings. Selected in the range `[0,
|
||||
config.max_position_embeddings - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers,
|
||||
num_heads)`, *optional*):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask
|
||||
values selected in `[0, 1]`:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`,
|
||||
*optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to
|
||||
directly pass an embedded representation. This is useful if you want
|
||||
more control over how to convert `input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention
|
||||
layers. See `attentions` under returned tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See
|
||||
`hidden_states` under returned tensors for more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~file_utils.ModelOutput`] instead of a
|
||||
plain tuple.
|
||||
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size,
|
||||
sequence_length, hidden_size)`, *optional*):
|
||||
Sequence of hidden-states at the output of the last layer of the
|
||||
encoder. Used in the cross-attention if the model is configured as a
|
||||
decoder.
|
||||
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size,
|
||||
sequence_length)`, *optional*):
|
||||
Mask to avoid performing attention on the padding token indices of
|
||||
the encoder input. This mask is used in the cross-attention if the
|
||||
model is configured as a decoder. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))` of length
|
||||
`config.n_layers` with each tuple having 4 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
||||
Contains precomputed key and value hidden states of the attention
|
||||
blocks. Can be used to speed up decoding.
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only
|
||||
the last `decoder_input_ids` (those that don't have their past key
|
||||
value states given to this model) of shape `(batch_size, 1)` instead
|
||||
of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned
|
||||
and can be used to speed up decoding (see `past_key_values`).
|
||||
Others (**kwargs):
Some additional parameters might be passed in from the upstream pipeline;
they do not influence the results.
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else
|
||||
self.config.output_hidden_states)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
if self.config.is_decoder:
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
else:
|
||||
use_cache = False
|
||||
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError(
|
||||
'You cannot specify both input_ids and inputs_embeds at the same time'
|
||||
)
|
||||
elif input_ids is not None:
|
||||
input_shape = input_ids.size()
|
||||
elif inputs_embeds is not None:
|
||||
input_shape = inputs_embeds.size()[:-1]
|
||||
else:
|
||||
raise ValueError(
|
||||
'You have to specify either input_ids or inputs_embeds')
|
||||
|
||||
batch_size, seq_length = input_shape
|
||||
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||
|
||||
# past_key_values_length
|
||||
past_key_values_length = past_key_values[0][0].shape[
|
||||
2] if past_key_values is not None else 0
|
||||
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones(
|
||||
((batch_size, seq_length + past_key_values_length)),
|
||||
device=device)
|
||||
|
||||
if token_type_ids is None:
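# Reuse the zero-filled token_type_ids buffer registered by the embeddings
# (expanded to the current batch), which helps when tracing the model without
# passing token_type_ids explicitly.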
|
||||
if hasattr(self.embeddings, 'token_type_ids'):
|
||||
buffered_token_type_ids = self.embeddings.token_type_ids[:, :
|
||||
seq_length]
|
||||
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
|
||||
batch_size, seq_length)
|
||||
token_type_ids = buffered_token_type_ids_expanded
|
||||
else:
|
||||
token_type_ids = torch.zeros(
|
||||
input_shape, dtype=torch.long, device=device)
|
||||
|
||||
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
||||
# ourselves in which case we just need to make it broadcastable to all heads.
|
||||
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
|
||||
attention_mask, input_shape, device)
|
||||
|
||||
# If a 2D or 3D attention mask is provided for the cross-attention
|
||||
# we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size(
|
||||
)
|
||||
encoder_hidden_shape = (encoder_batch_size,
|
||||
encoder_sequence_length)
|
||||
if encoder_attention_mask is None:
|
||||
encoder_attention_mask = torch.ones(
|
||||
encoder_hidden_shape, device=device)
|
||||
encoder_extended_attention_mask = self.invert_attention_mask(
|
||||
encoder_attention_mask)
|
||||
else:
|
||||
encoder_extended_attention_mask = None
|
||||
|
||||
# Prepare head mask if needed
|
||||
# 1.0 in head_mask indicate we keep the head
|
||||
# attention_probs has shape bsz x n_heads x N x N
|
||||
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
||||
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
||||
head_mask = self.get_head_mask(head_mask,
|
||||
self.config.num_hidden_layers)
|
||||
|
||||
embedding_output = self.embeddings(
|
||||
input_ids=input_ids,
|
||||
position_ids=position_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
past_key_values_length=past_key_values_length,
|
||||
)
|
||||
encoder_outputs = self.encoder(
|
||||
embedding_output,
|
||||
attention_mask=extended_attention_mask,
|
||||
head_mask=head_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_extended_attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
sequence_output = encoder_outputs[0]
|
||||
pooled_output = self.pooler(
|
||||
sequence_output) if self.pooler is not None else None
|
||||
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPoolingAndCrossAttentions(
|
||||
last_hidden_state=sequence_output,
|
||||
pooler_output=pooled_output,
|
||||
past_key_values=encoder_outputs.past_key_values,
|
||||
hidden_states=encoder_outputs.hidden_states,
|
||||
attentions=encoder_outputs.attentions,
|
||||
cross_attentions=encoder_outputs.cross_attentions,
|
||||
)
|
||||
|
||||
def extract_sequence_outputs(self, outputs):
|
||||
return outputs['last_hidden_state']
|
||||
|
||||
def extract_pooled_outputs(self, outputs):
|
||||
return outputs['pooler_output']
|
||||
@@ -1,3 +1,4 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers.modeling_outputs import TokenClassifierOutput
|
||||
modelscope/models/nlp/bert/fill_mask.py (new file, 299 lines)
@@ -0,0 +1,299 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.checkpoint
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers.activations import ACT2FN
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import AttentionFillMaskModelOutput
|
||||
from modelscope.utils import logger as logging
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .backbone import BertModel, BertPreTrainedModel
|
||||
from .configuration import BertConfig
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class BertPredictionHeadTransform(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.transform_act_fn(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BertLMPredictionHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.transform = BertPredictionHeadTransform(config)
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.decoder = nn.Linear(
|
||||
config.hidden_size, config.vocab_size, bias=False)
|
||||
|
||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||
|
||||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BertOnlyMLMHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.predictions = BertLMPredictionHead(config)
|
||||
|
||||
def forward(self, sequence_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
return prediction_scores
|
||||
|
||||
|
||||
class BertOnlyNSPHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
||||
|
||||
def forward(self, pooled_output):
|
||||
seq_relationship_score = self.seq_relationship(pooled_output)
|
||||
return seq_relationship_score
|
||||
|
||||
|
||||
class BertPreTrainingHeads(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.predictions = BertLMPredictionHead(config)
|
||||
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
||||
|
||||
def forward(self, sequence_output, pooled_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
seq_relationship_score = self.seq_relationship(pooled_output)
|
||||
return prediction_scores, seq_relationship_score
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert)
|
||||
class BertForMaskedLM(BertPreTrainedModel):
|
||||
r"""Bert Model with a `language modeling` head on top.
|
||||
|
||||
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
|
||||
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
|
||||
pruning heads etc.)
|
||||
|
||||
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
|
||||
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
|
||||
general usage and behavior.
|
||||
|
||||
Preprocessor:
This is the fill_mask model of Bert, and the preprocessor of this model
is `modelscope.preprocessors.NLPPreprocessor`.
|
||||
|
||||
Parameters:
|
||||
config (:class:`BertConfig`): Model configuration class with
|
||||
all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
|
||||
weights.
|
||||
"""
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'position_ids', r'predictions.decoder.bias'
|
||||
]
|
||||
|
||||
def __init__(self, config: BertConfig, **kwargs):
|
||||
super().__init__(config)
|
||||
|
||||
if config.is_decoder:
|
||||
logger.warning(
|
||||
'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
|
||||
'bi-directional self-attention.')
|
||||
|
||||
self.bert = BertModel(config, add_pooling_layer=False)
|
||||
self.cls = BertOnlyMLMHead(config)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.cls.predictions.decoder
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||
1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
`What are token type IDs? <../glossary.html#token-type-ids>`_
|
||||
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
|
||||
`What are position IDs? <../glossary.html#position-ids>`_
|
||||
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`,
|
||||
*optional*):
|
||||
Labels for computing the masked language modeling loss. Indices
|
||||
should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids`
|
||||
docstring) Tokens with indices set to `-100` are ignored (masked),
|
||||
the loss is only computed for the tokens with labels in `[0, ...,
|
||||
config.vocab_size]`
|
||||
|
||||
Returns:
|
||||
Returns `modelscope.outputs.AttentionFillMaskModelOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_bert_backbone_base_std')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_backbone_base_std')
|
||||
>>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
|
||||
"""
|
||||
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.cls(sequence_output)
|
||||
|
||||
masked_lm_loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(
|
||||
prediction_scores.view(-1, self.config.vocab_size),
|
||||
labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores, ) + outputs[2:]
|
||||
return ((masked_lm_loss, )
|
||||
+ output) if masked_lm_loss is not None else output
|
||||
|
||||
return AttentionFillMaskModelOutput(
|
||||
loss=masked_lm_loss,
|
||||
logits=prediction_scores,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
input_ids=input_ids,
|
||||
)
|
||||
|
||||
def prepare_inputs_for_generation(self,
|
||||
input_ids,
|
||||
attention_mask=None,
|
||||
**model_kwargs):
|
||||
input_shape = input_ids.shape
|
||||
effective_batch_size = input_shape[0]
|
||||
|
||||
# add a dummy token
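# (masked LM is not autoregressive, so a PAD placeholder is appended and its
# prediction is read out at each generation step)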
|
||||
if self.config.pad_token_id is None:
|
||||
raise ValueError('The PAD token should be defined for generation')
|
||||
|
||||
padding_mask = attention_mask.new_zeros((attention_mask.shape[0], 1))
|
||||
attention_mask = torch.cat([attention_mask, padding_mask], dim=-1)
|
||||
dummy_token = torch.full((effective_batch_size, 1),
|
||||
self.config.pad_token_id,
|
||||
dtype=torch.long,
|
||||
device=input_ids.device)
|
||||
input_ids = torch.cat([input_ids, dummy_token], dim=1)
|
||||
|
||||
return {'input_ids': input_ids, 'attention_mask': attention_mask}
|
||||
File diff suppressed because it is too large

modelscope/models/nlp/bert/sentence_embedding.py (new file, 113 lines)
@@ -0,0 +1,113 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import Model
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import BackboneModelOutput
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .backbone import BertModel, BertPreTrainedModel
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert)
|
||||
class BertForSentenceEmbedding(BertPreTrainedModel):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.config = config
|
||||
setattr(self, self.base_model_prefix,
|
||||
BertModel(config, add_pooling_layer=False))
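# The backbone is built without the pooling layer; sentence-level pooling is
# assumed to be handled downstream (e.g. by the embedding pipeline).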
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
) -> BackboneModelOutput:
|
||||
r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||
1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
|
||||
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
|
||||
Returns:
|
||||
Returns `modelscope.outputs.BackboneModelOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_corom_sentence-embedding_chinese-base')
|
||||
>>> print(model(**preprocessor('This is a test')))
|
||||
"""
|
||||
return self.base_model.forward(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict)
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
"""Instantiate the model.
|
||||
|
||||
Args:
|
||||
kwargs: Input args.
|
||||
model_dir: The model dir used to load the checkpoint and the label information.
|
||||
|
||||
Returns:
|
||||
The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
|
||||
"""
|
||||
model_dir = kwargs.get('model_dir')
|
||||
model = super(
|
||||
Model,
|
||||
cls).from_pretrained(pretrained_model_name_or_path=model_dir)
|
||||
model.model_dir = model_dir
|
||||
return model
|
||||
modelscope/models/nlp/bert/text_classification.py (new file, 208 lines)
@@ -0,0 +1,208 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.checkpoint
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import AttentionTextClassificationModelOutput
|
||||
from modelscope.utils import logger as logging
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .backbone import BertModel, BertPreTrainedModel
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
|
||||
@MODELS.register_module(Tasks.nli, module_name=Models.bert)
|
||||
@MODELS.register_module(
|
||||
Tasks.sentiment_classification, module_name=Models.bert)
|
||||
@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
|
||||
@MODELS.register_module(
|
||||
Tasks.zero_shot_classification, module_name=Models.bert)
|
||||
class BertForSequenceClassification(BertPreTrainedModel):
|
||||
r"""Bert Model transformer with a sequence classification/regression head on top
|
||||
(a linear layer on top of the pooled output) e.g. for GLUE tasks.
|
||||
|
||||
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
|
||||
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
|
||||
pruning heads etc.)
|
||||
|
||||
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
|
||||
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
|
||||
general usage and behavior.
|
||||
|
||||
Preprocessor:
This is the text classification model of Bert, and the preprocessor of this model
is `modelscope.preprocessors.SequenceClassificationPreprocessor`.
|
||||
|
||||
Trainer:
|
||||
This model is a normal PyTorch model, and can be trained by various trainers, such as EpochBasedTrainer,
|
||||
NlpEpochBasedTrainer, or trainers from other frameworks.
|
||||
The preferred trainer in ModelScope is NlpEpochBasedTrainer.
|
||||
|
||||
Parameters:
|
||||
config (:class:`BertConfig`): Model configuration class with
|
||||
all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
|
||||
weights.
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
self.config = config
|
||||
|
||||
setattr(self, self.base_model_prefix, BertModel(config))
|
||||
classifier_dropout = (
|
||||
config.classifier_dropout if config.classifier_dropout is not None
|
||||
else config.hidden_dropout_prob)
|
||||
self.dropout = nn.Dropout(classifier_dropout)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~transformers.BertTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||
1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
|
||||
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.ModelOutput` instead of a plain tuple.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
|
||||
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
|
||||
Returns:
|
||||
Returns `modelscope.outputs.AttentionTextClassificationModelOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_structbert_sentence-similarity_chinese-base')
|
||||
>>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.base_model.forward(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
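# Infer the problem type once from the label setup: a single output is treated
# as regression, integer labels as single-label classification, and anything
# else as multi-label classification.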
|
||||
if self.config.problem_type is None:
|
||||
if self.num_labels == 1:
|
||||
self.config.problem_type = 'regression'
|
||||
elif self.num_labels > 1 and (labels.dtype == torch.long
|
||||
or labels.dtype == torch.int):
|
||||
self.config.problem_type = 'single_label_classification'
|
||||
else:
|
||||
self.config.problem_type = 'multi_label_classification'
|
||||
|
||||
if self.config.problem_type == 'regression':
|
||||
loss_fct = MSELoss()
|
||||
if self.num_labels == 1:
|
||||
loss = loss_fct(logits.squeeze(), labels.squeeze())
|
||||
else:
|
||||
loss = loss_fct(logits, labels)
|
||||
elif self.config.problem_type == 'single_label_classification':
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(
|
||||
logits.view(-1, self.num_labels), labels.view(-1))
|
||||
elif self.config.problem_type == 'multi_label_classification':
|
||||
loss_fct = BCEWithLogitsLoss()
|
||||
loss = loss_fct(logits, labels)
|
||||
if not return_dict:
|
||||
output = (logits, ) + outputs[2:]
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return AttentionTextClassificationModelOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
modelscope/models/nlp/bert/text_ranking.py (new file, 89 lines)
@@ -0,0 +1,89 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import Model
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import AttentionTextClassificationModelOutput
|
||||
from modelscope.utils import logger as logging
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .backbone import BertModel
|
||||
from .text_classification import BertForSequenceClassification
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.text_ranking, module_name=Models.bert)
|
||||
class BertForTextRanking(BertForSequenceClassification):
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config)
|
||||
self.train_batch_size = kwargs.get('train_batch_size', 4)
|
||||
setattr(self, self.base_model_prefix,
|
||||
BertModel(self.config, add_pooling_layer=True))
|
||||
self.register_buffer(
|
||||
'target_label',
|
||||
torch.zeros(self.train_batch_size, dtype=torch.long))
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
**kwargs) -> AttentionTextClassificationModelOutput:
|
||||
outputs = self.base_model.forward(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict)
|
||||
|
||||
# backbone model should return pooled_output as its second output
|
||||
pooled_output = outputs[1]
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
if self.base_model.training:
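# During training, logits are reshaped to (train_batch_size, num_candidates)
# and a listwise cross-entropy is applied; the zero-valued target_label assumes
# the first candidate in each group is the positive passage.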
|
||||
scores = logits.view(self.train_batch_size, -1)
|
||||
loss_fct = torch.nn.CrossEntropyLoss()
|
||||
loss = loss_fct(scores, self.target_label)
|
||||
return AttentionTextClassificationModelOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
)
|
||||
return AttentionTextClassificationModelOutput(logits=logits, )
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
"""Instantiate the model.
|
||||
|
||||
Args:
|
||||
kwargs: Input args.
|
||||
model_dir: The model dir used to load the checkpoint and the label information.
|
||||
num_labels: An optional arg to tell the model how many classes to initialize.
The method will call utils.parse_label_mapping if num_labels is not supplied.
If num_labels is not found, the model will use the default setting (1 class).
|
||||
|
||||
Returns:
|
||||
The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
|
||||
"""
|
||||
|
||||
num_labels = kwargs.get('num_labels', 1)
|
||||
model_args = {} if num_labels is None else {'num_labels': num_labels}
|
||||
|
||||
model_dir = kwargs.get('model_dir')
|
||||
model = super(Model, cls).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir, **model_args)
|
||||
model.model_dir = model_dir
|
||||
return model
|
||||
modelscope/models/nlp/bert/token_classification.py (new file, 225 lines)
@@ -0,0 +1,225 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.utils.checkpoint
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import TokenClassifierOutput
|
||||
from modelscope.utils import logger as logging
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .backbone import BertModel, BertPreTrainedModel
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.token_classification, module_name=Models.bert)
|
||||
@MODELS.register_module(Tasks.part_of_speech, module_name=Models.bert)
|
||||
@MODELS.register_module(Tasks.word_segmentation, module_name=Models.bert)
|
||||
class BertForTokenClassification(BertPreTrainedModel):
|
||||
r"""Bert Model with a token classification head on top (a linear layer on top of
|
||||
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks, word-segmentation.
|
||||
|
||||
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
|
||||
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
|
||||
pruning heads etc.)
|
||||
|
||||
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
|
||||
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
|
||||
general usage and behavior.
|
||||
|
||||
Preprocessor:
This is the token classification model of Bert, and the preprocessor of this model
is `modelscope.preprocessors.SequenceClassificationPreprocessor`.
|
||||
|
||||
Trainer:
|
||||
This model is a normal PyTorch model, and can be trained by various trainers, such as EpochBasedTrainer,
|
||||
NlpEpochBasedTrainer, or trainers from other frameworks.
|
||||
The preferred trainer in ModelScope is NlpEpochBasedTrainer.
|
||||
|
||||
Parameters:
|
||||
config (:class:`BertConfig`): Model configuration class with
|
||||
all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
|
||||
weights.
|
||||
"""
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
setattr(self, self.base_model_prefix,
|
||||
BertModel(config, add_pooling_layer=False))
|
||||
classifier_dropout = (
|
||||
config.classifier_dropout if config.classifier_dropout is not None
|
||||
else config.hidden_dropout_prob)
|
||||
self.dropout = nn.Dropout(classifier_dropout)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
offset_mapping=None,
|
||||
label_mask=None,
|
||||
):
|
||||
r"""
|
||||
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using
|
||||
:class:`~transformers.BertTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and
|
||||
:meth:`transformers.PreTrainedTokenizer.__call__` for details.
|
||||
|
||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
|
||||
sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask
|
||||
values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size,
|
||||
sequence_length)`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the
|
||||
inputs. Indices are selected in ``[0, 1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size,
|
||||
sequence_length)`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position
|
||||
embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
|
||||
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or
|
||||
:obj:`(num_layers, num_heads)`, `optional`):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask
|
||||
values selected in ``[0, 1]``:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
|
||||
sequence_length, hidden_size)`, `optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to
|
||||
directly pass an embedded representation. This is useful if you want
|
||||
more control over how to convert :obj:`input_ids` indices into
|
||||
associated vectors than the model's internal embedding lookup
|
||||
matrix.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention
|
||||
layers. See ``attentions`` under returned tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to return the hidden states of all layers. See
|
||||
``hidden_states`` under returned tensors for more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.ModelOutput`
|
||||
instead of a plain tuple.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`,
|
||||
`optional`):
|
||||
Labels for computing the sequence classification/regression loss.
|
||||
Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If
|
||||
:obj:`config.num_labels == 1` a regression loss is computed
|
||||
(Mean-Square loss), If :obj:`config.num_labels > 1` a classification
|
||||
loss is computed (Cross-Entropy).
|
||||
offset_mapping (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
|
||||
sequence_length)`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the sentence.
|
||||
Selected in the range ``[0, sequence_length - 1]``.
|
||||
label_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size,
|
||||
sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask
|
||||
values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
Returns:
|
||||
Returns `modelscope.outputs.TokenClassifierOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_bert_word-segmentation_chinese-base')
|
||||
>>> print(model(**preprocessor(('This is a test', 'This is also a test'))))
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
sequence_output = self.dropout(sequence_output)
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
# Only keep active parts of the loss
|
||||
if attention_mask is not None:
|
||||
active_loss = attention_mask.view(-1) == 1
|
||||
active_logits = logits.view(-1, self.num_labels)
|
||||
active_labels = torch.where(
|
||||
active_loss, labels.view(-1),
|
||||
torch.tensor(loss_fct.ignore_index).type_as(labels))
|
||||
loss = loss_fct(active_logits, active_labels)
|
||||
else:
|
||||
loss = loss_fct(
|
||||
logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (logits, ) + outputs[2:]
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
offset_mapping=offset_mapping,
|
||||
)
|
||||
modelscope/models/nlp/bloom/backbone.py (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from transformers import BloomConfig
|
||||
from transformers import BloomModel as BloomModelTransform
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import BACKBONES
|
||||
from modelscope.utils.constant import Fields
|
||||
|
||||
|
||||
@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.bloom)
|
||||
class BloomModel(BloomModelTransform):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
config = BloomConfig(**kwargs)
|
||||
super().__init__(config)
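# Usage sketch with illustrative hyper-parameters (any valid BloomConfig field
# can be passed through the kwargs):
#   model = BloomModel(hidden_size=64, n_layer=2, n_head=8)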
|
||||
modelscope/models/nlp/csanmt/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from .translation import CsanmtForTranslation
|
||||
@@ -22,38 +22,28 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .configuration_deberta_v2 import DebertaV2Config
    from .tokenization_deberta_v2 import DebertaV2Tokenizer
    from .tokenization_deberta_v2_fast import DebertaV2TokenizerFast

    from .modeling_deberta_v2 import (
        DebertaV2ForMaskedLM,
        DebertaV2ForMultipleChoice,
        DebertaV2ForQuestionAnswering,
        DebertaV2ForSequenceClassification,
        DebertaV2ForTokenClassification,
    from .configuration import DebertaV2Config
    from .tokenization import DebertaV2Tokenizer
    from .tokenization_fast import DebertaV2TokenizerFast
    from .backbone import (
        DebertaV2Model,
        DebertaV2PreTrainedModel,
    )
    from .fill_mask import DebertaV2ForMaskedLM

else:
    _import_structure = {
        'configuration_deberta_v2':
        ['DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP', 'DebertaV2Config'],
        'tokenization_deberta_v2': ['DebertaV2Tokenizer']
        'configuration': ['DebertaV2Config'],
        'tokenization': ['DebertaV2Tokenizer'],
        'tokenization_fast': ['DebertaV2TokenizerFast'],
        'backbone': [
            'DebertaV2Model',
            'DebertaV2PreTrainedModel',
        ],
        'fill_mask': [
            'DebertaV2ForMaskedLM',
        ]
    }
    _import_structure['tokenization_deberta_v2_fast'] = [
        'DebertaV2TokenizerFast'
    ]
    _import_structure['modeling_deberta_v2'] = [
        'DebertaV2ForMaskedLM',
        'DebertaV2ForMultipleChoice',
        'DebertaV2ForQuestionAnswering',
        'DebertaV2ForSequenceClassification',
        'DebertaV2ForTokenClassification',
    ]
    import sys

    sys.modules[__name__] = LazyImportModule(

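This `__init__.py` hunk swaps the module keys in `_import_structure` so that `LazyImportModule` resolves the renamed files (`backbone`, `fill_mask`, `configuration`, `tokenization`); the same rewrite repeats for the other NLP models further down. A minimal sketch of the general lazy-import pattern these tables feed, as a simplified illustration rather than ModelScope's actual `LazyImportModule` implementation:

# Sketch: attributes are resolved to submodule members only on first access.
import importlib
import types


class ToyLazyModule(types.ModuleType):

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._import_structure = import_structure
        # Reverse map: exported name -> submodule that defines it.
        self._name_to_module = {
            attr: module
            for module, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):
        if attr not in self._name_to_module:
            raise AttributeError(f'module {self.__name__} has no attribute {attr}')
        submodule = importlib.import_module(
            f'.{self._name_to_module[attr]}', self.__name__)
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so later lookups skip __getattr__
        return value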
@@ -20,28 +20,22 @@ from typing import Optional, Tuple, Union
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
|
||||
from torch.nn import LayerNorm
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.file_utils import (add_code_sample_docstrings,
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward)
|
||||
from transformers.modeling_outputs import (BaseModelOutput, MaskedLMOutput,
|
||||
MultipleChoiceModelOutput,
|
||||
QuestionAnsweringModelOutput,
|
||||
SequenceClassifierOutput,
|
||||
TokenClassifierOutput)
|
||||
from transformers.modeling_outputs import BaseModelOutput
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
from transformers.pytorch_utils import softmax_backward_data
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import Model, TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import AttentionBackboneModelOutput
|
||||
from modelscope.utils import logger as logging
|
||||
from .configuration_deberta_v2 import DebertaV2Config
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .configuration import DebertaV2Config
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
_CONFIG_FOR_DOC = 'DebertaV2Config'
|
||||
_TOKENIZER_FOR_DOC = 'DebertaV2Tokenizer'
|
||||
_CHECKPOINT_FOR_DOC = 'nlp_debertav2_fill-mask_chinese-lite'
|
||||
|
||||
|
||||
# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
|
||||
class ContextPooler(nn.Module):
|
||||
@@ -1006,7 +1000,7 @@ class DebertaV2Embeddings(nn.Module):
|
||||
|
||||
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2
|
||||
class DebertaV2PreTrainedModel(PreTrainedModel):
|
||||
class DebertaV2PreTrainedModel(TorchModel, PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
@@ -1018,6 +1012,10 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = ['position_embeddings']
|
||||
supports_gradient_checkpointing = True
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config.name_or_path, **kwargs)
|
||||
super(Model, self).__init__(config)
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights."""
|
||||
if isinstance(module, nn.Linear):
|
||||
@@ -1037,8 +1035,24 @@ class DebertaV2PreTrainedModel(PreTrainedModel):
|
||||
if isinstance(module, DebertaV2Encoder):
|
||||
module.gradient_checkpointing = value
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
model_dir = kwargs.pop('model_dir', None)
|
||||
if model_dir is None:
|
||||
ponet_config = DebertaV2Config(**kwargs)
|
||||
model = cls(ponet_config)
|
||||
else:
|
||||
model = super(
|
||||
Model,
|
||||
cls).from_pretrained(pretrained_model_name_or_path=model_dir)
|
||||
return model
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.backbone, module_name=Models.deberta_v2)
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2
|
||||
class DebertaV2Model(DebertaV2PreTrainedModel):
|
||||
"""The bare DeBERTa_v2 Model transformer outputting raw hidden-states without any specific head on top.
|
||||
|
||||
DEBERTA_START_DOCSTRING = r"""
|
||||
The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and an enhanced mask decoder. With those two
@@ -1048,65 +1062,13 @@ DEBERTA_START_DOCSTRING = r"""
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
|
||||
Parameters:
|
||||
config ([`DebertaV2Config`]): Model configuration class with all the parameters of the model.
|
||||
config (`DebertaV2Config`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
configuration.
|
||||
"""
|
||||
|
||||
DEBERTA_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `({0})`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using [`DebertaV2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
|
||||
1]`:
|
||||
|
||||
- 0 corresponds to a *sentence A* token,
|
||||
- 1 corresponds to a *sentence B* token.
|
||||
|
||||
[What are token type IDs?](../glossary#token-type-ids)
|
||||
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.max_position_embeddings - 1]`.
|
||||
|
||||
[What are position IDs?](../glossary#position-ids)
|
||||
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
'The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.',
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaModel with Deberta->DebertaV2
|
||||
class DebertaV2Model(DebertaV2PreTrainedModel):
|
||||
|
||||
def __init__(self, config):
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config)
|
||||
|
||||
self.embeddings = DebertaV2Embeddings(config)
|
||||
@@ -1130,14 +1092,6 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
|
||||
raise NotImplementedError(
|
||||
'The prune function is not implemented in DeBERTa model.')
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=BaseModelOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
@@ -1148,7 +1102,53 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple, BaseModelOutput]:
|
||||
) -> Union[Tuple, AttentionBackboneModelOutput]:
|
||||
r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
|
||||
1]`:
|
||||
|
||||
- 0 corresponds to a *sentence A* token,
|
||||
- 1 corresponds to a *sentence B* token.
|
||||
|
||||
position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.max_position_embeddings - 1]`.
|
||||
|
||||
inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a dataclass instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
Returns `modelscope.outputs.AttentionBackboneModelOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite', task='backbone')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
|
||||
>>> print(model(**preprocessor('这是个测试')))
|
||||
"""
|
||||
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else
|
||||
@@ -1216,574 +1216,9 @@ class DebertaV2Model(DebertaV2PreTrainedModel):
|
||||
return (sequence_output, ) + encoder_outputs[
|
||||
(1 if output_hidden_states else 2):]
|
||||
|
||||
return BaseModelOutput(
|
||||
return AttentionBackboneModelOutput(
|
||||
last_hidden_state=sequence_output,
|
||||
hidden_states=encoder_outputs.hidden_states
|
||||
if output_hidden_states else None,
|
||||
attentions=encoder_outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""DeBERTa Model with a `language modeling` head on top.""",
|
||||
DEBERTA_START_DOCSTRING)
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2
|
||||
class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'position_ids', r'predictions.decoder.bias'
|
||||
]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
self.deberta = DebertaV2Model(config)
|
||||
self.cls = DebertaV2OnlyMLMHead(config)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.cls.predictions.decoder
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=MaskedLMOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
token_type_ids: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
labels: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple, MaskedLMOutput]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
|
||||
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
|
||||
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
|
||||
"""
|
||||
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.deberta(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.cls(sequence_output)
|
||||
|
||||
masked_lm_loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(
|
||||
prediction_scores.view(-1, self.config.vocab_size),
|
||||
labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores, ) + outputs[1:]
|
||||
return ((masked_lm_loss, )
|
||||
+ output) if masked_lm_loss is not None else output
|
||||
|
||||
return MaskedLMOutput(
|
||||
loss=masked_lm_loss,
|
||||
logits=prediction_scores,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta
|
||||
class DebertaV2PredictionHeadTransform(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.transform_act_fn(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta
|
||||
class DebertaV2LMPredictionHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.transform = DebertaV2PredictionHeadTransform(config)
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.decoder = nn.Linear(
|
||||
config.hidden_size, config.vocab_size, bias=False)
|
||||
|
||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||
|
||||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta
|
||||
class DebertaV2OnlyMLMHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.predictions = DebertaV2LMPredictionHead(config)
|
||||
|
||||
def forward(self, sequence_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
return prediction_scores
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
|
||||
pooled output) e.g. for GLUE tasks.
|
||||
""",
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaForSequenceClassification with Deberta->DebertaV2
|
||||
class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
num_labels = getattr(config, 'num_labels', 2)
|
||||
self.num_labels = num_labels
|
||||
|
||||
self.deberta = DebertaV2Model(config)
|
||||
self.pooler = ContextPooler(config)
|
||||
output_dim = self.pooler.output_dim
|
||||
|
||||
self.classifier = nn.Linear(output_dim, num_labels)
|
||||
drop_out = getattr(config, 'cls_dropout', None)
|
||||
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
|
||||
self.dropout = StableDropout(drop_out)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.deberta.get_input_embeddings()
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.deberta.set_input_embeddings(new_embeddings)
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=SequenceClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
token_type_ids: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
labels: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple, SequenceClassifierOutput]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
|
||||
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
|
||||
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.deberta(
|
||||
input_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
encoder_layer = outputs[0]
|
||||
pooled_output = self.pooler(encoder_layer)
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
if self.config.problem_type is None:
|
||||
if self.num_labels == 1:
|
||||
# regression task
|
||||
loss_fn = nn.MSELoss()
|
||||
logits = logits.view(-1).to(labels.dtype)
|
||||
loss = loss_fn(logits, labels.view(-1))
|
||||
elif labels.dim() == 1 or labels.size(-1) == 1:
|
||||
label_index = (labels >= 0).nonzero()
|
||||
labels = labels.long()
|
||||
if label_index.size(0) > 0:
|
||||
labeled_logits = torch.gather(
|
||||
logits, 0,
|
||||
label_index.expand(
|
||||
label_index.size(0), logits.size(1)))
|
||||
labels = torch.gather(labels, 0, label_index.view(-1))
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(
|
||||
labeled_logits.view(-1, self.num_labels).float(),
|
||||
labels.view(-1))
|
||||
else:
|
||||
loss = torch.tensor(0).to(logits)
|
||||
else:
|
||||
log_softmax = nn.LogSoftmax(-1)
|
||||
loss = -((log_softmax(logits) * labels).sum(-1)).mean()
|
||||
elif self.config.problem_type == 'regression':
|
||||
loss_fct = MSELoss()
|
||||
if self.num_labels == 1:
|
||||
loss = loss_fct(logits.squeeze(), labels.squeeze())
|
||||
else:
|
||||
loss = loss_fct(logits, labels)
|
||||
elif self.config.problem_type == 'single_label_classification':
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(
|
||||
logits.view(-1, self.num_labels), labels.view(-1))
|
||||
elif self.config.problem_type == 'multi_label_classification':
|
||||
loss_fct = BCEWithLogitsLoss()
|
||||
loss = loss_fct(logits, labels)
|
||||
if not return_dict:
|
||||
output = (logits, ) + outputs[1:]
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
|
||||
Named-Entity-Recognition (NER) tasks.
|
||||
""",
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
|
||||
class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.deberta = DebertaV2Model(config)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=TokenClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
token_type_ids: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
labels: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple, TokenClassifierOutput]:
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.deberta(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
sequence_output = self.dropout(sequence_output)
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (logits, ) + outputs[1:]
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
|
||||
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
|
||||
""",
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaForQuestionAnswering with Deberta->DebertaV2
|
||||
class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.deberta = DebertaV2Model(config)
|
||||
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=QuestionAnsweringModelOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
token_type_ids: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
start_positions: Optional[torch.Tensor] = None,
|
||||
end_positions: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple, QuestionAnsweringModelOutput]:
|
||||
r"""
|
||||
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for position (index) of the start of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
|
||||
are not taken into account for computing the loss.
|
||||
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for position (index) of the end of the labelled span for computing the token classification loss.
|
||||
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
|
||||
are not taken into account for computing the loss.
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.deberta(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
logits = self.qa_outputs(sequence_output)
|
||||
start_logits, end_logits = logits.split(1, dim=-1)
|
||||
start_logits = start_logits.squeeze(-1).contiguous()
|
||||
end_logits = end_logits.squeeze(-1).contiguous()
|
||||
|
||||
total_loss = None
|
||||
if start_positions is not None and end_positions is not None:
|
||||
# If we are on multi-GPU, split add a dimension
|
||||
if len(start_positions.size()) > 1:
|
||||
start_positions = start_positions.squeeze(-1)
|
||||
if len(end_positions.size()) > 1:
|
||||
end_positions = end_positions.squeeze(-1)
|
||||
# sometimes the start/end positions are outside our model inputs, we ignore these terms
|
||||
ignored_index = start_logits.size(1)
|
||||
start_positions = start_positions.clamp(0, ignored_index)
|
||||
end_positions = end_positions.clamp(0, ignored_index)
|
||||
|
||||
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
|
||||
start_loss = loss_fct(start_logits, start_positions)
|
||||
end_loss = loss_fct(end_logits, end_positions)
|
||||
total_loss = (start_loss + end_loss) / 2
|
||||
|
||||
if not return_dict:
|
||||
output = (start_logits, end_logits) + outputs[1:]
|
||||
return ((total_loss, )
|
||||
+ output) if total_loss is not None else output
|
||||
|
||||
return QuestionAnsweringModelOutput(
|
||||
loss=total_loss,
|
||||
start_logits=start_logits,
|
||||
end_logits=end_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
|
||||
softmax) e.g. for RocStories/SWAG tasks.
|
||||
""",
|
||||
DEBERTA_START_DOCSTRING,
|
||||
)
|
||||
class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
num_labels = getattr(config, 'num_labels', 2)
|
||||
self.num_labels = num_labels
|
||||
|
||||
self.deberta = DebertaV2Model(config)
|
||||
self.pooler = ContextPooler(config)
|
||||
output_dim = self.pooler.output_dim
|
||||
|
||||
self.classifier = nn.Linear(output_dim, 1)
|
||||
drop_out = getattr(config, 'cls_dropout', None)
|
||||
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
|
||||
self.dropout = StableDropout(drop_out)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.deberta.get_input_embeddings()
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.deberta.set_input_embeddings(new_embeddings)
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
DEBERTA_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=MultipleChoiceModelOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
||||
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
|
||||
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
|
||||
`input_ids` above)
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
num_choices = input_ids.shape[
|
||||
1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
|
||||
flat_input_ids = input_ids.view(
|
||||
-1, input_ids.size(-1)) if input_ids is not None else None
|
||||
flat_position_ids = position_ids.view(
|
||||
-1, position_ids.size(-1)) if position_ids is not None else None
|
||||
flat_token_type_ids = token_type_ids.view(
|
||||
-1,
|
||||
token_type_ids.size(-1)) if token_type_ids is not None else None
|
||||
flat_attention_mask = attention_mask.view(
|
||||
-1,
|
||||
attention_mask.size(-1)) if attention_mask is not None else None
|
||||
flat_inputs_embeds = (
|
||||
inputs_embeds.view(-1, inputs_embeds.size(-2),
|
||||
inputs_embeds.size(-1))
|
||||
if inputs_embeds is not None else None)
|
||||
|
||||
outputs = self.deberta(
|
||||
flat_input_ids,
|
||||
position_ids=flat_position_ids,
|
||||
token_type_ids=flat_token_type_ids,
|
||||
attention_mask=flat_attention_mask,
|
||||
inputs_embeds=flat_inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
encoder_layer = outputs[0]
|
||||
pooled_output = self.pooler(encoder_layer)
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
reshaped_logits = logits.view(-1, num_choices)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(reshaped_logits, labels)
|
||||
|
||||
if not return_dict:
|
||||
output = (reshaped_logits, ) + outputs[1:]
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return MultipleChoiceModelOutput(
|
||||
loss=loss,
|
||||
logits=reshaped_logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
@@ -13,8 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" DeBERTa-v2 model configuration, mainly copied from :class:`~transformers.DebertaV2Config`"""
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union

from transformers import PretrainedConfig

230 modelscope/models/nlp/deberta_v2/fill_mask.py Normal file
@@ -0,0 +1,230 @@
|
||||
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
|
||||
# Copyright 2020 Microsoft and the Hugging Face Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers.activations import ACT2FN
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import AttentionFillMaskModelOutput
|
||||
from modelscope.utils.constant import Tasks
|
||||
from .backbone import DebertaV2Model, DebertaV2PreTrainedModel
|
||||
|
||||
|
||||
# Copied from transformers.models.deberta.modeling_deberta.DebertaForMaskedLM with Deberta->DebertaV2
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
|
||||
class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel):
|
||||
r"""DeBERTa_v2 Model with a `language modeling` head on top.
|
||||
|
||||
The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's built
on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and an enhanced mask decoder. With those two
improvements, it outperforms BERT/RoBERTa on a majority of tasks with 80GB pretraining data.
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Preprocessor:
|
||||
This is the fill_mask model of Deberta_v2, the preprocessor of this model
|
||||
is `modelscope.preprocessors.NLPPreprocessor`.
|
||||
|
||||
Parameters:
|
||||
config (`DebertaV2Config`): Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration.
|
||||
"""
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'position_ids', r'predictions.decoder.bias'
|
||||
]
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config)
|
||||
|
||||
self.deberta = DebertaV2Model(config)
|
||||
self.cls = DebertaV2OnlyMLMHead(config)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.cls.predictions.decoder
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
token_type_ids: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
inputs_embeds: Optional[torch.Tensor] = None,
|
||||
labels: Optional[torch.Tensor] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple, AttentionFillMaskModelOutput]:
|
||||
r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
attention_mask (`torch.FloatTensor` of shape `('batch_size, sequence_length')`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
token_type_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
|
||||
1]`:
|
||||
|
||||
- 0 corresponds to a *sentence A* token,
|
||||
- 1 corresponds to a *sentence B* token.
|
||||
|
||||
position_ids (`torch.LongTensor` of shape `('batch_size, sequence_length')`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings.
|
||||
Selected in the range `[0, config.max_position_embeddings - 1]`.
|
||||
|
||||
inputs_embeds (`torch.FloatTensor` of shape `('batch_size, sequence_length', hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert *input_ids* indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a dataclass instead of a plain tuple.
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
|
||||
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
|
||||
ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
|
||||
|
||||
Returns:
|
||||
Returns `modelscope.outputs.AttentionFillMaskModelOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_debertav2_fill-mask_chinese-lite')
|
||||
>>> # Call the model, return some tensors
|
||||
>>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。')))
|
||||
>>> # Call the pipeline
|
||||
>>> from modelscope.pipelines import pipeline
|
||||
>>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor)
|
||||
>>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。'))
|
||||
"""
|
||||
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.deberta(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.cls(sequence_output)
|
||||
|
||||
masked_lm_loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(
|
||||
prediction_scores.view(-1, self.config.vocab_size),
|
||||
labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores, ) + outputs[1:]
|
||||
return ((masked_lm_loss, )
|
||||
+ output) if masked_lm_loss is not None else output
|
||||
|
||||
return AttentionFillMaskModelOutput(
|
||||
loss=masked_lm_loss,
|
||||
logits=prediction_scores,
|
||||
input_ids=input_ids,
|
||||
attentions=outputs.attentions,
|
||||
hidden_states=outputs.hidden_states)
|
||||
|
||||
|
||||
# copied from transformers.models.bert.BertPredictionHeadTransform with bert -> deberta
|
||||
class DebertaV2PredictionHeadTransform(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.transform_act_fn(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
# copied from transformers.models.bert.BertLMPredictionHead with bert -> deberta
|
||||
class DebertaV2LMPredictionHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.transform = DebertaV2PredictionHeadTransform(config)
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.decoder = nn.Linear(
|
||||
config.hidden_size, config.vocab_size, bias=False)
|
||||
|
||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||
|
||||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta
|
||||
class DebertaV2OnlyMLMHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.predictions = DebertaV2LMPredictionHead(config)
|
||||
|
||||
def forward(self, sequence_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
return prediction_scores
|
||||
@@ -24,7 +24,7 @@ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from modelscope.utils import logger as logging

if is_sentencepiece_available():
    from .tokenization_deberta_v2 import DebertaV2Tokenizer
    from .tokenization import DebertaV2Tokenizer
else:
    DebertaV2Tokenizer = None

@@ -4,16 +4,16 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .configuration_gpt3 import GPT3Config
    from .modeling_gpt3 import GPT3Model
    from .gpt3_for_text_generation import GPT3ForTextGeneration
    from .tokenizer_gpt3 import JiebaBPETokenizer
    from .configuration import GPT3Config
    from .backbone import GPT3Model
    from .text_generation import GPT3ForTextGeneration
    from .tokenizer import JiebaBPETokenizer
else:
    _import_structure = {
        'configuration_gpt3': ['GPT3Config'],
        'modeling_gpt3': ['GPT3Model'],
        'gpt3_for_text_generation': ['GPT3ForTextGeneration'],
        'tokenizer_gpt3': ['JiebaBPETokenizer'],
        'configuration': ['GPT3Config'],
        'backbone': ['GPT3Model'],
        'text_generation': ['GPT3ForTextGeneration'],
        'tokenizer': ['JiebaBPETokenizer'],
    }

    import sys

@@ -24,7 +24,7 @@ from torch.nn import functional as F
from transformers.modeling_utils import PreTrainedModel

from modelscope.utils.constant import ModelFile
from .configuration_gpt3 import GPT3Config
from .configuration import GPT3Config


class GPT3SelfAttention(nn.Module):
@@ -4,14 +4,12 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .structbert import SbertModel
    from .backbone import GPTNeoModel
else:
    _import_structure = {
        'structbert': ['SbertModel'],
        'backbone': ['GPTNeoModel'],
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
@@ -4,10 +4,11 @@ from transformers import GPTNeoModel as GPTNeoModelTransform

from modelscope.metainfo import Models
from modelscope.models.builder import BACKBONES
from modelscope.utils.constant import Fields
from modelscope.utils.constant import Tasks


@BACKBONES.register_module(group_key=Fields.nlp, module_name=Models.gpt_neo)
@BACKBONES.register_module(
    group_key=Tasks.backbone, module_name=Models.gpt_neo)
class GPTNeoModel(GPTNeoModelTransform):

    def __init__(self, **kwargs):
@@ -37,9 +37,9 @@ class TokenClassificationHead(TorchHead):
        sequence_output = inputs
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        return {OutputKeys.LOGITS: logits}
        return logits

    def compute_loss(self, outputs: Dict[str, torch.Tensor],
                     labels) -> Dict[str, torch.Tensor]:
        logits = outputs[OutputKeys.LOGITS]
        return {OutputKeys.LOSS: F.cross_entropy(logits, labels)}
        return F.cross_entropy(logits, labels)

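The hunk above changes the head to return the raw logits tensor from forward and the raw loss tensor from compute_loss instead of wrapping them in `OutputKeys` dicts. A minimal sketch of the resulting call pattern, using a simplified stand-in head rather than the real `TokenClassificationHead` (sizes and dropout below are invented for illustration):

# Stand-in mirroring the updated interface: plain tensors in, plain tensors out.
import torch
import torch.nn.functional as F
from torch import nn


class ToyTokenClassificationHead(nn.Module):

    def __init__(self, hidden_size=8, num_labels=3, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, inputs):
        sequence_output = self.dropout(inputs)
        return self.classifier(sequence_output)   # raw logits, no dict wrapper

    def compute_loss(self, logits, labels):
        return F.cross_entropy(logits, labels)    # raw loss tensor


head = ToyTokenClassificationHead()
hidden = torch.randn(5, 8)                        # (num_tokens, hidden_size)
labels = torch.randint(0, 3, (5, ))
print(head.compute_loss(head(hidden), labels))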
@@ -1,164 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.nlp.bert import \
|
||||
BertForMaskedLM as BertForMaskedLMTransformer
|
||||
from modelscope.models.nlp.deberta_v2 import \
|
||||
DebertaV2ForMaskedLM as DebertaV2ForMaskedLMTransformer
|
||||
from modelscope.models.nlp.structbert import SbertForMaskedLM
|
||||
from modelscope.models.nlp.veco import \
|
||||
VecoForMaskedLM as VecoForMaskedLMTransformer
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
__all__ = ['BertForMaskedLM', 'StructBertForMaskedLM', 'VecoForMaskedLM']
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.structbert)
|
||||
class StructBertForMaskedLM(TorchModel, SbertForMaskedLM):
|
||||
"""Structbert for MLM model.
|
||||
|
||||
Inherited from structbert.SbertForMaskedLM and TorchModel, so this class can be registered into Model sets.
|
||||
"""
|
||||
|
||||
def __init__(self, config, model_dir):
|
||||
super(TorchModel, self).__init__(model_dir)
|
||||
SbertForMaskedLM.__init__(self, config)
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
labels=None):
|
||||
output = SbertForMaskedLM.forward(
|
||||
self,
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
labels=labels)
|
||||
output[OutputKeys.INPUT_IDS] = input_ids
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
model_dir = kwargs.get('model_dir')
|
||||
return super(SbertForMaskedLM, StructBertForMaskedLM).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir, model_dir=model_dir)
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.bert)
|
||||
class BertForMaskedLM(TorchModel, BertForMaskedLMTransformer):
|
||||
"""Bert for MLM model.
|
||||
|
||||
Inherited from transformers.BertForMaskedLM and TorchModel, so this class can be registered into Model sets.
|
||||
"""
|
||||
|
||||
def __init__(self, config, model_dir):
|
||||
super(TorchModel, self).__init__(model_dir)
|
||||
BertForMaskedLMTransformer.__init__(self, config)
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
labels=None):
|
||||
output = BertForMaskedLMTransformer.forward(
|
||||
self,
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
labels=labels)
|
||||
output[OutputKeys.INPUT_IDS] = input_ids
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
model_dir = kwargs.get('model_dir')
|
||||
return super(BertForMaskedLMTransformer,
|
||||
BertForMaskedLM).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir,
|
||||
model_dir=model_dir)
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.veco)
|
||||
class VecoForMaskedLM(TorchModel, VecoForMaskedLMTransformer):
|
||||
"""Veco for MLM model.
|
||||
|
||||
Inherited from veco.VecoForMaskedLM and TorchModel, so this class can be registered into Model sets.
|
||||
"""
|
||||
|
||||
def __init__(self, config, model_dir):
|
||||
super(TorchModel, self).__init__(model_dir)
|
||||
VecoForMaskedLMTransformer.__init__(self, config)
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
labels=None):
|
||||
output = VecoForMaskedLMTransformer.forward(
|
||||
self,
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
labels=labels)
|
||||
output[OutputKeys.INPUT_IDS] = input_ids
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
model_dir = kwargs.get('model_dir')
|
||||
return super(VecoForMaskedLMTransformer,
|
||||
VecoForMaskedLM).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir,
|
||||
model_dir=model_dir)
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.deberta_v2)
|
||||
class DebertaV2ForMaskedLM(TorchModel, DebertaV2ForMaskedLMTransformer):
|
||||
"""Deberta v2 for MLM model.
|
||||
|
||||
Inherited from deberta_v2.DebertaV2ForMaskedLM and TorchModel, so this class can be registered into Model sets.
|
||||
"""
|
||||
|
||||
def __init__(self, config, model_dir):
|
||||
super(TorchModel, self).__init__(model_dir)
|
||||
DebertaV2ForMaskedLMTransformer.__init__(self, config)
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
labels=None):
|
||||
output = DebertaV2ForMaskedLMTransformer.forward(
|
||||
self,
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
labels=labels)
|
||||
output[OutputKeys.INPUT_IDS] = input_ids
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
model_dir = kwargs.get('model_dir')
|
||||
return super(DebertaV2ForMaskedLMTransformer,
|
||||
DebertaV2ForMaskedLM).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir,
|
||||
model_dir=model_dir)
|
||||
@@ -17,19 +17,19 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .configuration_palm import PalmConfig
    from .modeling_palm import (
    from .configuration import PalmConfig
    from .backbone import (
        AbsSummarizer,
        PalmForConditionalGeneration,
        Translator,
    )
    from .palm_for_text_generation import PalmForTextGeneration
    from .text_generation import PalmForTextGeneration
else:
    _import_structure = {
        'configuration_palm': ['PalmConfig'],
        'modeling_palm':
        'configuration': ['PalmConfig'],
        'backbone':
        ['AbsSummarizer', 'PalmForConditionalGeneration', 'Translator'],
        'palm_for_text_generation': ['PalmForTextGeneration'],
        'text_generation': ['PalmForTextGeneration'],
    }

    import sys

@@ -35,7 +35,7 @@ from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel

from modelscope.utils import logger as logging
from .configuration_palm import PalmConfig
from .configuration import PalmConfig
from .dureader_eval import compute_bleu_rouge, normalize

CONFIG_NAME = 'config.json'
@@ -4,13 +4,13 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .configuration_plug import PlugNLGConfig
    from .modeling_plug import PlugModel
    from .configuration import PlugNLGConfig
    from .backbone import PlugModel
    from .distributed_plug import DistributedPlug
else:
    _import_structure = {
        'configuration_plug': ['PlugNLGConfig'],
        'modeling_plug': ['PlugModel'],
        'configuration': ['PlugNLGConfig'],
        'backbone': ['PlugModel'],
        'distributed_plug': ['DistributedPlug'],
    }


@@ -28,7 +28,7 @@ from torch import nn
|
||||
|
||||
from modelscope.utils.nlp.distributed import (normal_init_method,
|
||||
scaled_init_method)
|
||||
from .configuration_plug import PlugNLGConfig, PlugNLUConfig
|
||||
from .configuration import PlugNLGConfig, PlugNLUConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
@@ -14,7 +15,7 @@ from modelscope.utils.nlp.distributed import initialize_distributed
|
||||
from modelscope.utils.nlp.load_checkpoint import pre_load
|
||||
from modelscope.utils.torch_utils import set_random_seed_mpu
|
||||
from . import PlugModel
|
||||
from .configuration_plug import PlugNLGConfig
|
||||
from .configuration import PlugNLGConfig
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@@ -18,16 +18,16 @@ from typing import TYPE_CHECKING
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_ponet import PoNetConfig
|
||||
from .modeling_ponet import (PoNetForMaskedLM, PoNetModel,
|
||||
PoNetPreTrainedModel)
|
||||
from .tokenization_ponet import PoNetTokenizer
|
||||
from .configuration import PoNetConfig
|
||||
from .backbone import (PoNetModel, PoNetPreTrainedModel)
|
||||
from .tokenization import PoNetTokenizer
|
||||
from .fill_mask import PoNetForMaskedLM
|
||||
else:
|
||||
_import_structure = {
|
||||
'configuration_ponet': ['PoNetConfig'],
|
||||
'modeling_ponet':
|
||||
['PoNetForMaskedLM', 'PoNetModel', 'PoNetPreTrainedModel'],
|
||||
'tokenization_ponet': ['PoNetTokenizer'],
|
||||
'configuration': ['PoNetConfig'],
|
||||
'backbone': ['PoNetModel', 'PoNetPreTrainedModel'],
|
||||
'fill_mask': ['PoNetForMaskedLM'],
|
||||
'tokenization': ['PoNetTokenizer'],
|
||||
}
|
||||
|
||||
import sys
|
||||
|
||||
@@ -16,43 +16,32 @@
|
||||
"""PyTorch PoNet model. """
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from distutils.version import LooseVersion
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from packaging import version
|
||||
from torch import nn
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.file_utils import (ModelOutput, add_code_sample_docstrings,
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
replace_return_docstrings)
|
||||
from transformers.modeling_outputs import (
|
||||
BaseModelOutputWithPastAndCrossAttentions,
|
||||
BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
CausalLMOutputWithCrossAttentions, MaskedLMOutput,
|
||||
SequenceClassifierOutput, TokenClassifierOutput)
|
||||
from transformers.modeling_outputs import \
|
||||
BaseModelOutputWithPastAndCrossAttentions
|
||||
from transformers.modeling_utils import (PreTrainedModel,
|
||||
apply_chunking_to_forward,
|
||||
find_pruneable_heads_and_indices,
|
||||
prune_linear_layer)
|
||||
from transformers.models.bert.modeling_bert import \
|
||||
load_tf_weights_in_bert as load_tf_weights_in_ponet
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models import Model, TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import AttentionBackboneModelOutput
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .configuration_ponet import PoNetConfig
|
||||
from .configuration import PoNetConfig
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
is_pytorch_12plus = LooseVersion(torch.__version__) >= LooseVersion('1.12.0')
|
||||
|
||||
_CHECKPOINT_FOR_DOC = 'ponet-base-uncased'
|
||||
_CONFIG_FOR_DOC = 'PoNetConfig'
|
||||
_TOKENIZER_FOR_DOC = 'PoNetTokenizer'
|
||||
|
||||
CLS_ID = 101
|
||||
EOS_ID = 102
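# Note (assumption): 101 and 102 are the [CLS] and [SEP] ids of the standard BERT vocabulary used by
# PoNetTokenizer; PoNet's segment-aware pooling presumably treats them as sentence delimiters, but their
# actual use falls outside the lines shown in this hunk.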
|
||||
|
||||
@@ -609,82 +598,20 @@ class PoNetPooler(nn.Module):
|
||||
return pooled_output
|
||||
|
||||
|
||||
class PoNetPredictionHeadTransform(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.transform_act_fn(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class PoNetLMPredictionHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.transform = PoNetPredictionHeadTransform(config)
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.decoder = nn.Linear(
|
||||
config.hidden_size, config.vocab_size, bias=False)
|
||||
|
||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||
|
||||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class PoNetOnlyMLMHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.predictions = PoNetLMPredictionHead(config)
|
||||
|
||||
def forward(self, sequence_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
return prediction_scores
|
||||
|
||||
|
||||
class PoNetPreTrainingHeads(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.predictions = PoNetLMPredictionHead(config)
|
||||
self.seq_relationship = nn.Linear(config.hidden_size, 3)
|
||||
|
||||
def forward(self, sequence_output, pooled_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
seq_relationship_score = self.seq_relationship(pooled_output)
|
||||
return prediction_scores, seq_relationship_score
|
||||
|
||||
|
||||
class PoNetPreTrainedModel(PreTrainedModel):
|
||||
class PoNetPreTrainedModel(TorchModel, PreTrainedModel):
|
||||
"""
|
||||
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
||||
models.
|
||||
"""
|
||||
|
||||
config_class = PoNetConfig
|
||||
load_tf_weights = load_tf_weights_in_ponet
|
||||
base_model_prefix = 'ponet'
|
||||
_keys_to_ignore_on_load_missing = [r'position_ids']
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config.name_or_path, **kwargs)
|
||||
super(Model, self).__init__(config)
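The two `super()` calls rely on the method resolution order of the dual-inheritance class: the plain `super()` reaches the ModelScope `TorchModel` branch (recording the model directory), while `super(Model, self)` skips past the ModelScope bases and lands on `transformers.PreTrainedModel`. A conceptual sketch, not verbatim interpreter output:
>>> PoNetPreTrainedModel.__mro__
(PoNetPreTrainedModel, TorchModel, Model, PreTrainedModel, torch.nn.Module, ..., object)
>>> # super().__init__(config.name_or_path)  -> TorchModel side, stores the model directory
>>> # super(Model, self).__init__(config)    -> PreTrainedModel side, builds the HF-style module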
|
||||
|
||||
def _init_weights(self, module):
|
||||
"""Initialize the weights"""
|
||||
if isinstance(module, nn.Linear):
|
||||
@@ -703,51 +630,22 @@ class PoNetPreTrainedModel(PreTrainedModel):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PoNetForPreTrainingOutput(ModelOutput):
|
||||
"""
|
||||
Output type of :class:`~transformers.PoNetForPreTraining`.
|
||||
|
||||
Args:
|
||||
loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction
|
||||
(classification) loss.
|
||||
mlm_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
|
||||
Masked language modeling loss.
|
||||
sop_loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`):
|
||||
sop loss.
|
||||
prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
|
||||
Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
|
||||
before SoftMax).
|
||||
hidden_states
|
||||
(:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed
|
||||
or when ``config.output_hidden_states=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
|
||||
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed
|
||||
or when ``config.output_attentions=True``):
|
||||
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
|
||||
sequence_length, sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
mlm_loss: Optional[torch.FloatTensor] = None
|
||||
sop_loss: Optional[torch.FloatTensor] = None
|
||||
prediction_logits: torch.FloatTensor = None
|
||||
seq_relationship_logits: torch.FloatTensor = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
model_dir = kwargs.pop('model_dir', None)
|
||||
if model_dir is None:
|
||||
ponet_config = PoNetConfig(**kwargs)
|
||||
model = cls(ponet_config)
|
||||
else:
|
||||
model = super(
|
||||
Model,
|
||||
cls).from_pretrained(pretrained_model_name_or_path=model_dir)
|
||||
return model
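A sketch of the two construction paths this hook exposes; the config values and checkpoint path are illustrative, and in normal use the call is made for you by `Model.from_pretrained`:
>>> # no model_dir: kwargs become a PoNetConfig and the weights stay randomly initialized
>>> backbone = PoNetModel._instantiate(vocab_size=21128, hidden_size=768, num_hidden_layers=12)
>>> # with model_dir: weights are loaded through the transformers from_pretrained machinery
>>> backbone = PoNetModel._instantiate(model_dir='/path/to/ponet-checkpoint')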
|
||||
|
||||
|
||||
PONET_START_DOCSTRING = r"""
|
||||
@MODELS.register_module(Tasks.backbone, module_name=Models.ponet)
|
||||
class PoNetModel(PoNetPreTrainedModel):
|
||||
"""The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.
|
||||
|
||||
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
|
||||
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
|
||||
@@ -763,65 +661,6 @@ PONET_START_DOCSTRING = r"""
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
|
||||
weights.
|
||||
"""
|
||||
|
||||
PONET_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
`What are input IDs? <../glossary.html#input-ids>`__
|
||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||
1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
`What are token type IDs? <../glossary.html#token-type-ids>`_
|
||||
position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
|
||||
`What are position IDs? <../glossary.html#position-ids>`_
|
||||
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
'The bare PoNet Model transformer outputting raw hidden-states without any specific head on top.',
|
||||
PONET_START_DOCSTRING,
|
||||
)
|
||||
class PoNetModel(PoNetPreTrainedModel):
|
||||
"""
|
||||
|
||||
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
||||
cross-attention is added between the self-attention layers, following the architecture described in `Attention is
|
||||
@@ -834,8 +673,8 @@ class PoNetModel(PoNetPreTrainedModel):
|
||||
input to the forward pass.
|
||||
"""
|
||||
|
||||
def __init__(self, config, add_pooling_layer=True):
|
||||
super().__init__(config)
|
||||
def __init__(self, config, add_pooling_layer=True, **kwargs):
|
||||
super().__init__(config, **kwargs)
|
||||
self.config = config
|
||||
|
||||
self.embeddings = PoNetEmbeddings(config)
|
||||
@@ -859,14 +698,6 @@ class PoNetModel(PoNetPreTrainedModel):
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
@@ -885,6 +716,49 @@ class PoNetModel(PoNetPreTrainedModel):
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||
1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
|
||||
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
||||
encoder_hidden_states
|
||||
(:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
||||
@@ -906,6 +780,16 @@ class PoNetModel(PoNetPreTrainedModel):
|
||||
use_cache (:obj:`bool`, `optional`):
|
||||
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
|
||||
decoding (see :obj:`past_key_values`).
|
||||
|
||||
Returns:
|
||||
Returns `modelscope.outputs.AttentionBackboneModelOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base', task='backbone')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base')
|
||||
>>> print(model(**preprocessor('这是个测试')))
|
||||
"""
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
@@ -1006,7 +890,7 @@ class PoNetModel(PoNetPreTrainedModel):
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
|
||||
return BaseModelOutputWithPoolingAndCrossAttentions(
|
||||
return AttentionBackboneModelOutput(
|
||||
last_hidden_state=sequence_output,
|
||||
pooler_output=pooled_output,
|
||||
past_key_values=encoder_outputs.past_key_values,
|
||||
@@ -1014,578 +898,3 @@ class PoNetModel(PoNetPreTrainedModel):
|
||||
attentions=encoder_outputs.attentions,
|
||||
cross_attentions=encoder_outputs.cross_attentions,
|
||||
)
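`AttentionBackboneModelOutput` is the ModelScope-native counterpart of the transformers return type it replaces here. Continuing the docstring example above, the usual fields are reachable as attributes (a sketch, assuming a single preprocessed sentence):
>>> out = model(**preprocessor('这是个测试'))
>>> out.last_hidden_state.shape   # (batch_size, sequence_length, hidden_size)
>>> out.pooler_output.shape       # (batch_size, hidden_size)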
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
PoNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
|
||||
sentence prediction (classification)` head.
|
||||
""",
|
||||
PONET_START_DOCSTRING,
|
||||
)
|
||||
class PoNetForPreTraining(PoNetPreTrainedModel):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
self.ponet = PoNetModel(config)
|
||||
self.cls = PoNetPreTrainingHeads(config)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.cls.predictions.decoder
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@replace_return_docstrings(
|
||||
output_type=PoNetForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
segment_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
next_sentence_label=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
|
||||
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
|
||||
config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
|
||||
next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`):
|
||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
|
||||
(see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``:
|
||||
|
||||
- 0 indicates sequence B is a continuation of sequence A,
|
||||
- 1 indicates sequence B is a random sequence.
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
|
||||
Example::
|
||||
|
||||
>>> from transformers import PoNetTokenizer, PoNetForPreTraining
|
||||
>>> import torch
|
||||
|
||||
>>> tokenizer = PoNetTokenizer.from_pretrained('ponet-base-uncased')
|
||||
>>> model = PoNetForPreTraining.from_pretrained('ponet-base-uncased')
|
||||
|
||||
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
||||
>>> outputs = model(**inputs)
|
||||
|
||||
>>> prediction_logits = outputs.prediction_logits
|
||||
>>> seq_relationship_logits = outputs.seq_relationship_logits
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.ponet(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
segment_ids=segment_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output, pooled_output = outputs[:2]
|
||||
prediction_scores, seq_relationship_score = self.cls(
|
||||
sequence_output, pooled_output)
|
||||
|
||||
total_loss = None
|
||||
masked_lm_loss = None
|
||||
next_sentence_loss = None
|
||||
if labels is not None and next_sentence_label is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(
|
||||
prediction_scores.view(-1, self.config.vocab_size),
|
||||
labels.view(-1))
|
||||
next_sentence_loss = loss_fct(
|
||||
seq_relationship_score.view(-1, 3),
|
||||
next_sentence_label.view(-1))
|
||||
total_loss = masked_lm_loss + next_sentence_loss
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores, seq_relationship_score) + outputs[2:]
|
||||
return ((total_loss, masked_lm_loss, next_sentence_loss)
|
||||
+ output) if total_loss is not None else output
|
||||
|
||||
return PoNetForPreTrainingOutput(
|
||||
loss=total_loss,
|
||||
mlm_loss=masked_lm_loss,
|
||||
sop_loss=next_sentence_loss,
|
||||
prediction_logits=prediction_scores,
|
||||
seq_relationship_logits=seq_relationship_score,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""PoNet Model with a `language modeling` head on top for CLM fine-tuning. """,
|
||||
PONET_START_DOCSTRING)
|
||||
class PoNetLMHeadModel(PoNetPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'position_ids', r'predictions.decoder.bias'
|
||||
]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
if not config.is_decoder:
|
||||
logger.warning(
|
||||
'If you want to use `PoNetLMHeadModel` as a standalone, add `is_decoder=True.`'
|
||||
)
|
||||
|
||||
self.ponet = PoNetModel(config, add_pooling_layer=False)
|
||||
self.cls = PoNetOnlyMLMHead(config)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.cls.predictions.decoder
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@replace_return_docstrings(
|
||||
output_type=CausalLMOutputWithCrossAttentions,
|
||||
config_class=_CONFIG_FOR_DOC)
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
segment_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
labels=None,
|
||||
past_key_values=None,
|
||||
use_cache=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:
|
||||
`(batch_size, sequence_length, hidden_size)`, `optional`):
|
||||
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
||||
the model is configured as a decoder.
|
||||
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
|
||||
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
|
||||
``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
|
||||
ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
|
||||
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers`
|
||||
with each tuple having 4 tensors of shape :
|
||||
obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
||||
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
|
||||
|
||||
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
|
||||
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
|
||||
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
|
||||
use_cache (:obj:`bool`, `optional`):
|
||||
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
|
||||
decoding (see :obj:`past_key_values`).
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
if labels is not None:
|
||||
use_cache = False
|
||||
|
||||
outputs = self.ponet(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
segment_ids=segment_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.cls(sequence_output)
|
||||
|
||||
lm_loss = None
|
||||
if labels is not None:
|
||||
# we are doing next-token prediction; shift prediction scores and input ids by one
|
||||
shifted_prediction_scores = prediction_scores[:, :
|
||||
-1, :].contiguous()
|
||||
labels = labels[:, 1:].contiguous()
|
||||
loss_fct = CrossEntropyLoss()
|
||||
lm_loss = loss_fct(
|
||||
shifted_prediction_scores.view(-1, self.config.vocab_size),
|
||||
labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores, ) + outputs[2:]
|
||||
return ((lm_loss, ) + output) if lm_loss is not None else output
|
||||
|
||||
return CausalLMOutputWithCrossAttentions(
|
||||
loss=lm_loss,
|
||||
logits=prediction_scores,
|
||||
past_key_values=outputs.past_key_values,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
cross_attentions=outputs.cross_attentions,
|
||||
)
|
||||
|
||||
def prepare_inputs_for_generation(self,
|
||||
input_ids,
|
||||
past=None,
|
||||
attention_mask=None,
|
||||
**model_kwargs):
|
||||
input_shape = input_ids.shape
|
||||
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
|
||||
if attention_mask is None:
|
||||
attention_mask = input_ids.new_ones(input_shape)
|
||||
|
||||
# cut decoder_input_ids if past is used
|
||||
if past is not None:
|
||||
input_ids = input_ids[:, -1:]
|
||||
|
||||
return {
|
||||
'input_ids': input_ids,
|
||||
'attention_mask': attention_mask,
|
||||
'past_key_values': past
|
||||
}
|
||||
|
||||
def _reorder_cache(self, past, beam_idx):
|
||||
reordered_past = ()
|
||||
for layer_past in past:
|
||||
reordered_past += (tuple(
|
||||
past_state.index_select(0, beam_idx)
|
||||
for past_state in layer_past), )
|
||||
return reordered_past
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""PoNet Model with a `language modeling` head on top. """,
|
||||
PONET_START_DOCSTRING)
|
||||
class PoNetForMaskedLM(PoNetPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'position_ids', r'predictions.decoder.bias'
|
||||
]
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
if config.is_decoder:
|
||||
logger.warning(
|
||||
'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for '
|
||||
'bi-directional self-attention.')
|
||||
|
||||
self.ponet = PoNetModel(config, add_pooling_layer=False)
|
||||
self.cls = PoNetOnlyMLMHead(config)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.cls.predictions.decoder
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=MaskedLMOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
segment_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
|
||||
config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
|
||||
"""
|
||||
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.ponet(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
segment_ids=segment_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.cls(sequence_output)
|
||||
|
||||
masked_lm_loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(
|
||||
prediction_scores.view(-1, self.config.vocab_size),
|
||||
labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores, ) + outputs[2:]
|
||||
return ((masked_lm_loss, )
|
||||
+ output) if masked_lm_loss is not None else output
|
||||
|
||||
return MaskedLMOutput(
|
||||
loss=masked_lm_loss,
|
||||
logits=prediction_scores,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
PoNet Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
|
||||
output) e.g. for GLUE tasks.
|
||||
""",
|
||||
PONET_START_DOCSTRING,
|
||||
)
|
||||
class PoNetForSequenceClassification(PoNetPreTrainedModel):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
self.config = config
|
||||
|
||||
self.ponet = PoNetModel(config)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=SequenceClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
segment_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
|
||||
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
|
||||
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
||||
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.ponet(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
segment_ids=segment_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
pooled_output = outputs[1]
|
||||
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
if self.config.problem_type is None:
|
||||
if self.num_labels == 1:
|
||||
self.config.problem_type = 'regression'
|
||||
elif self.num_labels > 1 and (labels.dtype == torch.long
|
||||
or labels.dtype == torch.int):
|
||||
self.config.problem_type = 'single_label_classification'
|
||||
else:
|
||||
self.config.problem_type = 'multi_label_classification'
|
||||
|
||||
if self.config.problem_type == 'regression':
|
||||
loss_fct = MSELoss()
|
||||
if self.num_labels == 1:
|
||||
loss = loss_fct(logits.squeeze(), labels.squeeze())
|
||||
else:
|
||||
loss = loss_fct(logits, labels)
|
||||
elif self.config.problem_type == 'single_label_classification':
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(
|
||||
logits.view(-1, self.num_labels), labels.view(-1))
|
||||
elif self.config.problem_type == 'multi_label_classification':
|
||||
loss_fct = BCEWithLogitsLoss()
|
||||
loss = loss_fct(logits, labels)
|
||||
if not return_dict:
|
||||
output = (logits, ) + outputs[2:]
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
PoNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
|
||||
Named-Entity-Recognition (NER) tasks.
|
||||
""",
|
||||
PONET_START_DOCSTRING,
|
||||
)
|
||||
class PoNetForTokenClassification(PoNetPreTrainedModel):
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.num_labels = config.num_labels
|
||||
|
||||
self.ponet = PoNetModel(config, add_pooling_layer=False)
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
PONET_INPUTS_DOCSTRING.format('batch_size, sequence_length'))
|
||||
@add_code_sample_docstrings(
|
||||
processor_class=_TOKENIZER_FOR_DOC,
|
||||
checkpoint=_CHECKPOINT_FOR_DOC,
|
||||
output_type=TokenClassifierOutput,
|
||||
config_class=_CONFIG_FOR_DOC,
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
segment_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
|
||||
1]``.
|
||||
"""
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.ponet(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
segment_ids=segment_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
sequence_output = self.dropout(sequence_output)
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
# Only keep active parts of the loss
|
||||
if attention_mask is not None:
|
||||
active_loss = attention_mask.view(-1) == 1
|
||||
active_logits = logits.view(-1, self.num_labels)
|
||||
active_labels = torch.where(
|
||||
active_loss, labels.view(-1),
|
||||
torch.tensor(loss_fct.ignore_index).type_as(labels))
|
||||
loss = loss_fct(active_logits, active_labels)
|
||||
else:
|
||||
loss = loss_fct(
|
||||
logits.view(-1, self.num_labels), labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (logits, ) + outputs[2:]
|
||||
return ((loss, ) + output) if loss is not None else output
|
||||
|
||||
return TokenClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
@@ -34,8 +34,7 @@ class PoNetConfig(PretrainedConfig):
|
||||
Args:
|
||||
vocab_size (:obj:`int`, `optional`, defaults to 30522):
|
||||
Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
|
||||
:obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
|
||||
:class:`~transformers.TFBertModel`.
|
||||
:obj:`inputs_ids` passed.
|
||||
hidden_size (:obj:`int`, `optional`, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
|
||||
@@ -55,8 +54,7 @@ class PoNetConfig(PretrainedConfig):
|
||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
|
||||
The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
|
||||
:class:`~transformers.TFBertModel`.
|
||||
The vocabulary size of the :obj:`token_type_ids` passed.
|
||||
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
|
||||
252
modelscope/models/nlp/ponet/fill_mask.py
Normal file
@@ -0,0 +1,252 @@
|
||||
# Copyright 2021-2022 The Alibaba DAMO Team Authors.
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers.activations import ACT2FN
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.outputs import AttentionFillMaskModelOutput
|
||||
from modelscope.utils.constant import Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .backbone import PoNetModel, PoNetPreTrainedModel
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class PoNetPredictionHeadTransform(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
self.LayerNorm = nn.LayerNorm(
|
||||
config.hidden_size, eps=config.layer_norm_eps)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.transform_act_fn(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class PoNetLMPredictionHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.transform = PoNetPredictionHeadTransform(config)
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.decoder = nn.Linear(
|
||||
config.hidden_size, config.vocab_size, bias=False)
|
||||
|
||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||
|
||||
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
||||
self.decoder.bias = self.bias
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
hidden_states = self.decoder(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class PoNetOnlyMLMHead(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.predictions = PoNetLMPredictionHead(config)
|
||||
|
||||
def forward(self, sequence_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
return prediction_scores
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet)
|
||||
class PoNetForMaskedLM(PoNetPreTrainedModel):
|
||||
r"""PoNet Model with a `language modeling` head on top.
|
||||
|
||||
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
|
||||
methods the library implements for all its models (such as downloading or saving, resizing the input embeddings,
|
||||
pruning heads etc.)
|
||||
|
||||
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
|
||||
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
|
||||
general usage and behavior.
|
||||
|
||||
Preprocessor:
|
||||
This is the fill_mask model of PoNet; the preprocessor of this model
|
||||
is `modelscope.preprocessors.FillMaskPoNetPreprocessor`.
|
||||
|
||||
Parameters:
|
||||
config (:class:`~modelscope.models.nlp.ponet.PoNetConfig`):
|
||||
Model configuration class with all the parameters of the model.
|
||||
Initializing with a config file does not load the weights associated with the model, only the
|
||||
configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
|
||||
weights.
|
||||
"""
|
||||
|
||||
_keys_to_ignore_on_load_unexpected = [r'pooler']
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r'position_ids', r'predictions.decoder.bias'
|
||||
]
|
||||
|
||||
def __init__(self, config, **kwargs):
|
||||
super().__init__(config)
|
||||
|
||||
if config.is_decoder:
|
||||
logger.warning(
|
||||
'If you want to use `PoNetForMaskedLM` make sure `config.is_decoder=False` for '
|
||||
'bi-directional self-attention.')
|
||||
|
||||
self.ponet = PoNetModel(config, add_pooling_layer=False)
|
||||
self.cls = PoNetOnlyMLMHead(config)
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.cls.predictions.decoder
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.cls.predictions.decoder = new_embeddings
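The `get_output_embeddings`/`set_output_embeddings` pair is what lets the transformers machinery tie the MLM decoder to the input embedding matrix. A quick check, assuming the backbone exposes the standard `get_input_embeddings` hook and that weight tying is enabled in the config:
>>> model.get_output_embeddings().weight is model.get_input_embeddings().weight
True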
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
segment_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary.
|
||||
|
||||
Indices can be obtained using :class:`~modelscope.models.nlp.ponet.PoNetTokenizer`. See
|
||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||
details.
|
||||
|
||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||
1]``:
|
||||
|
||||
- 0 corresponds to a `sentence A` token,
|
||||
- 1 corresponds to a `sentence B` token.
|
||||
|
||||
position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
|
||||
config.max_position_embeddings - 1]``.
|
||||
|
||||
head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
|
||||
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
|
||||
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`,
|
||||
`optional`):
|
||||
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
|
||||
vectors than the model's internal embedding lookup matrix.
|
||||
output_attentions (:obj:`bool`, `optional`):
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
|
||||
config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
|
||||
|
||||
Returns:
|
||||
Returns `modelscope.outputs.AttentionFillMaskModelOutput`
|
||||
|
||||
Examples:
|
||||
>>> from modelscope.models import Model
|
||||
>>> from modelscope.preprocessors import Preprocessor
|
||||
>>> model = Model.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base')
|
||||
>>> preprocessor = Preprocessor.from_pretrained('damo/nlp_ponet_fill-mask_chinese-base')
|
||||
>>> # Call the model, return some tensors
|
||||
>>> print(model(**preprocessor('你师父差得动你,你师父可[MASK]不动我。')))
|
||||
>>> # Call the pipeline
|
||||
>>> from modelscope.pipelines import pipeline
|
||||
>>> pipeline_ins = pipeline('fill-mask', model=model, preprocessor=preprocessor)
|
||||
>>> print(pipeline_ins('你师父差得动你,你师父可[MASK]不动我。'))
|
||||
"""
|
||||
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
outputs = self.ponet(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
segment_ids=segment_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.cls(sequence_output)
|
||||
|
||||
masked_lm_loss = None
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(
|
||||
prediction_scores.view(-1, self.config.vocab_size),
|
||||
labels.view(-1))
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores, ) + outputs[2:]
|
||||
return ((masked_lm_loss, )
|
||||
+ output) if masked_lm_loss is not None else output
|
||||
|
||||
return AttentionFillMaskModelOutput(
|
||||
loss=masked_lm_loss,
|
||||
logits=prediction_scores,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
input_ids=input_ids,
|
||||
)
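The fill-mask pipeline decodes this output automatically; when the raw `AttentionFillMaskModelOutput` is consumed directly, the masked positions can be read back along these lines (a sketch, assuming `tokenizer` is the matching `PoNetTokenizer` and `inputs` is a tensor dict produced by the preprocessor):
>>> out = model(**inputs)
>>> mask_pos = (inputs['input_ids'] == tokenizer.mask_token_id).nonzero()
>>> pred_ids = out.logits[mask_pos[:, 0], mask_pos[:, 1]].argmax(dim=-1)
>>> print(tokenizer.convert_ids_to_tokens(pred_ids.tolist()))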
|
||||
@@ -19,6 +19,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
|
||||
|
||||
from transformers.file_utils import PaddingStrategy
|
||||
from transformers.models.bert.tokenization_bert import BertTokenizer
|
||||
from transformers.tokenization_utils import BatchEncoding, EncodedInput
|
||||
|
||||
from modelscope.utils.constant import ModelFile
|
||||
from modelscope.utils.logger import get_logger
|
||||
@@ -1,53 +0,0 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.models.nlp.ponet import \
|
||||
PoNetForMaskedLM as PoNetForMaskedLMTransformer
|
||||
from modelscope.outputs import OutputKeys
|
||||
from modelscope.utils.constant import Tasks
|
||||
|
||||
__all__ = ['PoNetForMaskedLM']
|
||||
|
||||
|
||||
@MODELS.register_module(Tasks.fill_mask, module_name=Models.ponet)
|
||||
class PoNetForMaskedLM(TorchModel, PoNetForMaskedLMTransformer):
|
||||
"""PoNet for MLM model.'.
|
||||
|
||||
Inherited from ponet.PoNetForMaskedLM and TorchModel, so this class can be registered into Model sets.
|
||||
"""
|
||||
|
||||
def __init__(self, config, model_dir):
|
||||
super(TorchModel, self).__init__(model_dir)
|
||||
PoNetForMaskedLMTransformer.__init__(self, config)
|
||||
|
||||
def forward(self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
segment_ids=None,
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
labels=None):
|
||||
output = PoNetForMaskedLMTransformer.forward(
|
||||
self,
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
segment_ids=segment_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask,
|
||||
labels=labels)
|
||||
output[OutputKeys.INPUT_IDS] = input_ids
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def _instantiate(cls, **kwargs):
|
||||
model_dir = kwargs.get('model_dir')
|
||||
return super(PoNetForMaskedLMTransformer,
|
||||
PoNetForMaskedLM).from_pretrained(
|
||||
pretrained_model_name_or_path=model_dir,
|
||||
model_dir=model_dir)
|
||||
@@ -1,74 +0,0 @@

# Copyright (c) Alibaba, Inc. and its affiliates.

from typing import Any, Dict

import numpy as np

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.nlp.structbert import SbertPreTrainedModel
from modelscope.utils.constant import Tasks

__all__ = ['SentenceEmbedding']


@MODELS.register_module(Tasks.sentence_embedding, module_name=Models.bert)
class SentenceEmbedding(TorchModel, SbertPreTrainedModel):
base_model_prefix: str = 'bert'
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r'position_ids']

def __init__(self, config, model_dir):
super().__init__(model_dir)
self.config = config
setattr(self, self.base_model_prefix, self.build_base_model())

def build_base_model(self):
from .structbert import SbertModel
return SbertModel(self.config, add_pooling_layer=False)

def forward(self, input: Dict[str, Any]) -> Dict[str, np.ndarray]:
"""return the result by the model

Args:
input (Dict[str, Any]): the preprocessed data

Returns:
Dict[str, np.ndarray]: results
Example:
{
'predictions': array([1]), # label 0-negative 1-positive
'probabilities': array([[0.11491239, 0.8850876 ]], dtype=float32),
'logits': array([[-0.53860897, 1.5029076 ]], dtype=float32) # true value
}
"""
return self.base_model(**input)

def postprocess(self, inputs: Dict[str, np.ndarray],
**kwargs) -> Dict[str, np.ndarray]:
embs = inputs['last_hidden_state'][:, 0].cpu().numpy()
num_sent = embs.shape[0]
if num_sent >= 2:
scores = np.dot(embs[0:1, ], np.transpose(embs[1:, ],
(1, 0))).tolist()[0]
else:
scores = []
result = {'text_embedding': embs, 'scores': scores}

return result

@classmethod
def _instantiate(cls, **kwargs):
"""Instantiate the model.

@param kwargs: Input args.
model_dir: The model dir used to load the checkpoint and the label information.
@return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
"""
model_args = {}

return super(SbertPreTrainedModel, SentenceEmbedding).from_pretrained(
pretrained_model_name_or_path=kwargs.get('model_dir'),
model_dir=kwargs.get('model_dir'),
**model_args)
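Note: the deleted `postprocess` treats the first CLS embedding as the query sentence and scores it against the remaining embeddings with a plain dot product (no normalization). A standalone sketch of just that scoring step, on made-up values:

import numpy as np

# embs: one row per sentence, row 0 is the query; the numbers are made up.
embs = np.array([[0.2, 0.1, 0.9],    # query sentence
                 [0.1, 0.0, 1.0],    # candidate 1
                 [0.8, 0.3, 0.1]])   # candidate 2

if embs.shape[0] >= 2:
    # Dot product of the query against every candidate -> one score per candidate.
    scores = np.dot(embs[0:1, :], embs[1:, :].T).tolist()[0]
else:
    scores = []

print(scores)  # [0.92, 0.28]
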
@@ -1,287 +0,0 @@

# Copyright (c) Alibaba, Inc. and its affiliates.

from abc import abstractmethod

from torch import nn

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.models.nlp.bert import BertPreTrainedModel
from modelscope.models.nlp.structbert import SbertPreTrainedModel
from modelscope.models.nlp.veco import \
VecoForSequenceClassification as VecoForSequenceClassificationTransform
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import Tasks
from modelscope.utils.hub import parse_label_mapping
from modelscope.utils.tensor_utils import (torch_nested_detach,
torch_nested_numpify)

__all__ = [
'SbertForSequenceClassification', 'VecoForSequenceClassification',
'BertForSequenceClassification'
]


class SequenceClassificationBase(TorchModel):
"""A sequence classification base class for all the fitted sequence classification models.
"""
base_model_prefix: str = 'bert'

def __init__(self, config, model_dir):
super().__init__(model_dir)
self.num_labels = config.num_labels
self.config = config
setattr(self, self.base_model_prefix, self.build_base_model())
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)

@abstractmethod
def build_base_model(self):
"""Build the backbone model.

Returns: the backbone instance.
"""
pass

@property
def base_model(self):
return getattr(self, self.base_model_prefix)

def forward(self, **kwargs):
labels = None
if OutputKeys.LABEL in kwargs:
labels = kwargs.pop(OutputKeys.LABEL)
elif OutputKeys.LABELS in kwargs:
labels = kwargs.pop(OutputKeys.LABELS)

outputs = self.base_model.forward(**kwargs)

# backbone model should return pooled_output as its second output
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
if labels is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
return {OutputKeys.LOGITS: logits, OutputKeys.LOSS: loss}
return {OutputKeys.LOGITS: logits}

def postprocess(self, input, **kwargs):
logits = input[OutputKeys.LOGITS]
probs = torch_nested_numpify(torch_nested_detach(logits.softmax(-1)))
pred = torch_nested_numpify(torch_nested_detach(logits.argmax(-1)))
logits = torch_nested_numpify(torch_nested_detach(logits))
res = {
OutputKeys.PREDICTIONS: pred,
OutputKeys.PROBABILITIES: probs,
OutputKeys.LOGITS: logits
}
return res

@MODELS.register_module(
Tasks.sentence_similarity, module_name=Models.structbert)
@MODELS.register_module(
Tasks.sentiment_classification, module_name=Models.structbert)
@MODELS.register_module(Tasks.nli, module_name=Models.structbert)
@MODELS.register_module(
Tasks.zero_shot_classification, module_name=Models.structbert)
class SbertForSequenceClassification(SequenceClassificationBase,
SbertPreTrainedModel):
"""Sbert sequence classification model.

Inherited from SequenceClassificationBase.
"""
base_model_prefix: str = 'bert'
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r'position_ids']

def __init__(self, config, model_dir):
if hasattr(config, 'base_model_prefix'):
SbertForSequenceClassification.base_model_prefix = config.base_model_prefix
super().__init__(config, model_dir)

def build_base_model(self):
from .structbert import SbertModel
return SbertModel(self.config, add_pooling_layer=True)

def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
labels=None,
**kwargs):
return super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
labels=labels)

@classmethod
def _instantiate(cls, **kwargs):
"""Instantiate the model.

@param kwargs: Input args.
model_dir: The model dir used to load the checkpoint and the label information.
num_labels: An optional arg to tell the model how many classes to initialize.
Method will call utils.parse_label_mapping if num_labels not supplied.
If num_labels is not found, the model will use the default setting (2 classes).
@return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
"""

model_dir = kwargs.get('model_dir')
num_labels = kwargs.get('num_labels')
if num_labels is None:
label2id = parse_label_mapping(model_dir)
if label2id is not None and len(label2id) > 0:
num_labels = len(label2id)
cls.id2label = {id: label for label, id in label2id.items()}
model_args = {} if num_labels is None else {'num_labels': num_labels}
return super(SbertPreTrainedModel,
SbertForSequenceClassification).from_pretrained(
pretrained_model_name_or_path=kwargs.get('model_dir'),
model_dir=kwargs.get('model_dir'),
**model_args)

@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.veco)
@MODELS.register_module(
Tasks.sentiment_classification, module_name=Models.veco)
@MODELS.register_module(Tasks.nli, module_name=Models.veco)
class VecoForSequenceClassification(TorchModel,
VecoForSequenceClassificationTransform):
"""Veco sequence classification model.

Inherited from VecoForSequenceClassification and TorchModel, so this class can be registered into the model set.
This model cannot inherit from SequenceClassificationBase, because Veco/XlmRoberta's classification structure
is different.
"""

def __init__(self, config, model_dir):
super().__init__(model_dir)
VecoForSequenceClassificationTransform.__init__(self, config)

def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
**kwargs):
return VecoForSequenceClassificationTransform.forward(
self,
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
labels=labels)

@classmethod
def _instantiate(cls, **kwargs):
"""Instantiate the model.

@param kwargs: Input args.
model_dir: The model dir used to load the checkpoint and the label information.
num_labels: An optional arg to tell the model how many classes to initialize.
Method will call utils.parse_label_mapping if num_labels not supplied.
If num_labels is not found, the model will use the default setting (2 classes).
@return: The loaded model, which is initialized by veco.VecoForSequenceClassification.from_pretrained
"""

model_dir = kwargs.get('model_dir')
num_labels = kwargs.get('num_labels')
if num_labels is None:
label2id = parse_label_mapping(model_dir)
if label2id is not None and len(label2id) > 0:
num_labels = len(label2id)

model_args = {} if num_labels is None else {'num_labels': num_labels}
return super(VecoForSequenceClassificationTransform,
VecoForSequenceClassification).from_pretrained(
pretrained_model_name_or_path=kwargs.get('model_dir'),
model_dir=kwargs.get('model_dir'),
**model_args)

@MODELS.register_module(Tasks.sentence_similarity, module_name=Models.bert)
@MODELS.register_module(
Tasks.sentiment_classification, module_name=Models.bert)
@MODELS.register_module(Tasks.nli, module_name=Models.bert)
@MODELS.register_module(Tasks.text_classification, module_name=Models.bert)
class BertForSequenceClassification(SequenceClassificationBase,
BertPreTrainedModel):
"""Bert sequence classification model.

Inherited from SequenceClassificationBase.
"""
base_model_prefix: str = 'bert'
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r'position_ids']

def __init__(self, config, model_dir):
if hasattr(config, 'base_model_prefix'):
BertForSequenceClassification.base_model_prefix = config.base_model_prefix
super().__init__(config, model_dir)

def build_base_model(self):
from .bert import BertModel
return BertModel(self.config, add_pooling_layer=True)

def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs):
return super().forward(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
labels=labels,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict)

@classmethod
def _instantiate(cls, **kwargs):
"""Instantiate the model.

@param kwargs: Input args.
model_dir: The model dir used to load the checkpoint and the label information.
num_labels: An optional arg to tell the model how many classes to initialize.
Method will call utils.parse_label_mapping if num_labels not supplied.
If num_labels is not found, the model will use the default setting (2 classes).
@return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
"""

model_dir = kwargs.get('model_dir')
num_labels = kwargs.get('num_labels')
if num_labels is None:
label2id = parse_label_mapping(model_dir)
if label2id is not None and len(label2id) > 0:
num_labels = len(label2id)

model_args = {} if num_labels is None else {'num_labels': num_labels}
return super(BertPreTrainedModel,
BertForSequenceClassification).from_pretrained(
pretrained_model_name_or_path=kwargs.get('model_dir'),
model_dir=kwargs.get('model_dir'),
**model_args)
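Note: all three `_instantiate` methods in the deleted file resolve `num_labels` the same way: use the caller's value if given, otherwise fall back to the label mapping found in the model directory, otherwise leave it unset so `from_pretrained` keeps the config default of 2 classes. A sketch of that decision, with `read_label2id` as a hypothetical stand-in for `parse_label_mapping`:

from typing import Optional


def read_label2id(model_dir: str) -> Optional[dict]:
    # Hypothetical stand-in for modelscope.utils.hub.parse_label_mapping:
    # would return something like {'negative': 0, 'positive': 1} or None.
    ...


def resolve_model_args(model_dir: str, num_labels: Optional[int] = None) -> dict:
    if num_labels is None:
        label2id = read_label2id(model_dir)
        if label2id:
            num_labels = len(label2id)
    # Only pass num_labels through when something was found; otherwise the
    # config default is used by from_pretrained.
    return {} if num_labels is None else {'num_labels': num_labels}
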
@@ -1,20 +1,22 @@

# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .model import SpaceGenerator
from .model import SpaceModelBase, SpaceTokenizer, SpaceConfig
from .space_for_dialog_intent_prediction import SpaceForDialogIntent
from .space_for_dialog_modeling import SpaceForDialogModeling
from .space_for_dialog_state_tracking import SpaceForDialogStateTracking
from .model import SpaceModelBase, SpaceTokenizer
from .dialog_intent_prediction import SpaceForDialogIntent
from .dialog_modeling import SpaceForDialogModeling
from .dialog_state_tracking import SpaceForDST
from .configuration import SpaceConfig
else:
_import_structure = {
'model':
['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer', 'SpaceConfig'],
'space_for_dialog_intent_prediction': ['SpaceForDialogIntent'],
'space_for_dialog_modeling': ['SpaceForDialogModeling'],
'space_for_dialog_state_tracking': ['SpaceForDialogStateTracking'],
'model': ['SpaceGenerator', 'SpaceModelBase', 'SpaceTokenizer'],
'dialog_intent_prediction': ['SpaceForDialogIntent'],
'dialog_modeling': ['SpaceForDialogModeling'],
'dialog_state_tracking': ['SpaceForDST'],
'configuration': ['SpaceConfig']
}

import sys
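Note: the `_import_structure` mapping drives modelscope's `LazyImportModule`, which replaces the package module in `sys.modules` and only imports a submodule when one of its exported names is first accessed. A much-simplified illustrative sketch of that mechanism (not the real implementation):

import importlib
import types


class LazyModule(types.ModuleType):
    """Illustrative only: resolve attributes from submodules on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._import_structure = import_structure
        # Reverse map: exported name -> submodule that defines it.
        self._name_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):
        submodule = self._name_to_module.get(attr)
        if submodule is None:
            raise AttributeError(f'{self.__name__} has no attribute {attr}')
        module = importlib.import_module(f'.{submodule}', self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups are plain attribute hits
        return value
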
@@ -8,7 +8,7 @@ from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase
from modelscope.preprocessors.space import IntentBPETextField
from modelscope.preprocessors.nlp import IntentBPETextField
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks

@@ -24,6 +24,10 @@ class SpaceForDialogIntent(TorchModel):

Args:
model_dir (str): the model path.
text_field (`BPETextField`, *optional*, defaults to `IntentBPETextField`):
The text field.
config (`Config`, *optional*, defaults to config in model hub):
The config.
"""

super().__init__(model_dir, *args, **kwargs)
@@ -72,10 +76,21 @@ class SpaceForDialogIntent(TorchModel):
Example:
{
'pred': array([2.62349960e-03 4.12110658e-03 4.12748595e-05 3.77560973e-05
1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04
6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01
2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32)
1.08599677e-04 1.72710388e-05 2.95618793e-05 1.93638436e-04
6.45841064e-05 1.15997791e-04 5.11605394e-05 9.87020373e-01
2.66957268e-05 4.72324500e-05 9.74208378e-05], dtype=float32),
}
Example:
>>> from modelscope.hub.snapshot_download import snapshot_download
>>> from modelscope.models.nlp import SpaceForDialogIntent
>>> from modelscope.preprocessors import DialogIntentPredictionPreprocessor
>>> cache_path = snapshot_download('damo/nlp_space_dialog-intent-prediction')
>>> preprocessor = DialogIntentPredictionPreprocessor(model_dir=cache_path)
>>> model = SpaceForDialogIntent(
model_dir=cache_path,
text_field=preprocessor.text_field,
config=preprocessor.config)
>>> print(model(preprocessor("What do I need to do for the card activation?")))
"""
import numpy as np
pred = self.trainer.forward(input)
@@ -8,7 +8,7 @@ from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.models.nlp.space import SpaceGenerator, SpaceModelBase
from modelscope.preprocessors.space import MultiWOZBPETextField
from modelscope.preprocessors.nlp import MultiWOZBPETextField
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks

@@ -23,7 +23,12 @@ class SpaceForDialogModeling(TorchModel):
"""initialize the text generation model from the `model_dir` path.

Args:
model_dir (str): the model path.
model_dir (`str`):
The model path.
text_field (`BPETextField`, *optional*, defaults to `MultiWOZBPETextField`):
The text field.
config (`Config`, *optional*, defaults to config in model hub):
The config.
"""

super().__init__(model_dir, *args, **kwargs)
@@ -82,6 +87,19 @@ class SpaceForDialogModeling(TorchModel):
'aspn': array([47,8345,32,29,1983]),
'db': array([19, 24, 20]),
}
Examples:
>>> from modelscope.hub.snapshot_download import snapshot_download
>>> from modelscope.models.nlp import SpaceForDialogModeling
>>> from modelscope.preprocessors import DialogModelingPreprocessor
>>> cache_path = snapshot_download('damo/nlp_space_dialog-modeling')
>>> preprocessor = DialogModelingPreprocessor(model_dir=cache_path)
>>> model = SpaceForDialogModeling(model_dir=cache_path,
text_field=preprocessor.text_field,
config=preprocessor.config)
>>> print(model(preprocessor({
'user_input': 'i would like a taxi from saint john \'s college to pizza hut fen ditton .',
'history': {}
})))
"""

first_turn = input['first_turn']
@@ -1,6 +1,6 @@
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,14 +16,22 @@
# limitations under the License.
"""PyTorch Space model, mainly copied from :module:`~transformers.modeling_xlm_roberta`"""

from typing import Dict

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.file_utils import add_start_docstrings
from transformers.modeling_utils import PreTrainedModel

from modelscope.models.nlp.structbert.modeling_sbert import (
SbertForMaskedLM, SbertModel, SbertPreTrainedModel)
from .configuration_space import SpaceConfig
from modelscope.metainfo import Models
from modelscope.models import Model, TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.models.nlp.structbert import (SbertForMaskedLM, SbertModel,
SbertPreTrainedModel)
from modelscope.utils.constant import Tasks
from .configuration import SpaceConfig

SPACE_START_DOCSTRING = r"""

@@ -57,6 +65,63 @@ class SpaceModel(SbertModel):
config_class = SpaceConfig


class SpacePreTrainedModel(TorchModel, PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""

config_class = SpaceConfig
base_model_prefix = 'bert'
supports_gradient_checkpointing = True
_keys_to_ignore_on_load_missing = [r'position_ids']

def __init__(self, config, **kwargs):
super().__init__(config.name_or_path, **kwargs)
super(Model, self).__init__(config)

def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(
mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(
mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)

@classmethod
def _instantiate(cls, **kwargs):
"""Instantiate the model.

@param kwargs: Input args.
model_dir: The model dir used to load the checkpoint and the label information.
num_labels: An optional arg to tell the model how many classes to initialize.
Method will call utils.parse_label_mapping if num_labels is not input.
label2id: An optional label2id mapping, which will cover the label2id in configuration (if exists).

@return: The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained
"""

model_dir = kwargs.pop('model_dir', None)
if model_dir is None:
config = SpaceConfig(**kwargs)
model = cls(config)
else:
model_kwargs = {}
model = super(Model, cls).from_pretrained(
pretrained_model_name_or_path=model_dir, **model_kwargs)
return model

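Note: `_instantiate` above takes one of two paths depending on whether `model_dir` is supplied. A usage sketch for a subclass such as the `SpaceForDST` defined just below; the checkpoint path and config fields are illustrative values only, not required names:

# Path 1: no model_dir -> build a SpaceConfig from the kwargs and random-initialize.
fresh_model = SpaceForDST._instantiate(hidden_size=768, num_hidden_layers=12)

# Path 2: model_dir given -> delegate to from_pretrained and load the checkpoint.
loaded_model = SpaceForDST._instantiate(model_dir='/path/to/space_dst_checkpoint')
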
@add_start_docstrings(
"""
Space Model transformer with Dialog state tracking heads on top (an inform projection
@@ -65,7 +130,9 @@ class SpaceModel(SbertModel):
""",
SPACE_START_DOCSTRING,
)
class SpaceForDST(SbertPreTrainedModel):
@MODELS.register_module(
Tasks.task_oriented_conversation, module_name=Models.space_dst)
class SpaceForDST(SpacePreTrainedModel):

def __init__(self, config):
super(SpaceForDST, self).__init__(config)
@@ -113,18 +180,105 @@ class SpaceForDST(SbertPreTrainedModel):

self.init_weights()

def forward(self,
input_ids,
input_mask=None,
segment_ids=None,
position_ids=None,
head_mask=None,
start_pos=None,
end_pos=None,
inform_slot_id=None,
refer_id=None,
class_label_id=None,
diag_state=None):
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Tensor]): the preprocessed data

Returns:
Dict[str, Tensor]: results
Example:
{
'inputs': dict(input_ids, input_masks,start_pos), # tracking states
'outputs': dict(slots_logits),
'unique_ids': str(test-example.json-0), # default value
'input_ids_unmasked': array([101, 7632, 1010,0,0,0])
'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]),
'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]),
'prefix': str('final'), #default value
'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}])
}

Example:
>>> from modelscope.hub.snapshot_download import snapshot_download
>>> from modelscope.models.nlp import SpaceForDST
>>> from modelscope.preprocessors import DialogStateTrackingPreprocessor
>>> cache_path = snapshot_download('damo/nlp_space_dialog-state-tracking')
>>> model = SpaceForDST.from_pretrained(cache_path)
>>> preprocessor = DialogStateTrackingPreprocessor(model_dir=cache_path)
>>> print(model(preprocessor({
'utter': {
'User-1': "Hi, I'm looking for a train that is going"
"to cambridge and arriving there by 20:45, is there anything like that?"
},
'history_states': [{}]
})))
"""
import numpy as np
import torch

# self.model.eval() ????
batch = input['batch']

features = input['features']
diag_state = input['diag_state']
turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]]
reset_diag_state = np.where(np.array(turn_itrs) == '0')[0]
for slot in self.config.dst_slot_list:
for i in reset_diag_state:
diag_state[slot][i] = 0

with torch.no_grad():
inputs = {
'input_ids': batch[0],
'input_mask': batch[1],
'segment_ids': batch[2],
'start_pos': batch[3],
'end_pos': batch[4],
'inform_slot_id': batch[5],
'refer_id': batch[6],
'diag_state': diag_state,
'class_label_id': batch[8]
}
unique_ids = [features[i.item()].guid for i in batch[9]]
values = [features[i.item()].values for i in batch[9]]
input_ids_unmasked = [
features[i.item()].input_ids_unmasked for i in batch[9]
]
inform = [features[i.item()].inform for i in batch[9]]
outputs = self._forward(**inputs)

# Update dialog state for next turn.
for slot in self.config.dst_slot_list:
updates = outputs[2][slot].max(1)[1]
for i, u in enumerate(updates):
if u != 0:
diag_state[slot][i] = u

return {
'inputs': inputs,
'outputs': outputs,
'unique_ids': unique_ids,
'input_ids_unmasked': input_ids_unmasked,
'values': values,
'inform': inform,
'prefix': 'final',
'ds': input['ds']
}

def _forward(self,
input_ids,
input_mask=None,
segment_ids=None,
position_ids=None,
head_mask=None,
start_pos=None,
end_pos=None,
inform_slot_id=None,
refer_id=None,
class_label_id=None,
diag_state=None):
outputs = self.bert(
input_ids,
attention_mask=input_mask,
@@ -132,8 +286,8 @@ class SpaceForDST(SbertPreTrainedModel):
position_ids=position_ids,
head_mask=head_mask)

sequence_output = outputs[0]
pooled_output = outputs[1]
sequence_output = outputs.last_hidden_state
pooled_output = outputs.pooler_output

sequence_output = self.dropout(sequence_output)
pooled_output = self.dropout(pooled_output)
@@ -233,36 +387,6 @@ class SpaceForDST(SbertPreTrainedModel):
per_slot_start_logits,
per_slot_end_logits,
per_slot_refer_logits,
) + outputs[2:]
) + (outputs.embedding_output, )

return outputs

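Note: the update loop at the end of `forward` keeps the per-slot dialog state across turns: the class head's argmax is taken per slot, and the stored state is only overwritten when the predicted class is non-zero (i.e. not "none"). A small sketch of that rule on toy tensors; the slot name and sizes are made up:

import torch

# Two examples in the batch, per-slot class logits with 3 classes (0 == "none").
per_slot_class_logits = {
    'taxi-destination': torch.tensor([[2.0, 0.1, 0.3],   # predicts class 0 -> keep old state
                                      [0.1, 0.2, 3.0]]), # predicts class 2 -> overwrite
}
diag_state = {'taxi-destination': torch.tensor([1, 1])}

for slot, logits in per_slot_class_logits.items():
    updates = logits.max(1)[1]          # argmax class per example
    for i, u in enumerate(updates):
        if u != 0:                      # 0 means "none": leave the tracked value alone
            diag_state[slot][i] = u

print(diag_state)  # {'taxi-destination': tensor([1, 2])}
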
@add_start_docstrings(
'The Space Model with a `language modeling` head on top',
SPACE_START_DOCSTRING,
)
class SpaceForMaskedLM(SbertForMaskedLM):
"""
This class overrides [`SbertForMaskedLM`]. Please check the superclass for the
appropriate documentation alongside usage examples.
"""

config_class = SpaceConfig


@add_start_docstrings(
"""
Space Model with only one head on top as done during the pretraining: a `masked language modeling` head.
""",
SPACE_START_DOCSTRING,
)
class SpaceForPreTraining(SbertPreTrainedModel):

def __init__(self, model_name_or_path: str):
super(SpaceForPreTraining, self).__init__()
self.bert_model = SpaceForMaskedLM.from_pretrained(model_name_or_path)

def forward(self, input_ids: torch.tensor, mlm_labels: torch.tensor):
outputs = self.bert_model(input_ids, masked_lm_labels=mlm_labels)
return outputs[0]
@@ -1,10 +1,8 @@
from .configuration_space import SpaceConfig
# Copyright (c) Alibaba, Inc. and its affiliates.
from .gen_unified_transformer import GenUnifiedTransformer
from .generator import SpaceGenerator
from .intent_unified_transformer import IntentUnifiedTransformer
from .model_base import SpaceModelBase
from .modeling_space import (SpaceForDST, SpaceForMaskedLM,
SpaceForPreTraining, SpaceModel)
from .tokenization_space import (BasicTokenizer, SpaceTokenizer,
WordpieceTokenizer)
from .unified_transformer import UnifiedTransformer
@@ -71,14 +71,11 @@ class SpaceGenerator(object):
return

def __call__(self, step_fn, state):
"""
Running generation.
"""Running generation.

@param : step_fn : decoding one step
@type : function

@param : state : initial state
@type : dict
Args:
step_fn (`function`) : decoding one step
state(`dict`) : initial state
"""
raise NotImplementedError

@@ -104,11 +101,9 @@ class BeamSearch(SpaceGenerator):
"""
Running beam search.

@param : step_fn : decoding one step
@type : function

@param : state : initial state
@type : dict
Args:
step_fn(`function`) : decoding one step
state(`dict`) : initial state
"""
if prev_input is not None:

@@ -64,8 +64,8 @@ class SpaceModelBase(nn.Module):
"""
Forward process, include real forward, collect metrics and optimize(optional)

@params : inputs : input data
@type : dict of numpy.ndarray/int/float/...
Args:
inputs(`dict` of numpy.ndarray/int/float/...) : input data
"""
if is_training:
self.train()
@@ -85,11 +85,10 @@ class SpaceModelBase(nn.Module):
eos_id=None,
max_gen_len=None,
prev_input=None):
"""
Inference process.
"""Inference process.

@params : inputs : input data
@type : dict of numpy.ndarray/int/float/...
Args:
inputs(`dict` of numpy.ndarray/int/float/...) : input data
"""
self.eval()
results = self._infer(

@@ -1,5 +1,5 @@
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright 2021-2022 The Alibaba DAMO NLP Team Authors.
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

@@ -119,15 +119,12 @@ class UnifiedTransformer(SpaceModelBase):
input_mask,
append_head=False,
auto_regressive=False):
"""
Create attention mask.
"""Create attention mask.
from sequence to matrix:[batch_size, max_seq_len, 1] -> [batch_size, max_seq_len, max_seq_len]

@param : input_mask
@type : Variable(shape: [batch_size, max_seq_len])

@param : auto_regressive
@type : bool
Args:
input_mask (Variable(shape: [batch_size, max_seq_len]))
auto_regressive(bool)
"""
seq_len = input_mask.shape[1]

@@ -150,15 +147,12 @@ class UnifiedTransformer(SpaceModelBase):
return mask

def _join_mask(self, mask1, mask2):
"""
Merge source attention mask and target attention mask.
"""Merge source attention mask and target attention mask.
There are four parts:left upper (lu) / right upper (ru) / left below (lb) / right below (rb)

@param : mask1 : source attention mask
@type : Variable(shape: [batch_size, max_src_len, max_src_len])

@param : mask1 : target attention mask
@type : Variable(shape: [batch_size, max_tgt_len, max_tgt_len])
Args:
mask1(Variable(shape: [batch_size, max_src_len, max_src_len])) : source attention mask
mask2(Variable(shape: [batch_size, max_tgt_len, max_tgt_len])) : target attention mask
"""
batch_size = mask1.shape[0]
seq_len1 = mask1.shape[1]
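Note: `_create_mask` expands a per-token padding mask into a square attention matrix, and in auto-regressive mode additionally forbids attending to future positions; `_join_mask` then stitches the source and target blocks together. A hedged sketch of just the expansion step in PyTorch, independent of the actual implementation details:

import torch


def create_attention_mask(input_mask: torch.Tensor, auto_regressive: bool = False) -> torch.Tensor:
    """input_mask: [batch_size, seq_len] with 1 for real tokens, 0 for padding."""
    seq_len = input_mask.shape[1]
    # Outer product per example: position i may attend to position j only when
    # both tokens are real -> [batch_size, seq_len, seq_len].
    mask = input_mask.unsqueeze(2) * input_mask.unsqueeze(1)
    if auto_regressive:
        # Additionally forbid attending to future positions (lower-triangular).
        causal = torch.tril(torch.ones(seq_len, seq_len, dtype=input_mask.dtype))
        mask = mask * causal.unsqueeze(0)
    return mask


m = create_attention_mask(torch.tensor([[1, 1, 1, 0]]), auto_regressive=True)
# m[0] is a 4x4 matrix: causal within the first three tokens, all zeros for the pad.
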
@@ -30,18 +30,13 @@ class TransformerBlock(nn.Module):
return

def forward(self, inp, mask=None, cache=None):
"""
Forward process on one transformer layer.
"""Forward process on one transformer layer.

@param : x
@type : Variable(shape: [batch_size, seq_len, hidden_size])

@param : memory
@type : Variable(shape: [batch_size, seq_len, hidden_size])

@param : mask

@param : cache
Args:
x(Variable(shape: [batch_size, seq_len, hidden_size]))
memory(Variable(shape: [batch_size, seq_len, hidden_size]))
mask
cache
"""
attn_out = self.attn(inp, mask, cache)
attn_out = self.dropout_layer(attn_out)
@@ -1,101 +0,0 @@
from typing import Dict

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import Tasks

__all__ = ['SpaceForDialogStateTracking']


@MODELS.register_module(
Tasks.task_oriented_conversation, module_name=Models.space_dst)
class SpaceForDialogStateTracking(TorchModel):

def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the dialog state tracking model from the `model_dir` path.

Args:
model_dir (str): the model path.
"""

super().__init__(model_dir, *args, **kwargs)

from modelscope.models.nlp.space.model import SpaceForDST, SpaceConfig
self.model_dir = model_dir

self.config = SpaceConfig.from_pretrained(self.model_dir)
self.model = SpaceForDST.from_pretrained(self.model_dir)

def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""return the result by the model

Args:
input (Dict[str, Tensor]): the preprocessed data

Returns:
Dict[str, Tensor]: results
Example:
{
'inputs': dict(input_ids, input_masks,start_pos), # tracking states
'outputs': dict(slots_logits),
'unique_ids': str(test-example.json-0), # default value
'input_ids_unmasked': array([101, 7632, 1010,0,0,0])
'values': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]),
'inform': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}]),
'prefix': str('final'), #default value
'ds': array([{'taxi-leaveAt': 'none', 'taxi-destination': 'none'}])
}
"""
import numpy as np
import torch

self.model.eval()
batch = input['batch']

features = input['features']
diag_state = input['diag_state']
turn_itrs = [features[i.item()].guid.split('-')[2] for i in batch[9]]
reset_diag_state = np.where(np.array(turn_itrs) == '0')[0]
for slot in self.config.dst_slot_list:
for i in reset_diag_state:
diag_state[slot][i] = 0

with torch.no_grad():
inputs = {
'input_ids': batch[0],
'input_mask': batch[1],
'segment_ids': batch[2],
'start_pos': batch[3],
'end_pos': batch[4],
'inform_slot_id': batch[5],
'refer_id': batch[6],
'diag_state': diag_state,
'class_label_id': batch[8]
}
unique_ids = [features[i.item()].guid for i in batch[9]]
values = [features[i.item()].values for i in batch[9]]
input_ids_unmasked = [
features[i.item()].input_ids_unmasked for i in batch[9]
]
inform = [features[i.item()].inform for i in batch[9]]
outputs = self.model(**inputs)

# Update dialog state for next turn.
for slot in self.config.dst_slot_list:
updates = outputs[2][slot].max(1)[1]
for i, u in enumerate(updates):
if u != 0:
diag_state[slot][i] = u

return {
'inputs': inputs,
'outputs': outputs,
'unique_ids': unique_ids,
'input_ids_unmasked': input_ids_unmasked,
'values': values,
'inform': inform,
'prefix': 'final',
'ds': input['ds']
}
@@ -0,0 +1,21 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
from .table_question_answering import TableQuestionAnswering
else:
_import_structure = {
'table_question_answering': ['TableQuestionAnswering']
}

import sys

sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)
Some files were not shown because too many files have changed in this diff.