Fix trust_remote_code (#1016)

* add a command-line option for clear-cache

* fix typo

* fix trust_remote_code for old datasets and py-scripts

---------

Co-authored-by: Yingda Chen <yingda.chen@alibaba-inc.com>
This commit is contained in:
Xingjun.Wang
2024-10-12 22:00:30 +08:00
committed by GitHub
parent b0f03ffd6d
commit 90acaccc28
5 changed files with 13 additions and 5 deletions

View File

@@ -17,7 +17,7 @@ class DatasetContextConfig:
data_files: Union[str, Sequence[str],
Mapping[str, Union[str, Sequence[str]]]],
download_mode: DownloadMode, cache_root_dir: str,
use_streaming: bool, stream_batch_size: int, **kwargs):
use_streaming: bool, stream_batch_size: int, trust_remote_code: bool, **kwargs):
self._download_config = None
self._data_meta_config = None
@@ -44,6 +44,7 @@ class DatasetContextConfig:
self.use_streaming = use_streaming
self.stream_batch_size = stream_batch_size
self.download_virgo_files: bool = False
self.trust_remote_code: bool = trust_remote_code
@property
def config_kwargs(self) -> dict:

View File

@@ -127,6 +127,7 @@ class OssDownloader(BaseDownloader):
cache_dir = self.dataset_context_config.cache_root_dir
download_mode = self.dataset_context_config.download_mode
input_kwargs = self.dataset_context_config.config_kwargs
trust_remote_code = self.dataset_context_config.trust_remote_code
if self.builder is None and not dataset_py_script:
raise f'meta-file: {dataset_name}.py not found on the modelscope hub.'
@@ -141,7 +142,7 @@ class OssDownloader(BaseDownloader):
data_files=data_files,
cache_dir=cache_dir,
download_mode=download_mode.value,
ignore_verifications=True,
trust_remote_code=trust_remote_code,
**input_kwargs)
else:
self.dataset = self.data_files_manager.fetch_data_files(

View File

@@ -105,6 +105,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
download_mode_val = self.dataset_context_config.download_mode.value
use_streaming = self.dataset_context_config.use_streaming
input_config_kwargs = self.dataset_context_config.config_kwargs
trust_remote_code = self.dataset_context_config.trust_remote_code
# To use the huggingface data loader
if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER:
@@ -118,6 +119,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
download_mode=download_mode_val,
streaming=use_streaming,
ignore_verifications=True,
trust_remote_code=trust_remote_code,
**input_config_kwargs)
# download statistics
self.api.dataset_download_statistics(

View File

@@ -168,6 +168,7 @@ class MsDataset:
custom_cfg: Optional[Config] = Config(),
token: Optional[str] = None,
dataset_info_only: Optional[bool] = False,
trust_remote_code: Optional[bool] = True,
**config_kwargs,
) -> Union[dict, 'MsDataset', NativeIterableDataset]:
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
@@ -198,6 +199,7 @@ class MsDataset:
see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
token (str, Optional): SDK token of ModelScope.
dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
trust_remote_code (bool, Optional): If set to True, trust the remote code.
**config_kwargs (additional keyword arguments): Keyword arguments to be passed
Returns:
@@ -250,6 +252,7 @@ class MsDataset:
cache_root_dir=cache_dir,
use_streaming=use_streaming,
stream_batch_size=stream_batch_size,
trust_remote_code=trust_remote_code,
**config_kwargs)
# Load from local disk
@@ -275,6 +278,7 @@ class MsDataset:
split=split,
streaming=use_streaming,
download_mode=download_mode.value,
trust_remote_code=trust_remote_code,
**config_kwargs)
# Load from the modelscope hub
@@ -303,7 +307,7 @@ class MsDataset:
token=token,
streaming=use_streaming,
dataset_info_only=dataset_info_only,
trust_remote_code=True,
trust_remote_code=trust_remote_code,
**config_kwargs) as dataset_res:
return dataset_res

View File

@@ -824,7 +824,7 @@ def get_module_with_script(self) -> DatasetModule:
name=self.name,
)
if not os.path.exists(importable_file_path):
trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
if trust_remote_code:
_create_importable_file(
local_path=local_script_path,
@@ -884,7 +884,7 @@ class DatasetsWrapperHF:
streaming: bool = False,
num_proc: Optional[int] = None,
storage_options: Optional[Dict] = None,
trust_remote_code: bool = None,
trust_remote_code: bool = True,
dataset_info_only: Optional[bool] = False,
**config_kwargs,
) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset,