mirror of
https://github.com/modelscope/modelscope.git
synced 2025-12-24 20:19:22 +01:00
Fix trust_remote_code (#1016)
* add cmd line option of clear-cache
* fix typo
* fix trust_remote_code for old dataset and py-script

---------

Co-authored-by: Yingda Chen <yingda.chen@alibaba-inc.com>
This commit is contained in:
@@ -17,7 +17,7 @@ class DatasetContextConfig:
|
||||
data_files: Union[str, Sequence[str],
|
||||
Mapping[str, Union[str, Sequence[str]]]],
|
||||
download_mode: DownloadMode, cache_root_dir: str,
|
||||
use_streaming: bool, stream_batch_size: int, **kwargs):
|
||||
use_streaming: bool, stream_batch_size: int, trust_remote_code: bool, **kwargs):
|
||||
|
||||
self._download_config = None
|
||||
self._data_meta_config = None
|
||||
@@ -44,6 +44,7 @@ class DatasetContextConfig:
|
||||
self.use_streaming = use_streaming
|
||||
self.stream_batch_size = stream_batch_size
|
||||
self.download_virgo_files: bool = False
|
||||
self.trust_remote_code: bool = trust_remote_code
|
||||
|
||||
@property
|
||||
def config_kwargs(self) -> dict:
|
||||
|
||||
@@ -127,6 +127,7 @@ class OssDownloader(BaseDownloader):
|
||||
cache_dir = self.dataset_context_config.cache_root_dir
|
||||
download_mode = self.dataset_context_config.download_mode
|
||||
input_kwargs = self.dataset_context_config.config_kwargs
|
||||
trust_remote_code = self.dataset_context_config.trust_remote_code
|
||||
|
||||
if self.builder is None and not dataset_py_script:
|
||||
raise f'meta-file: {dataset_name}.py not found on the modelscope hub.'
|
||||
@@ -141,7 +142,7 @@ class OssDownloader(BaseDownloader):
|
||||
data_files=data_files,
|
||||
cache_dir=cache_dir,
|
||||
download_mode=download_mode.value,
|
||||
ignore_verifications=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**input_kwargs)
|
||||
else:
|
||||
self.dataset = self.data_files_manager.fetch_data_files(
|
||||
|
||||
@@ -105,6 +105,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
|
||||
download_mode_val = self.dataset_context_config.download_mode.value
|
||||
use_streaming = self.dataset_context_config.use_streaming
|
||||
input_config_kwargs = self.dataset_context_config.config_kwargs
|
||||
trust_remote_code = self.dataset_context_config.trust_remote_code
|
||||
|
||||
# To use the huggingface data loader
|
||||
if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER:
|
||||
@@ -118,6 +119,7 @@ class RemoteDataLoaderManager(DataLoaderManager):
|
||||
download_mode=download_mode_val,
|
||||
streaming=use_streaming,
|
||||
ignore_verifications=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**input_config_kwargs)
|
||||
# download statistics
|
||||
self.api.dataset_download_statistics(
|
||||
|
||||
@@ -168,6 +168,7 @@ class MsDataset:
|
||||
custom_cfg: Optional[Config] = Config(),
|
||||
token: Optional[str] = None,
|
||||
dataset_info_only: Optional[bool] = False,
|
||||
trust_remote_code: Optional[bool] = True,
|
||||
**config_kwargs,
|
||||
) -> Union[dict, 'MsDataset', NativeIterableDataset]:
|
||||
"""Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset.
|
||||
@@ -198,6 +199,7 @@ class MsDataset:
|
||||
see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3
|
||||
token (str, Optional): SDK token of ModelScope.
|
||||
dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict).
|
||||
trust_remote_code (bool, Optional): If set to True, trust the remote code.
|
||||
**config_kwargs (additional keyword arguments): Keyword arguments to be passed
|
||||
|
||||
Returns:
|
||||
@@ -250,6 +252,7 @@ class MsDataset:
|
||||
cache_root_dir=cache_dir,
|
||||
use_streaming=use_streaming,
|
||||
stream_batch_size=stream_batch_size,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**config_kwargs)
|
||||
|
||||
# Load from local disk
|
||||
@@ -275,6 +278,7 @@ class MsDataset:
|
||||
split=split,
|
||||
streaming=use_streaming,
|
||||
download_mode=download_mode.value,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**config_kwargs)
|
||||
|
||||
# Load from the modelscope hub
|
||||
@@ -303,7 +307,7 @@ class MsDataset:
|
||||
token=token,
|
||||
streaming=use_streaming,
|
||||
dataset_info_only=dataset_info_only,
|
||||
trust_remote_code=True,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**config_kwargs) as dataset_res:
|
||||
|
||||
return dataset_res
|
||||
|
||||
@@ -824,7 +824,7 @@ def get_module_with_script(self) -> DatasetModule:
|
||||
name=self.name,
|
||||
)
|
||||
if not os.path.exists(importable_file_path):
|
||||
trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name)
|
||||
trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name)
|
||||
if trust_remote_code:
|
||||
_create_importable_file(
|
||||
local_path=local_script_path,
|
||||
@@ -884,7 +884,7 @@ class DatasetsWrapperHF:
|
||||
streaming: bool = False,
|
||||
num_proc: Optional[int] = None,
|
||||
storage_options: Optional[Dict] = None,
|
||||
trust_remote_code: bool = None,
|
||||
trust_remote_code: bool = True,
|
||||
dataset_info_only: Optional[bool] = False,
|
||||
**config_kwargs,
|
||||
) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset,
|
||||
|
||||
Reference in New Issue
Block a user