diff --git a/modelscope/msdatasets/context/dataset_context_config.py b/modelscope/msdatasets/context/dataset_context_config.py index 48124d78..bfe7dbd1 100644 --- a/modelscope/msdatasets/context/dataset_context_config.py +++ b/modelscope/msdatasets/context/dataset_context_config.py @@ -17,7 +17,7 @@ class DatasetContextConfig: data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], download_mode: DownloadMode, cache_root_dir: str, - use_streaming: bool, stream_batch_size: int, **kwargs): + use_streaming: bool, stream_batch_size: int, trust_remote_code: bool, **kwargs): self._download_config = None self._data_meta_config = None @@ -44,6 +44,7 @@ class DatasetContextConfig: self.use_streaming = use_streaming self.stream_batch_size = stream_batch_size self.download_virgo_files: bool = False + self.trust_remote_code: bool = trust_remote_code @property def config_kwargs(self) -> dict: diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py index f29acc8f..92074449 100644 --- a/modelscope/msdatasets/data_loader/data_loader.py +++ b/modelscope/msdatasets/data_loader/data_loader.py @@ -127,6 +127,7 @@ class OssDownloader(BaseDownloader): cache_dir = self.dataset_context_config.cache_root_dir download_mode = self.dataset_context_config.download_mode input_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code if self.builder is None and not dataset_py_script: raise f'meta-file: {dataset_name}.py not found on the modelscope hub.' 
@@ -141,7 +142,7 @@ class OssDownloader(BaseDownloader): data_files=data_files, cache_dir=cache_dir, download_mode=download_mode.value, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_kwargs) else: self.dataset = self.data_files_manager.fetch_data_files( diff --git a/modelscope/msdatasets/data_loader/data_loader_manager.py b/modelscope/msdatasets/data_loader/data_loader_manager.py index 0dec5d89..504f3da6 100644 --- a/modelscope/msdatasets/data_loader/data_loader_manager.py +++ b/modelscope/msdatasets/data_loader/data_loader_manager.py @@ -105,6 +105,7 @@ class RemoteDataLoaderManager(DataLoaderManager): download_mode_val = self.dataset_context_config.download_mode.value use_streaming = self.dataset_context_config.use_streaming input_config_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code # To use the huggingface data loader if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER: @@ -118,6 +119,7 @@ class RemoteDataLoaderManager(DataLoaderManager): download_mode=download_mode_val, streaming=use_streaming, ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_config_kwargs) # download statistics self.api.dataset_download_statistics( diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index b57a16ac..899142ad 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -168,6 +168,7 @@ class MsDataset: custom_cfg: Optional[Config] = Config(), token: Optional[str] = None, dataset_info_only: Optional[bool] = False, + trust_remote_code: Optional[bool] = True, **config_kwargs, ) -> Union[dict, 'MsDataset', NativeIterableDataset]: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. @@ -198,6 +199,7 @@ class MsDataset: see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3 token (str, Optional): SDK token of ModelScope. 
dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict). + trust_remote_code (bool, Optional): Whether to trust the dataset's remote code (e.g. its loading script) when loading it. Defaults to True. **config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: @@ -250,6 +252,7 @@ class MsDataset: cache_root_dir=cache_dir, use_streaming=use_streaming, stream_batch_size=stream_batch_size, + trust_remote_code=trust_remote_code, **config_kwargs) # Load from local disk @@ -275,6 +278,7 @@ class MsDataset: split=split, streaming=use_streaming, download_mode=download_mode.value, + trust_remote_code=trust_remote_code, **config_kwargs) # Load from the modelscope hub @@ -303,7 +307,7 @@ class MsDataset: token=token, streaming=use_streaming, dataset_info_only=dataset_info_only, - trust_remote_code=True, + trust_remote_code=trust_remote_code, **config_kwargs) as dataset_res: return dataset_res diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index 5b3a8bb7..3fb996ac 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -824,7 +824,7 @@ def get_module_with_script(self) -> DatasetModule: name=self.name, ) if not os.path.exists(importable_file_path): - trust_remote_code = resolve_trust_remote_code(self.trust_remote_code, self.name) + trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name) if trust_remote_code: _create_importable_file( local_path=local_script_path, @@ -884,7 +884,7 @@ class DatasetsWrapperHF: streaming: bool = False, num_proc: Optional[int] = None, storage_options: Optional[Dict] = None, - trust_remote_code: bool = None, + trust_remote_code: bool = True, dataset_info_only: Optional[bool] = False, **config_kwargs, ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset,