diff --git a/modelscope/msdatasets/utils/hf_file_utils.py b/modelscope/msdatasets/utils/hf_file_utils.py index ba7edccf..4abd0409 100644 --- a/modelscope/msdatasets/utils/hf_file_utils.py +++ b/modelscope/msdatasets/utils/hf_file_utils.py @@ -370,98 +370,3 @@ def get_from_cache_ms( json.dump(meta, meta_file) return cache_path - - -# def get_from_cache_ms( -# url, -# cache_dir=None, -# force_download=False, -# user_agent=None, -# use_etag=True, -# token=None, -# storage_options=None, -# download_desc=None, -# disable_tqdm=False, -# ) -> str: -# """ -# Given a URL, look for the corresponding file in the local cache. -# If it's not there, download it. Then return the path to the cached file. -# -# Return: -# Local path (string) -# -# Raises: -# FileNotFoundError: in case of non-recoverable file -# (non-existent or no cache on disk) -# ConnectionError: in case of unreachable url -# and no cache on disk -# """ -# if storage_options is None: -# storage_options = {} -# if cache_dir is None: -# cache_dir = MS_DATASETS_CACHE -# if isinstance(cache_dir, Path): -# cache_dir = str(cache_dir) -# -# os.makedirs(cache_dir, exist_ok=True) -# -# response = None -# etag = None -# -# # Try a first time to file the file on the local file system without eTag (None) -# # if we don't ask for 'force_download' then we spare a request -# filename = hash_url_to_filename(url, etag=None) -# cache_path = os.path.join(cache_dir, filename) -# -# if os.path.exists(cache_path) and not force_download and not use_etag: -# return cache_path -# -# # Prepare headers for authentication -# headers = get_authentication_headers_for_url(url, token=token) -# if user_agent is not None: -# headers['user-agent'] = user_agent -# -# response = fsspec_head(url, storage_options=storage_options) -# etag = (response.get('ETag', None) or response.get('etag', None)) if use_etag else None -# -# # Try a second time -# filename = hash_url_to_filename(url, etag) -# cache_path = os.path.join(cache_dir, filename) -# -# if os.path.exists(cache_path) and not force_download: -# return cache_path -# -# # Prevent parallel downloads of the same file with a lock. -# lock_path = cache_path + '.lock' -# with FileLock(lock_path): -# # Retry in case previously locked processes just enter after the precedent process releases the lock -# if os.path.exists(cache_path) and not force_download: -# return cache_path -# -# incomplete_path = cache_path + '.incomplete' -# -# @contextmanager -# def temp_file_manager(mode='w+b'): -# with open(incomplete_path, mode) as f: -# yield f -# -# # Download to temporary file, then copy to cache path once finished. -# # Otherwise, you get corrupt cache entries if the download gets interrupted. -# with temp_file_manager() as temp_file: -# logger.info(f'{url} not found in cache or force_download set to True, downloading to {temp_file.name}') -# # GET file object -# fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc, disable_tqdm=disable_tqdm) -# -# logger.info(f'storing {url} in cache at {cache_path}') -# shutil.move(temp_file.name, cache_path) -# umask = os.umask(0o666) -# os.umask(umask) -# os.chmod(cache_path, 0o666 & ~umask) -# -# logger.info(f'creating metadata file for {cache_path}') -# meta = {'url': url, 'etag': etag} -# meta_path = cache_path + '.json' -# with open(meta_path, 'w', encoding='utf-8') as meta_file: -# json.dump(meta, meta_file) -# -# return cache_path