add hash verification into cache file existence check (#1116)

* add hash check into cache file existence check

Co-authored-by: Yingda Chen <yingda.chen@alibaba-inc.com>
This commit is contained in:
Yingda Chen
2024-12-02 19:29:11 +08:00
committed by GitHub
parent 0daf77d489
commit 1950ec1839
3 changed files with 24 additions and 11 deletions

View File

@@ -214,7 +214,7 @@ def _repo_file_download(
if repo_file['Path'] == file_path:
if cache.exists(repo_file):
logger.debug(
f'File {repo_file["Name"]} already in cache, skip downloading!'
f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!'
)
return cache.get_file_by_info(repo_file)
else:
@@ -251,7 +251,7 @@ def _repo_file_download(
if repo_file['Path'] == file_path:
if cache.exists(repo_file):
logger.debug(
f'File {repo_file["Name"]} already in cache, skip downloading!'
f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!'
)
return cache.get_file_by_info(repo_file)
else:

View File

@@ -468,7 +468,8 @@ def _download_file_lists(
if cache.exists(repo_file):
file_name = os.path.basename(repo_file['Name'])
logger.debug(
f'File {file_name} already in cache, skip downloading!')
f'File {file_name} already in cache with identical hash, skip downloading!'
)
continue
except Exception as e:
logger.warning('The file pattern is invalid : %s' % e)

View File

@@ -7,6 +7,8 @@ import tempfile
from shutil import move, rmtree
from typing import Dict
from modelscope.hub.constants import FILE_HASH
from modelscope.hub.utils.utils import compute_hash
from modelscope.utils.logger import get_logger
logger = get_logger()
@@ -252,26 +254,36 @@ class ModelFileSystemCache(FileSystemCache):
return cache_key
def exists(self, model_file_info):
"""Check the file is cached or not.
"""Check the file is cached or not. Note existence check will also cover digest check
Args:
model_file_info (CachedFileInfo): The cached file info
Returns:
bool: If exists return True otherwise False
bool: If exists and has the same hash, return True otherwise False
"""
key = self.__get_cache_key(model_file_info)
is_exists = False
file_path = key['Path']
cache_file_path = os.path.join(self.cache_root_location,
model_file_info['Path'])
for cached_key in self.cached_files:
if cached_key['Path'] == key['Path'] and (
if cached_key['Path'] == file_path and (
cached_key['Revision'].startswith(key['Revision'])
or key['Revision'].startswith(cached_key['Revision'])):
is_exists = True
break
file_path = os.path.join(self.cache_root_location,
model_file_info['Path'])
expected_hash = model_file_info[FILE_HASH]
if expected_hash is not None and os.path.exists(
cache_file_path):
cache_file_sha256 = compute_hash(cache_file_path)
if expected_hash == cache_file_sha256:
is_exists = True
break
else:
logger.info(
f'File [{file_path}] exists in cache but with a mismatched hash, will re-download.'
)
if is_exists:
if os.path.exists(file_path):
if os.path.exists(cache_file_path):
return True
else:
self.remove_key(