From 1950ec1839831bbff0626d1a20b8cfce51a1d965 Mon Sep 17 00:00:00 2001 From: Yingda Chen Date: Mon, 2 Dec 2024 19:29:11 +0800 Subject: [PATCH] add hash verification into cache file existence check (#1116) * add hash check into cache file existence check Co-authored-by: Yingda Chen --- modelscope/hub/file_download.py | 4 ++-- modelscope/hub/snapshot_download.py | 3 ++- modelscope/hub/utils/caching.py | 28 ++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index a6d7c2e2..40ac8a03 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -214,7 +214,7 @@ def _repo_file_download( if repo_file['Path'] == file_path: if cache.exists(repo_file): logger.debug( - f'File {repo_file["Name"]} already in cache, skip downloading!' + f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!' ) return cache.get_file_by_info(repo_file) else: @@ -251,7 +251,7 @@ def _repo_file_download( if repo_file['Path'] == file_path: if cache.exists(repo_file): logger.debug( - f'File {repo_file["Name"]} already in cache, skip downloading!' + f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!' ) return cache.get_file_by_info(repo_file) else: diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index f28c18e0..015cadbd 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -468,7 +468,8 @@ def _download_file_lists( if cache.exists(repo_file): file_name = os.path.basename(repo_file['Name']) logger.debug( - f'File {file_name} already in cache, skip downloading!') + f'File {file_name} already in cache with identical hash, skip downloading!' 
+ ) continue except Exception as e: logger.warning('The file pattern is invalid : %s' % e) diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py index ecdec7cc..675d62a8 100644 --- a/modelscope/hub/utils/caching.py +++ b/modelscope/hub/utils/caching.py @@ -7,6 +7,8 @@ import tempfile from shutil import move, rmtree from typing import Dict +from modelscope.hub.constants import FILE_HASH +from modelscope.hub.utils.utils import compute_hash from modelscope.utils.logger import get_logger logger = get_logger() @@ -252,26 +254,36 @@ class ModelFileSystemCache(FileSystemCache): return cache_key def exists(self, model_file_info): - """Check the file is cached or not. + """Check the file is cached or not. Note existence check will also cover digest check Args: model_file_info (CachedFileInfo): The cached file info Returns: - bool: If exists return True otherwise False + bool: If exists and has the same hash, return True otherwise False """ key = self.__get_cache_key(model_file_info) is_exists = False + file_path = key['Path'] + cache_file_path = os.path.join(self.cache_root_location, + model_file_info['Path']) for cached_key in self.cached_files: - if cached_key['Path'] == key['Path'] and ( + if cached_key['Path'] == file_path and ( cached_key['Revision'].startswith(key['Revision']) or key['Revision'].startswith(cached_key['Revision'])): - is_exists = True - break - file_path = os.path.join(self.cache_root_location, - model_file_info['Path']) + expected_hash = model_file_info[FILE_HASH] + if expected_hash is not None and os.path.exists( + cache_file_path): + cache_file_sha256 = compute_hash(cache_file_path) + if expected_hash == cache_file_sha256: + is_exists = True + break + else: + logger.info( + f'File [{file_path}] exists in cache but with a mismatched hash, will re-download.' + ) if is_exists: - if os.path.exists(file_path): + if os.path.exists(cache_file_path): return True else: self.remove_key(