"""Find, and optionally delete, versioned images that no markdown file references.

Images are expected to live under ``<img-dir>/<version>/...`` where
``<version>`` consists only of digits and dots (e.g. ``0.29.0``).
"""

import argparse
import os
import re
import subprocess

# Markdown inline image ``![alt](target)``; captures the raw target text.
_IMAGE_PATTERN = re.compile(r"!\[.*?\]\((.*?)\)")

# Optional quoted title that may follow the path: ![alt](path "Title").
_TITLE_SUFFIX = re.compile(r"""\s+(?:"[^"]*"|'[^']*')$""")

# Characters that are special in a POSIX basic regular expression (grep's
# default syntax). "]" is deliberately absent: once "[" is escaped it is literal.
_BRE_SPECIALS = re.compile(r"[.\[*^$\\]")

_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".svg")


def _is_version_dir(name):
    """Return True if *name* is made up only of digits and dots (e.g. "0.29.0")."""
    return name.replace(".", "").isdigit()


def _escape_bre(text):
    """Backslash-escape basic-regex metacharacters so grep matches *text* literally."""
    return _BRE_SPECIALS.sub(lambda m: "\\" + m.group(0), text)


def find_image_references(md_file):
    """Find all versioned image references in a markdown file.

    Args:
        md_file: Path to a UTF-8 encoded markdown file.

    Returns:
        A list of image paths relative to the image directory, normalized
        with ``os.path.normpath`` so they compare equal to the paths produced
        by ``find_all_images``. Only paths whose first component is a version
        directory are returned.
    """
    with open(md_file, "r", encoding="utf-8") as f:
        content = f.read()

    cleaned_paths = []
    for target in _IMAGE_PATTERN.findall(content):
        # Drop an optional quoted title; otherwise it would stay glued to the
        # path and the reference would never match a real file.
        target = _TITLE_SUFFIX.sub("", target.strip())

        path = target.lstrip("/")
        if "img/" in path:
            # Keep only what follows the first "img/" segment.
            path = path[path.index("img/") + 4:]

        # Only keep references to versioned images.
        normalized = os.path.normpath(path)
        parts = normalized.split(os.sep)
        if len(parts) >= 2 and _is_version_dir(parts[0]):
            cleaned_paths.append(normalized)

    return cleaned_paths


def scan_markdown_files(docs_dir):
    """Recursively collect the paths of all ``.md`` files under *docs_dir*."""
    return [
        os.path.join(root, name)
        for root, _, files in os.walk(docs_dir)
        for name in files
        if name.endswith(".md")
    ]


def find_all_images(img_dir):
    """Find all image files located in version subdirectories of *img_dir*.

    Args:
        img_dir: Root image directory containing one subdirectory per version.

    Returns:
        A list of image paths relative to *img_dir*. Files directly in
        *img_dir*, and files under non-version top-level directories, are
        skipped.
    """
    image_files = []
    for root, _, files in os.walk(img_dir):
        rel_dir = os.path.relpath(root, img_dir)

        # Files sitting directly in the image root are not versioned.
        if rel_dir == ".":
            continue

        # The top-level directory below img_dir must be a version number.
        if not _is_version_dir(rel_dir.split(os.sep)[0]):
            continue

        for name in files:
            if name.lower().endswith(_IMAGE_EXTENSIONS):
                image_files.append(
                    os.path.relpath(os.path.join(root, name), img_dir)
                )

    return image_files


def grep_check_image(docs_dir, image_path):
    """Check whether a versioned image is referenced anywhere using grep.

    A reference is any line under *docs_dir* that contains the version
    directory name followed, later on the same line, by the file name.

    Args:
        docs_dir: Directory that is searched recursively.
        image_path: Image path relative to the image directory,
            e.g. ``0.29.0/world-state-suggestions-2.png``.

    Returns:
        True if a reference is found, or if grep could not be run (the image
        is then kept, because it cannot be proven unused); False otherwise.
    """
    parts = os.path.normpath(image_path).split(os.sep)
    version = parts[0]  # e.g. "0.29.0"
    filename = parts[-1]  # e.g. "world-state-suggestions-2.png"

    # Require both version and filename to match. Both are escaped so that
    # "." only matches a literal dot instead of any character.
    version_pattern = f"{_escape_bre(version)}.*{_escape_bre(filename)}"

    try:
        result = subprocess.run(
            ["grep", "-r", "-l", "-e", version_pattern, "--", docs_dir],
            capture_output=True,
            text=True,
        )
    except (OSError, UnicodeError) as e:
        # grep is missing or its output could not be decoded. Err on the side
        # of keeping the image rather than reporting it as unused.
        print(f"Error during grep check for {image_path}: {e}")
        print(
            f"Treating {image_path} as referenced; "
            "use --skip-grep to bypass this validation"
        )
        return True

    if result.stdout.strip():
        print(
            f"Found reference to {image_path} with version pattern: {version_pattern}"
        )
        return True

    return False


def main():
    """Parse command-line arguments, report unused versioned images, optionally delete them."""
    parser = argparse.ArgumentParser(
        description="Find and optionally delete unused versioned images in MkDocs project"
    )
    parser.add_argument(
        "--docs-dir", type=str, required=True, help="Path to the docs directory"
    )
    parser.add_argument(
        "--img-dir", type=str, required=True, help="Path to the images directory"
    )
    parser.add_argument("--delete", action="store_true", help="Delete unused images")
    parser.add_argument(
        "--verbose", action="store_true", help="Show all found references and files"
    )
    parser.add_argument(
        "--skip-grep", action="store_true", help="Skip the additional grep validation"
    )

    args = parser.parse_args()

    # Convert paths to absolute paths
    docs_dir = os.path.abspath(args.docs_dir)
    img_dir = os.path.abspath(args.img_dir)

    print(f"Scanning markdown files in: {docs_dir}")
    print(f"Looking for versioned images in: {img_dir}")

    # Get all markdown files
    md_files = scan_markdown_files(docs_dir)
    print(f"Found {len(md_files)} markdown files")

    # Collect all image references
    used_images = set()
    for md_file in md_files:
        used_images.update(find_image_references(md_file))

    # Get all actual images (only from version directories)
    all_images = set(find_all_images(img_dir))

    if args.verbose:
        print("\nAll versioned image references found in markdown:")
        for img in sorted(used_images):
            print(f"- {img}")

        print("\nAll versioned images in directory:")
        for img in sorted(all_images):
            print(f"- {img}")

    # Find potentially unused images
    unused_images = all_images - used_images

    # Additional grep validation if not skipped
    if not args.skip_grep and unused_images:
        print("\nPerforming additional grep validation...")
        actually_unused = {
            img for img in unused_images if not grep_check_image(docs_dir, img)
        }

        if len(actually_unused) != len(unused_images):
            print(
                f"\nGrep validation found {len(unused_images) - len(actually_unused)} additional image references!"
            )
        unused_images = actually_unused

    # Report findings
    print("\nResults:")
    print(f"Total versioned images found: {len(all_images)}")
    print(f"Versioned images referenced in markdown: {len(used_images)}")
    print(f"Unused versioned images: {len(unused_images)}")

    if unused_images:
        print("\nUnused versioned images:")
        for img in sorted(unused_images):
            print(f"- {img}")

        if args.delete:
            print("\nDeleting unused versioned images...")
            # Sorted so the deletion log is deterministic.
            for img in sorted(unused_images):
                full_path = os.path.join(img_dir, img)
                try:
                    os.remove(full_path)
                    print(f"Deleted: {img}")
                except OSError as e:
                    print(f"Error deleting {img}: {e}")
            print("\nDeletion complete")
    else:
        print("\nNo unused versioned images found!")


if __name__ == "__main__":
    main()