Files
talemate/docs/cleanup.py
veguAI fb2fa31f13 linting
* precommit

* linting

* add linting to workflow

* ruff.toml added
2025-06-29 19:51:08 +03:00

186 lines
6.0 KiB
Python

import os
import re
import subprocess
import argparse
def find_image_references(md_file):
    """Find all versioned image references in a markdown file.

    Scans ``md_file`` for Markdown image syntax (``![alt](target)``) and keeps
    only the targets that point below an ``img/`` directory and whose first
    path component after ``img/`` looks like a version number (digits and
    dots only, e.g. ``0.29.0``).

    Args:
        md_file: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        A list of paths relative to the ``img/`` directory, one entry per
        matching reference (duplicates are kept). Paths are normalized with
        ``os.path.normpath`` so they use the platform separator and can be
        compared directly with paths produced by ``os.path.relpath``.
    """
    with open(md_file, "r", encoding="utf-8") as f:
        content = f.read()

    pattern = r"!\[.*?\]\((.*?)\)"
    cleaned_paths = []
    for match in re.findall(pattern, content):
        # Drop an optional quoted link title: ![alt](path "Title").
        # Only a *quoted* trailing token is removed so that targets which
        # merely contain spaces are left alone.
        target = re.sub(r'\s+(["\']).*?\1\s*$', "", match).strip()
        # Drop URL fragment/query (e.g. "#only-light"); they are not part of
        # the file name on disk.
        target = target.split("#", 1)[0].split("?", 1)[0]

        path = target.lstrip("/")
        if "img/" not in path:
            continue
        # Keep only what follows the first "img/" (4 == len("img/")).
        path = path[path.index("img/") + 4 :]

        # Only keep references to versioned images
        normalized = os.path.normpath(path)
        parts = normalized.split(os.sep)
        if len(parts) >= 2 and parts[0].replace(".", "").isdigit():
            cleaned_paths.append(normalized)

    return cleaned_paths
def scan_markdown_files(docs_dir):
    """Recursively collect all markdown files below the docs directory.

    Args:
        docs_dir: Directory to walk.

    Returns:
        A list of paths (``docs_dir`` joined with the relative location) for
        every file whose name ends in ``.md``, compared case-insensitively so
        that e.g. ``README.MD`` is included. Directories and files are visited
        in sorted order, so the result does not depend on filesystem
        enumeration order. A missing directory yields an empty list.
    """
    md_files = []
    for root, dirs, files in os.walk(docs_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        md_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.lower().endswith(".md")
        )
    return md_files
def find_all_images(img_dir, extensions=(".png", ".jpg", ".jpeg", ".gif", ".svg")):
    """Find all image files in version subdirectories.

    Only files located (at any depth) below a *top-level* subdirectory of
    ``img_dir`` whose name consists solely of digits and dots (e.g.
    ``0.29.0``) are reported. Files directly in ``img_dir`` and files below
    non-version directories are ignored.

    Args:
        img_dir: Root images directory to walk.
        extensions: Iterable of file-name suffixes (matched
            case-insensitively) that count as images. Defaults to the
            common raster/vector formats used by the docs.

    Returns:
        A list of image paths relative to ``img_dir``.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    image_files = []
    for root, dirs, files in os.walk(img_dir):
        # Get the relative path from img_dir to current directory
        rel_dir = os.path.relpath(root, img_dir)
        if rel_dir == ".":
            # Files in the root img directory are never versioned. Prune the
            # walk here so non-version trees are not traversed at all; every
            # directory visited afterwards lives under a version directory.
            dirs[:] = [d for d in dirs if d.replace(".", "").isdigit()]
            continue
        for file in files:
            if file.lower().endswith(suffixes):
                rel_path = os.path.relpath(os.path.join(root, file), img_dir)
                image_files.append(rel_path)
    return image_files
def grep_check_image(docs_dir, image_path):
    """
    Check if versioned image is referenced anywhere using grep.

    Runs ``grep -r -l`` over ``docs_dir`` looking for a line that contains
    the image's version directory followed (anywhere later on the same line)
    by its file name. Both parts are matched literally: characters that are
    special in grep's basic regular expressions are escaped first.

    Args:
        docs_dir: Directory that grep searches recursively.
        image_path: Image path relative to the images directory, e.g.
            ``0.29.0/world-state-suggestions-2.png``.

    Returns:
        True if any reference is found, False otherwise. False is also
        returned (after printing a message) when grep cannot be executed or
        its output cannot be decoded, keeping the check best-effort.
    """

    def _literal(text):
        # Escape the characters that are special in a POSIX basic regex so
        # that e.g. "." in "0.29.0" does not match an arbitrary character
        # and "[" in a file name does not open a bracket expression.
        return re.sub(r"[.\[\\*^$]", lambda m: "\\" + m.group(0), text)

    # Split the image path to get version and filename
    parts = os.path.normpath(image_path).split(os.sep)
    version = parts[0]  # e.g., "0.29.0"
    filename = parts[-1]  # e.g., "world-state-suggestions-2.png"

    # For versioned images, require both version and filename to match
    version_pattern = f"{_literal(version)}.*{_literal(filename)}"

    try:
        result = subprocess.run(
            # "-e" / "--" keep the pattern and the directory from ever being
            # parsed as options.
            ["grep", "-r", "-l", "-e", version_pattern, "--", docs_dir],
            capture_output=True,
            text=True,
        )
    except (OSError, UnicodeError) as e:
        # grep missing / not executable, or undecodable output.
        print(f"Error during grep check for {image_path}: {e}")
        return False

    # grep exits 0 on match and 1 on no match; anything higher is an error
    # (e.g. unreadable files). Report it, but still honour any matches that
    # were printed before the error.
    if result.returncode > 1:
        print(f"Error during grep check for {image_path}: {result.stderr.strip()}")

    if result.stdout.strip():
        print(
            f"Found reference to {image_path} with version pattern: {version_pattern}"
        )
        return True
    return False
def main():
    """Command-line entry point: report and optionally delete unused images.

    Workflow:
      1. Parse ``--docs-dir``, ``--img-dir``, ``--delete``, ``--verbose`` and
         ``--skip-grep`` from the command line.
      2. Collect versioned image references from every markdown file.
      3. Collect versioned image files on disk.
      4. Treat files that are on disk but not referenced as unused; unless
         ``--skip-grep`` is given, re-check each candidate with grep and drop
         those for which a reference is found.
      5. Print a report and, with ``--delete``, remove the unused files.

    Exits with status 2 (via ``argparse``) when either directory argument
    does not name an existing directory. Without this guard a mistyped
    ``--docs-dir`` would yield zero references and every versioned image
    would be reported (and with ``--delete`` removed) as unused.
    """
    parser = argparse.ArgumentParser(
        description="Find and optionally delete unused versioned images in MkDocs project"
    )
    parser.add_argument(
        "--docs-dir", type=str, required=True, help="Path to the docs directory"
    )
    parser.add_argument(
        "--img-dir", type=str, required=True, help="Path to the images directory"
    )
    parser.add_argument("--delete", action="store_true", help="Delete unused images")
    parser.add_argument(
        "--verbose", action="store_true", help="Show all found references and files"
    )
    parser.add_argument(
        "--skip-grep", action="store_true", help="Skip the additional grep validation"
    )
    args = parser.parse_args()

    # Convert paths to absolute paths
    docs_dir = os.path.abspath(args.docs_dir)
    img_dir = os.path.abspath(args.img_dir)

    # Refuse to continue on a bad path: an empty scan result would make
    # every image look unused.
    for option, directory in (("--docs-dir", docs_dir), ("--img-dir", img_dir)):
        if not os.path.isdir(directory):
            parser.error(f"{option} is not a directory: {directory}")

    print(f"Scanning markdown files in: {docs_dir}")
    print(f"Looking for versioned images in: {img_dir}")

    # Get all markdown files
    md_files = scan_markdown_files(docs_dir)
    print(f"Found {len(md_files)} markdown files")

    # Collect all image references. Normalize them so that references written
    # with "/" compare equal to on-disk paths that use the platform separator.
    used_images = set()
    for md_file in md_files:
        refs = find_image_references(md_file)
        used_images.update(os.path.normpath(ref) for ref in refs)

    # Get all actual images (only from version directories)
    all_images = set(find_all_images(img_dir))

    if args.verbose:
        print("\nAll versioned image references found in markdown:")
        for img in sorted(used_images):
            print(f"- {img}")
        print("\nAll versioned images in directory:")
        for img in sorted(all_images):
            print(f"- {img}")

    # Find potentially unused images
    unused_images = all_images - used_images

    # Additional grep validation if not skipped
    if not args.skip_grep and unused_images:
        print("\nPerforming additional grep validation...")
        actually_unused = {
            img for img in sorted(unused_images) if not grep_check_image(docs_dir, img)
        }
        if len(actually_unused) != len(unused_images):
            print(
                f"\nGrep validation found {len(unused_images) - len(actually_unused)} additional image references!"
            )
        unused_images = actually_unused

    # Report findings
    print("\nResults:")
    print(f"Total versioned images found: {len(all_images)}")
    print(f"Versioned images referenced in markdown: {len(used_images)}")
    print(f"Unused versioned images: {len(unused_images)}")

    if not unused_images:
        print("\nNo unused versioned images found!")
        return

    print("\nUnused versioned images:")
    for img in sorted(unused_images):
        print(f"- {img}")

    if args.delete:
        print("\nDeleting unused versioned images...")
        for img in sorted(unused_images):
            full_path = os.path.join(img_dir, img)
            try:
                os.remove(full_path)
                print(f"Deleted: {img}")
            except OSError as e:
                # Keep going: one undeletable file should not stop the rest.
                print(f"Error deleting {img}: {e}")
        print("\nDeletion complete")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()