# talemate/docs/cleanup.py

import os
import re
import subprocess
import argparse


def find_image_references(md_file):
    """Find all versioned image references in a markdown file.

    Scans the file for inline markdown images (``![alt](target)``). For every
    target that contains ``img/``, the part after the first ``img/`` is kept,
    provided its first path component looks like a version number (only
    digits and dots, e.g. ``0.29.0``) and at least one more component follows.

    An optional link title (``![alt](path "Title")``) and surrounding angle
    brackets (``![alt](<path>)``) are removed before the path is examined, so
    they do not end up in the returned path.

    Args:
        md_file: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        A list of image paths relative to the image directory, normalized
        with ``os.path.normpath`` so they use the same separator and form as
        paths produced by ``os.path.relpath``. Duplicates are preserved.
    """
    with open(md_file, "r", encoding="utf-8") as f:
        content = f.read()

    pattern = r"!\[.*?\]\((.*?)\)"

    cleaned_paths = []
    for match in re.findall(pattern, content):
        # Drop a trailing quoted title; only a quoted suffix is removed so
        # file names that themselves contain spaces are left untouched.
        target = re.sub(r"""\s+(?:"[^"]*"|'[^']*')\s*$""", "", match.strip())
        # Markdown allows the destination to be wrapped in <...>.
        if target.startswith("<") and target.endswith(">"):
            target = target[1:-1]

        path = target.lstrip("/")
        if "img/" not in path:
            continue

        # Keep what follows the first "img/" and normalize it so that it can
        # be compared with paths obtained from the file system.
        path = os.path.normpath(path[path.index("img/") + 4 :])

        # Only keep references to versioned images
        parts = path.split(os.sep)
        if len(parts) >= 2 and parts[0].replace(".", "").isdigit():
            cleaned_paths.append(path)

    return cleaned_paths


def scan_markdown_files(docs_dir):
    """Return the paths of all ``.md`` files found beneath ``docs_dir``.

    The directory tree is walked with ``os.walk``; each returned path is the
    walked directory joined with the file name, in walk order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(docs_dir)
        for name in names
        if name.endswith(".md")
    ]


def find_all_images(img_dir):
    """Collect image files stored under version-named subdirectories.

    Walks ``img_dir`` and keeps files whose extension (case-insensitive) is
    one of png, jpg, jpeg, gif or svg, but only when the top-level directory
    below ``img_dir`` consists solely of digits and dots (e.g. ``0.29.0``).
    Files that sit directly in ``img_dir`` are ignored.

    Returns:
        A list of paths relative to ``img_dir``.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".gif", ".svg")
    collected = []

    for folder, _subdirs, names in os.walk(img_dir):
        relative_folder = os.path.relpath(folder, img_dir)
        # First path component below img_dir decides whether this subtree
        # belongs to a version directory.
        top_level = relative_folder.split(os.sep)[0]

        if relative_folder == "." or not top_level.replace(".", "").isdigit():
            continue

        collected.extend(
            os.path.relpath(os.path.join(folder, name), img_dir)
            for name in names
            if name.lower().endswith(image_suffixes)
        )

    return collected


def _escape_grep_bre(text):
    """Backslash-escape the characters that are special in a grep basic regex."""
    return re.sub(r"[.\[\\*^$]", lambda m: "\\" + m.group(0), text)


def grep_check_image(docs_dir, image_path):
    """
    Check if versioned image is referenced anywhere using grep.

    The first component of ``image_path`` is taken as the version and the
    last component as the file name. ``grep -r -l`` is run over ``docs_dir``
    looking for a line that contains the version followed, anywhere later on
    the same line, by the file name. Both parts are matched literally.

    Returns True if any reference is found, False otherwise. False is also
    returned (after printing a message) when grep cannot be started or exits
    with an error status without reporting any match.
    """
    try:
        # Split the image path to get version and filename
        parts = os.path.normpath(image_path).split(os.sep)
        version = parts[0]  # e.g., "0.29.0"
        filename = parts[-1]  # e.g., "world-state-suggestions-2.png"

        # For versioned images, require both version and filename to match.
        # Escape both so that "." and friends are not treated as regex
        # operators (which would give false matches or a grep syntax error).
        version_pattern = (
            f"{_escape_grep_bre(version)}.*{_escape_grep_bre(filename)}"
        )

        # "-e" keeps a pattern starting with "-" from being read as an option.
        result = subprocess.run(
            ["grep", "-r", "-l", "-e", version_pattern, "--", docs_dir],
            capture_output=True,
            text=True,
        )
    except Exception as e:
        print(f"Error during grep check for {image_path}: {e}")
        return False

    if result.stdout.strip():
        print(
            f"Found reference to {image_path} with version pattern: {version_pattern}"
        )
        return True

    # grep exits with 1 for "no match"; anything higher is a real error.
    if result.returncode > 1:
        print(
            f"Error during grep check for {image_path}: {result.stderr.strip()}"
        )

    return False


def main():
    """Command-line entry point: report and optionally delete unused images.

    Parses ``--docs-dir``, ``--img-dir``, ``--delete``, ``--verbose`` and
    ``--skip-grep``; compares the versioned images referenced from markdown
    files with the versioned images present on disk; optionally re-checks the
    leftovers with grep; prints a summary; and, when ``--delete`` is given,
    removes the images that are still considered unused.
    """
    cli = argparse.ArgumentParser(
        description="Find and optionally delete unused versioned images in MkDocs project"
    )
    cli.add_argument(
        "--docs-dir", type=str, required=True, help="Path to the docs directory"
    )
    cli.add_argument(
        "--img-dir", type=str, required=True, help="Path to the images directory"
    )
    cli.add_argument("--delete", action="store_true", help="Delete unused images")
    cli.add_argument(
        "--verbose", action="store_true", help="Show all found references and files"
    )
    cli.add_argument(
        "--skip-grep", action="store_true", help="Skip the additional grep validation"
    )
    options = cli.parse_args()

    # Work with absolute paths from here on
    docs_dir = os.path.abspath(options.docs_dir)
    img_dir = os.path.abspath(options.img_dir)

    print(f"Scanning markdown files in: {docs_dir}")
    print(f"Looking for versioned images in: {img_dir}")

    markdown_paths = scan_markdown_files(docs_dir)
    print(f"Found {len(markdown_paths)} markdown files")

    # Every versioned image mentioned in any markdown file
    referenced = set()
    for markdown_path in markdown_paths:
        referenced.update(find_image_references(markdown_path))

    # Every versioned image that exists on disk
    on_disk = set(find_all_images(img_dir))

    if options.verbose:
        listings = (
            ("\nAll versioned image references found in markdown:", referenced),
            ("\nAll versioned images in directory:", on_disk),
        )
        for heading, paths in listings:
            print(heading)
            for entry in sorted(paths):
                print(f"- {entry}")

    # Images on disk that no markdown image link points at
    candidates = on_disk - referenced

    # Second opinion from grep, unless the user opted out
    if candidates and not options.skip_grep:
        print("\nPerforming additional grep validation...")
        confirmed = {
            entry for entry in candidates if not grep_check_image(docs_dir, entry)
        }

        rescued = len(candidates) - len(confirmed)
        if rescued:
            print(f"\nGrep validation found {rescued} additional image references!")
        candidates = confirmed

    # Summary
    print("\nResults:")
    print(f"Total versioned images found: {len(on_disk)}")
    print(f"Versioned images referenced in markdown: {len(referenced)}")
    print(f"Unused versioned images: {len(candidates)}")

    if not candidates:
        print("\nNo unused versioned images found!")
        return

    print("\nUnused versioned images:")
    for entry in sorted(candidates):
        print(f"- {entry}")

    if not options.delete:
        return

    print("\nDeleting unused versioned images...")
    for entry in candidates:
        target = os.path.join(img_dir, entry)
        try:
            os.remove(target)
            print(f"Deleted: {entry}")
        except Exception as e:
            print(f"Error deleting {entry}: {e}")
    print("\nDeletion complete")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()