From 9ed8f50d404790a3bc71cafc39bbe05d88ac537a Mon Sep 17 00:00:00 2001 From: Classic298 <27028174+Classic298@users.noreply.github.com> Date: Thu, 5 Feb 2026 21:11:00 +0100 Subject: [PATCH] fix: bundle NLTK punkt_tab in Docker image for airgapped environments (#21165) Pre-download NLTK punkt_tab during Docker build instead of at runtime. This fixes document extraction failures in offline/airgapped environments where the container cannot download the tokenizer data after restarts. Fixes #21150 --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index aa0bbf6f04..1608a24afe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -143,6 +143,7 @@ RUN pip3 install --no-cache-dir uv && \ python -c "import os; from sentence_transformers import SentenceTransformer; SentenceTransformer(os.environ.get('AUXILIARY_EMBEDDING_MODEL', 'TaylorAI/bge-micro-v2'), device='cpu')" && \ python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])"; \ python -c "import os; import tiktoken; tiktoken.get_encoding(os.environ['TIKTOKEN_ENCODING_NAME'])"; \ + python -c "import nltk; nltk.download('punkt_tab')"; \ else \ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir && \ uv pip install --system -r requirements.txt --no-cache-dir && \ @@ -151,6 +152,7 @@ RUN pip3 install --no-cache-dir uv && \ python -c "import os; from sentence_transformers import SentenceTransformer; SentenceTransformer(os.environ.get('AUXILIARY_EMBEDDING_MODEL', 'TaylorAI/bge-micro-v2'), device='cpu')" && \ python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])"; \ python -c "import os; import tiktoken; tiktoken.get_encoding(os.environ['TIKTOKEN_ENCODING_NAME'])"; \ + python -c "import nltk; nltk.download('punkt_tab')"; \ fi; \ fi; \ mkdir -p /app/backend/data && chown -R $UID:$GID /app/backend/data/ && \