From 2f3ac0fcd22a54050b65a9b3428f4980f98644f2 Mon Sep 17 00:00:00 2001
From: Jintao
Date: Tue, 16 Sep 2025 12:49:29 +0800
Subject: [PATCH] [docker] update swift docker (#1502)

---
Reviewer notes (free text between the "---" separator and the diff; it is
not part of the commit message and carries no patch content).

docker/Dockerfile.ubuntu, INSTALL_MEGATRON_DEPS branch:
  * adds an install step for "sglang[all]<0.5", "math_verify==0.5.2" and
    "gradio<5.33";
  * adds wandb and swanlab to the tooling install and caps transformers
    (<4.57) and trl (<0.21);
  * replaces the TransformerEngine git@stable install with
    `pip install --no-build-isolation transformer_engine[pytorch]`;
  * checks out apex commit e13873debc4699d39c6861074b9a3b2a02327f92 before
    building it;
  * installs Megatron-LM from the core_r0.13.0 ref as the last step.

docker/build_image.py, SwiftImageBuilder:
  * new init_args() assigns a default to each of base_image, cuda_version,
    torch_version, vllm_version, lmdeploy_version and flashattn_version
    when the incoming value is falsy, then returns super().init_args(args);
  * torchaudio_version / torchvision_version look like they are assigned
    only together with the torch_version default (same `if` body) -
    TODO confirm against the committed file;
  * generate_dockerfile() no longer appends the deepspeed==0.14.5 install
    and adds py-spy to the remaining pip line.

NOTE(review): the line layout below was reconstructed from a copy in which
all newlines and indentation had been collapsed. Hunk line counts agree
with the @@ headers, but leading whitespace, blank context lines and the
wrapping of the `extra_content.replace(...)` context line are best-effort.
Apply with `git apply --ignore-whitespace`, or regenerate the patch from
the commit with `git format-patch -1 2f3ac0fc`.

 docker/Dockerfile.ubuntu |  8 +++++---
 docker/build_image.py    | 20 ++++++++++++++++++--
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu
index f72e4cc9..e889c8d7 100644
--- a/docker/Dockerfile.ubuntu
+++ b/docker/Dockerfile.ubuntu
@@ -79,13 +79,15 @@ RUN if [ "$INSTALL_MS_DEPS" = "True" ]; then \
     pip install --no-cache-dir huggingface-hub transformers peft -U; \
 fi; \
 if [ "$INSTALL_MEGATRON_DEPS" = "True" ]; then \
-    pip install liger_kernel nvitop pre-commit transformers huggingface-hub -U && \
+    pip install "sglang[all]<0.5" "math_verify==0.5.2" "gradio<5.33" -U && \
+    pip install liger_kernel wandb swanlab nvitop pre-commit "transformers<4.57" "trl<0.21" huggingface-hub -U && \
     SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") && echo $SITE_PACKAGES && \
     CUDNN_PATH=$SITE_PACKAGES/nvidia/cudnn CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \
-    pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable; \
+    pip install --no-build-isolation transformer_engine[pytorch]; \
     cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone https://github.com/NVIDIA/apex && \
-    cd apex && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
+    cd apex && git checkout e13873debc4699d39c6861074b9a3b2a02327f92 && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
     cd / && rm -fr /tmp/apex && pip cache purge; \
+    pip install git+https://github.com/NVIDIA/Megatron-LM.git@core_r0.13.0; \
 fi
 
 # install nvm and set node version to 18
diff --git a/docker/build_image.py b/docker/build_image.py
index e1b6ca14..07c616f4 100644
--- a/docker/build_image.py
+++ b/docker/build_image.py
@@ -344,6 +344,23 @@ class LLMImageBuilder(Builder):
 
 class SwiftImageBuilder(LLMImageBuilder):
 
+    def init_args(self, args) -> Any:
+        if not args.base_image:
+            args.base_image = 'nvidia/cuda:12.6.3-devel-ubuntu22.04'
+        if not args.cuda_version:
+            args.cuda_version = '12.6.3'
+        if not args.torch_version:
+            args.torch_version = '2.7.1'
+            args.torchaudio_version = '2.7.1'
+            args.torchvision_version = '0.22.1'
+        if not args.vllm_version:
+            args.vllm_version = '0.10.1.1'
+        if not args.lmdeploy_version:
+            args.lmdeploy_version = '0.9.2.post1'
+        if not args.flashattn_version:
+            args.flashattn_version = '2.7.4.post1'
+        return super().init_args(args)
+
     def generate_dockerfile(self) -> str:
         meta_file = './docker/install.sh'
         with open('docker/Dockerfile.extra_install', 'r') as f:
@@ -351,8 +368,7 @@ class SwiftImageBuilder(LLMImageBuilder):
             extra_content = extra_content.replace('{python_version}',
                                                   self.args.python_version)
         extra_content += """
-RUN pip install --no-cache-dir deepspeed==0.14.5 --no-deps && \
-    pip install --no-cache-dir -U icecream soundfile pybind11
+RUN pip install --no-cache-dir -U icecream soundfile pybind11 py-spy
 """
         version_args = (
             f'{self.args.torch_version} {self.args.torchvision_version} {self.args.torchaudio_version} '