From 2f5f52fc3cb6024c57049c143a05b88e7d65b875 Mon Sep 17 00:00:00 2001
From: dwd
Date: Wed, 6 May 2026 20:57:17 +0800
Subject: [PATCH] feat(docker/Metax): add metax dockerfile and its requirements for ms-swift 4.1.x (#1689)

---
 docker/Metax/4.1/Dockerfile.metax             | 164 ++++++++++++++++++
 docker/Metax/4.1/Dockerfile.with_metax_image  |  73 ++++++++
 docker/Metax/4.1/build.sh                     |  13 ++
 docker/Metax/4.1/build_from_metax_image.sh    |  10 ++
 docker/Metax/4.1/override.txt                 |   5 +
 docker/Metax/4.1/requirements_extra.txt       |  14 ++
 .../Metax/4.1/swift_building_instructions.md  |  52 ++++++
 7 files changed, 331 insertions(+)
 create mode 100644 docker/Metax/4.1/Dockerfile.metax
 create mode 100644 docker/Metax/4.1/Dockerfile.with_metax_image
 create mode 100644 docker/Metax/4.1/build.sh
 create mode 100644 docker/Metax/4.1/build_from_metax_image.sh
 create mode 100644 docker/Metax/4.1/override.txt
 create mode 100644 docker/Metax/4.1/requirements_extra.txt
 create mode 100644 docker/Metax/4.1/swift_building_instructions.md

diff --git a/docker/Metax/4.1/Dockerfile.metax b/docker/Metax/4.1/Dockerfile.metax
new file mode 100644
index 00000000..07ec8205
--- /dev/null
+++ b/docker/Metax/4.1/Dockerfile.metax
@@ -0,0 +1,164 @@
+ARG BUILD_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:9.6
+ARG PYTHON_VERSION=3.12
+ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple
+ARG UV_TRUSTED_HOST=repos.metax-tech.com
+
+# may need passing a particular vllm version during build
+ARG VLLM_VERSION
+ARG MACA_VERSION
+ARG CU_BRIDGE_VERSION=${MACA_VERSION}
+
+#################### BASE BUILD IMAGE ####################
+FROM ${BUILD_BASE_IMAGE} AS base
+ARG UV_TRUSTED_HOST
+
+# maca environment variables
+ENV MACA_PATH=/opt/maca
+ENV MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin
+ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge"
+ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
+ENV CUCC_CMAKE_ENTRY=2
+ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
+ENV PATH=/opt/mxdriver/bin:${MACA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/tools/cu-bridge/tools:${MACA_PATH}/tools/cu-bridge/bin:${PATH}
+ENV LD_LIBRARY_PATH=/opt/mxdriver/lib:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/ucx/lib:${LD_LIBRARY_PATH}
+
+# uv environment variables
+ENV VIRTUAL_ENV=/opt/venv
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_HTTP_TIMEOUT=6000
+ENV UV_LINK_MODE=copy
+ARG UV_EXTRA_INDEX_URL
+ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL}
+ARG UV_INDEX_URL=https://mirrors.aliyun.com/pypi/simple
+ENV UV_INDEX_URL=${UV_INDEX_URL}
+ENV UV_TRUSTED_INDEX_HOST=mirrors.aliyun.com
+ENV UV_OVERRIDE=/workspace/override.txt
+
+# vllm compile option
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# AI version arguments
+ARG PYTHON_VERSION
+ARG VLLM_VERSION
+ARG VLLM_METAX_VERSION
+ARG MACA_VERSION
+ARG MEGATRON_VERSION
+ARG SWIFT_VERSION
+ARG CU_BRIDGE_VERSION
+ARG TE_VERSION
+
+WORKDIR /workspace
+COPY override.txt /workspace/override.txt
+COPY requirements_extra.txt /workspace/requirements_extra.txt
+
+RUN printf "[metax-centos]\n\
+name=Maca Driver Yum Repository\n\
+baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/\n\
+enabled=1\n\
+gpgcheck=0" > /etc/yum.repos.d/metax-driver-centos.repo
+
+RUN dnf -y install python3-pip hostname && \
+    dnf clean all
+
+RUN python3 -m pip install uv -i $UV_INDEX_URL --trusted-host ${UV_TRUSTED_INDEX_HOST} && \
+    uv venv /opt/venv --python=${PYTHON_VERSION}
+
+RUN python3 --version && \
+    uv self version
+
+RUN yum makecache && yum install -y \
+    unzip vim git openblas-devel make cmake \
+    ninja-build gcc g++ procps-ng \
+    libibverbs librdmacm libibumad \
+    && yum clean all
+
+RUN git clone --depth 1 --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git
+RUN git clone --depth 1 --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git
+RUN git clone --depth 1 --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git
+RUN git clone --depth 1 --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git
+
+# Step 1: install MACA SDK, Metax-Driver and cu-bridge
+# Metax-Driver mainly contains vbios and kmd files, which are not needed in a container.
+# Here we keep the mx-smi management tool. Kernel version mismatch errors are ignored.
+RUN yum makecache && \
+    yum install -y metax-driver-${MACA_VERSION}* mxgvm && \
+    yum clean all && rm -rf /var/cache/yum /tmp/*
+
+RUN printf "[maca-sdk]\n\
+name=Maca Sdk Yum Repository\n\
+baseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-$(uname -m)/\n\
+enabled=1\n\
+gpgcheck=0" > /etc/yum.repos.d/maca-sdk-rpm.repo
+
+RUN yum makecache && \
+    yum install -y maca_sdk-${MACA_VERSION}* && \
+    yum clean all && rm -rf /var/cache/yum /tmp/*
+
+RUN cd /tmp/ && \
+    export MACA_PATH=/opt/maca && \
+    curl -o ${CU_BRIDGE_VERSION}.zip -LsSf https://gitee.com/metax-maca/cu-bridge/repository/archive/${CU_BRIDGE_VERSION}.zip && \
+    unzip ${CU_BRIDGE_VERSION}.zip && \
+    mv cu-bridge-${CU_BRIDGE_VERSION} cu-bridge && \
+    chmod 755 cu-bridge -Rf && \
+    cd cu-bridge && \
+    mkdir build && cd build && \
+    cmake -DCMAKE_INSTALL_PREFIX=/opt/maca/tools/cu-bridge ../ && \
+    make && make install
+
+# Step 2: install build prerequisites
+RUN cd vLLM-metax && \
+    uv pip install -r requirements/build.txt && \
+    uv pip install build
+
+RUN yum makecache && yum install -y \
+    gcc \
+    binutils \
+    procps-ng \
+    libibverbs \
+    librdmacm \
+    libibumad \
+    openblas \
+    numactl-libs \
+    && yum clean all && rm -rf /var/cache/yum /tmp/*
+
+# Step 3: install Metax python requirements
+RUN cd vLLM-metax && \
+    UV_HTTP_TIMEOUT=960 uv pip install -r requirements/maca.txt --trusted-host ${UV_TRUSTED_HOST}
+
+# Step 4: build vLLM with empty device to avoid CUDA dependency
+RUN cd vllm && \
+    python3 use_existing_torch.py && \
+    uv pip install -r requirements/build.txt
+
+RUN cd vllm && \
+    VLLM_TARGET_DEVICE=empty uv pip install -v . --no-build-isolation
+
+# Step 5: build vLLM-metax
+RUN cd vLLM-metax && \
+    uv pip install -r requirements/build.txt && \
+    python3 -m build -w -n && \
+    uv pip install dist/*.whl
+
+# Step 6: install Megatron-LM
+RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py && \
+    cd Megatron-LM && \
+    uv pip install .
+
+# Step 7: install transformer-engine
+RUN uv pip install transformer_engine==${TE_VERSION} -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host ${UV_TRUSTED_HOST}
+
+# Step 8: patch and install ms-swift v4.1.0 with Megatron extra dependencies
+RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py && \
+    cd ms-swift && \
+    uv pip install '.[megatron]'
+
+# Step 9: install optional runtime dependencies used by swift 4.1.0
+RUN uv pip install deepspeed -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host ${UV_TRUSTED_HOST}
+RUN uv pip install pip
+RUN uv pip install -r requirements_extra.txt
+RUN ln -sf ${CUDA_PATH}/bin/nvcc ${CUDA_PATH}/bin/cucc
+
+# vllm installation may bring in incompatible CUDA-only wheels. Remove them here.
+RUN uv pip uninstall flashinfer-python cupy-cuda12x flash-linear-attention fla-core
+
+#################### FINAL IMAGE ####################
diff --git a/docker/Metax/4.1/Dockerfile.with_metax_image b/docker/Metax/4.1/Dockerfile.with_metax_image
new file mode 100644
index 00000000..296f5668
--- /dev/null
+++ b/docker/Metax/4.1/Dockerfile.with_metax_image
@@ -0,0 +1,73 @@
+ARG BUILD_BASE_IMAGE=mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:4.0.4-maca.ai3.5.3.5-torch2.8-py312-ubuntu22.04-amd64
+ARG PYTHON_VERSION=3.12
+
+FROM ${BUILD_BASE_IMAGE} AS base
+
+# NOTE:
+# This fast-build path inherits Python/Torch/TE from a prebuilt Metax release image.
+# We keep the verified base image tag here instead of guessing a newer one.
+# As a result, this path may lag behind the Megatron-SWIFT Quick Start recommendations.
+
+# may need passing a particular vllm version during build
+ARG VLLM_VERSION
+ARG VLLM_METAX_VERSION
+ARG MEGATRON_VERSION
+ARG SWIFT_VERSION
+
+ENV MACA_PATH=/opt/maca
+ENV CUCC_CMAKE_ENTRY=2
+ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
+ENV CUCC_PATH=${MACA_PATH}/tools/cu-bridge
+ENV PATH=/opt/conda/bin:/opt/conda/condabin:${CUDA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${MACA_PATH}/bin:${PATH}
+ENV LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}
+
+WORKDIR /workspace
+COPY requirements_extra.txt /workspace/requirements_extra.txt
+
+RUN echo $PATH
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+# Initialize cu-bridge if it is not already prepared in the base image.
+RUN if [ ! -d /root/cu-bridge ]; then \
+        ${MACA_PATH}/tools/cu-bridge/tools/pre_make; \
+    fi
+
+# Clone all GitHub sources while the external proxy is enabled.
+RUN rm -rf /workspace/ms-swift /workspace/vLLM-metax /workspace/vllm /workspace/Megatron-LM
+RUN git clone --depth 1 --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git
+RUN git clone --depth 1 --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git
+RUN git clone --depth 1 --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git
+RUN git clone --depth 1 --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git
+
+# install cmake
+RUN pip install cmake ninja
+
+# Step 1: build original vLLM for torch setup
+RUN cd vllm && \
+    python3 use_existing_torch.py && \
+    pip install -r requirements/build.txt
+
+# Step 2: build vLLM with empty device to avoid CUDA dependency
+RUN cd vllm && \
+    VLLM_TARGET_DEVICE=empty pip install -v . --no-build-isolation
+
+# Step 3: build vLLM-metax
+RUN cd vLLM-metax && \
+    python3 use_existing_metax.py && \
+    pip install -r requirements/build.txt && \
+    python3 -m build -w -n && \
+    pip install dist/*.whl
+
+# Step 4: patch and install Megatron-LM
+RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py && \
+    cd /workspace/Megatron-LM && \
+    pip install .
+
+# Step 5: patch and install ms-swift v4.1.0 with its Megatron extra
+RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py && \
+    cd ms-swift && \
+    pip install "transformers<5.4.0" && \
+    pip install '.[megatron]' && \
+    pip install -r /workspace/requirements_extra.txt
+
+CMD ["bash"]
diff --git a/docker/Metax/4.1/build.sh b/docker/Metax/4.1/build.sh
new file mode 100644
index 00000000..6c112b53
--- /dev/null
+++ b/docker/Metax/4.1/build.sh
@@ -0,0 +1,13 @@
+docker build \
+    --network host \
+    -f Dockerfile.metax \
+    -t swift:v4.1.0 \
+    --build-arg VLLM_VERSION=v0.17.1 \
+    --build-arg VLLM_METAX_VERSION=v0.17.0 \
+    --build-arg MACA_VERSION=3.5.3 \
+    --build-arg MEGATRON_VERSION=core_v0.16.0 \
+    --build-arg SWIFT_VERSION=v4.1.0 \
+    --build-arg TE_VERSION=2.8.0 \
+    --build-arg CU_BRIDGE_VERSION=3.5.3 \
+    --no-cache \
+    .
diff --git a/docker/Metax/4.1/build_from_metax_image.sh b/docker/Metax/4.1/build_from_metax_image.sh
new file mode 100644
index 00000000..3c2fb454
--- /dev/null
+++ b/docker/Metax/4.1/build_from_metax_image.sh
@@ -0,0 +1,10 @@
+docker build \
+    --network host \
+    -f Dockerfile.with_metax_image \
+    -t swift:v4.1.0 \
+    --build-arg VLLM_VERSION=v0.17.1 \
+    --build-arg VLLM_METAX_VERSION=v0.17.0 \
+    --build-arg MEGATRON_VERSION=core_v0.16.0 \
+    --build-arg SWIFT_VERSION=v4.1.0 \
+    --no-cache \
+    .
diff --git a/docker/Metax/4.1/override.txt b/docker/Metax/4.1/override.txt
new file mode 100644
index 00000000..4a3f3b38
--- /dev/null
+++ b/docker/Metax/4.1/override.txt
@@ -0,0 +1,5 @@
+setuptools>=77.0.3,<80
+datasets>=3.0,<4.0
+flash-linear-attention
+mcoplib
+transformers<5.4.0
diff --git a/docker/Metax/4.1/requirements_extra.txt b/docker/Metax/4.1/requirements_extra.txt
new file mode 100644
index 00000000..3480e1d9
--- /dev/null
+++ b/docker/Metax/4.1/requirements_extra.txt
@@ -0,0 +1,14 @@
+decord
+diffusers==0.35.2
+evalscope>=1.0.0
+evalscope[opencompass]
+evalscope[vlmeval]
+keye_vl_utils>=1.5.2
+librosa
+mpi4py
+optimum==1.27.0
+pytorchvideo
+qwen_omni_utils>=0.0.9
+qwen_vl_utils==0.0.14
+soundfile
+timm
diff --git a/docker/Metax/4.1/swift_building_instructions.md b/docker/Metax/4.1/swift_building_instructions.md
new file mode 100644
index 00000000..c7a33091
--- /dev/null
+++ b/docker/Metax/4.1/swift_building_instructions.md
@@ -0,0 +1,52 @@
+# 1. Build swift 4.1 image from a UBI9 base image
+  Full build from a minimal base image, using a venv virtual environment.
+
+## 1.1. Build
+  ``` bash
+  bash build.sh
+  ```
+
+## 1.2. Run a container
+  ``` bash
+  docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
+    --shm-size 100gb --ulimit memlock=-1 \
+    --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
+    --device=/dev/dri --device=/dev/mxcd \
+    --name base_image \
+    ${IMAGE_ID} bash
+  ```
+
+## 1.3. Activate the venv environment
+  ``` bash
+  source /opt/venv/bin/activate
+  ```
+
+## 1.4. Run swift examples
+  ``` bash
+  cd /workspace/ms-swift
+  bash examples/train/full/train.sh
+  ```
+
+# 2. Build swift 4.1 image from a Metax release image
+  Faster build based on the pre-built Metax release image.
+
+## 2.1. Build
+  ``` bash
+  bash build_from_metax_image.sh
+  ```
+
+## 2.2. Run a container
+  ``` bash
+  docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
+    --shm-size 100gb --ulimit memlock=-1 \
+    --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
+    --device=/dev/dri --device=/dev/mxcd \
+    --name base_image \
+    ${IMAGE_ID} bash
+  ```
+
+## 2.3. Run swift examples
+  ``` bash
+  cd /workspace/ms-swift
+  bash examples/train/full/train.sh
+  ```