feat(docker/Metax): add metax dockerfile and its requirements for ms-swift 4.1.x (#1689)

2026-05-18 05:05:00 +02:00 · 2026-05-06 20:57:17 +08:00
parent 68ab75af24
commit 2f5f52fc3c
7 changed files with 331 additions and 0 deletions
--- a/docker/Metax/4.1/Dockerfile.metax
+++ b/docker/Metax/4.1/Dockerfile.metax
@@ -0,0 +1,164 @@
+ARG BUILD_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:9.6
+ARG PYTHON_VERSION=3.12
+ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple
+ARG UV_TRUSTED_HOST=repos.metax-tech.com
+
+# Version pins without a default: pass them with --build-arg. CU_BRIDGE_VERSION falls back to MACA_VERSION.
+ARG VLLM_VERSION
+ARG MACA_VERSION
+ARG CU_BRIDGE_VERSION=${MACA_VERSION}
+
+#################### BASE BUILD IMAGE ####################
+FROM ${BUILD_BASE_IMAGE} AS base
+ARG UV_TRUSTED_HOST
+
+# MACA SDK and cu-bridge locations; the venv, driver and MACA tool/library directories are prepended to PATH and LD_LIBRARY_PATH
+ENV MACA_PATH=/opt/maca
+ENV MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin
+ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge"
+ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
+ENV CUCC_CMAKE_ENTRY=2
+ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
+ENV PATH=/opt/mxdriver/bin:${MACA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/tools/cu-bridge/tools:${MACA_PATH}/tools/cu-bridge/bin:${PATH}
+ENV LD_LIBRARY_PATH=/opt/mxdriver/lib:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/ucx/lib:${LD_LIBRARY_PATH}
+
+# uv environment variables (UV_INDEX_URL can be overridden with --build-arg; the default is the Aliyun mirror)
+ENV VIRTUAL_ENV=/opt/venv
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_HTTP_TIMEOUT=6000
+ENV UV_LINK_MODE=copy
+ARG UV_EXTRA_INDEX_URL
+ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL}
+ARG UV_INDEX_URL=https://mirrors.aliyun.com/pypi/simple
+ENV UV_INDEX_URL=${UV_INDEX_URL}
+ENV UV_TRUSTED_INDEX_HOST=mirrors.aliyun.com
+ENV UV_OVERRIDE=/workspace/override.txt
+
+# Build-time switch exported to later build steps (NOTE(review): presumably enables vLLM's Punica kernels - confirm)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Build arguments read by the RUN steps of this stage; ARGs declared before FROM must be re-declared here to be visible
+ARG PYTHON_VERSION
+ARG VLLM_VERSION
+ARG VLLM_METAX_VERSION
+ARG MACA_VERSION
+ARG MEGATRON_VERSION
+ARG SWIFT_VERSION
+ARG CU_BRIDGE_VERSION
+ARG TE_VERSION
+
+WORKDIR /workspace
+COPY override.txt /workspace/override.txt
+COPY requirements_extra.txt /workspace/requirements_extra.txt
+
+RUN printf "[metax-centos]\n\
+name=Maca Driver Yum Repository\n\
+baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/\n\
+enabled=1\n\
+gpgcheck=0" > /etc/yum.repos.d/metax-driver-centos.repo
+
+RUN dnf -y install python3-pip hostname && \
+    dnf clean all
+
+RUN python3 -m pip install --no-cache-dir uv -i "${UV_INDEX_URL}" --trusted-host "${UV_TRUSTED_INDEX_HOST}" && \
+    uv venv /opt/venv --python=${PYTHON_VERSION}
+
+RUN python3 --version && \
+    uv self version
+
+RUN yum makecache && yum install -y \
+    cmake g++ gcc git make ninja-build \
+    openblas-devel procps-ng unzip vim \
+    libibumad libibverbs librdmacm \
+    && yum clean all
+
+RUN git clone --branch ${SWIFT_VERSION} --depth 1 https://github.com/modelscope/ms-swift.git
+RUN git clone --branch ${VLLM_METAX_VERSION} --depth 1 https://github.com/MetaX-MACA/vLLM-metax.git
+RUN git clone --branch ${VLLM_VERSION} --depth 1 https://github.com/vllm-project/vllm.git
+RUN git clone --branch ${MEGATRON_VERSION} --depth 1 https://github.com/NVIDIA/Megatron-LM.git
+
+# Step 1: install MACA SDK, Metax-Driver and cu-bridge
+# Metax-Driver mainly contains vbios and kmd files, which are not needed in a container.
+# It is installed anyway to keep the mx-smi management tool. NOTE(review): nothing here explicitly ignores kernel version mismatch errors - confirm.
+RUN yum makecache && \
+    yum install -y metax-driver-${MACA_VERSION}* mxgvm && \
+    yum clean all && rm -rf /var/cache/yum /tmp/*
+
+RUN printf "[maca-sdk]\n\
+name=Maca Sdk Yum Repository\n\
+baseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-$(uname -m)/\n\
+enabled=1\n\
+gpgcheck=0" > /etc/yum.repos.d/maca-sdk-rpm.repo
+
+RUN yum makecache && \
+    yum install -y maca_sdk-${MACA_VERSION}* && \
+    yum clean all && rm -rf /var/cache/yum /tmp/*
+
+# Build cu-bridge from source (MACA_PATH is already set via ENV) and delete the archive and build tree in the same layer.
+RUN cd /tmp/ && \
+    curl -o ${CU_BRIDGE_VERSION}.zip -LsSf https://gitee.com/metax-maca/cu-bridge/repository/archive/${CU_BRIDGE_VERSION}.zip && \
+    unzip ${CU_BRIDGE_VERSION}.zip && \
+    mv cu-bridge-${CU_BRIDGE_VERSION} cu-bridge && \
+    chmod 755 cu-bridge -Rf && \
+    cd cu-bridge && \
+    mkdir build && cd build && \
+    cmake -DCMAKE_INSTALL_PREFIX=/opt/maca/tools/cu-bridge ../ && \
+    make && make install && cd / && rm -rf /tmp/cu-bridge /tmp/${CU_BRIDGE_VERSION}.zip
+
+# Step 2: install the vLLM-metax build requirements, the `build` frontend and additional system libraries
+RUN cd vLLM-metax && \
+    uv pip install -r requirements/build.txt && \
+    uv pip install build
+
+RUN yum makecache && yum install -y \
+    gcc \
+    binutils \
+    procps-ng \
+    libibverbs \
+    librdmacm \
+    libibumad \
+    openblas \
+    numactl-libs \
+    && yum clean all && rm -rf /var/cache/yum /tmp/*
+
+# Step 3: install Metax python requirements (UV_HTTP_TIMEOUT is lowered from the stage-wide 6000 to 960 for this command)
+RUN cd vLLM-metax && \
+    UV_HTTP_TIMEOUT=960 uv pip install -r requirements/maca.txt --trusted-host ${UV_TRUSTED_HOST}
+
+# Step 4: run vLLM's use_existing_torch.py, install its build requirements, then build vLLM with VLLM_TARGET_DEVICE=empty to avoid CUDA dependency
+RUN cd vllm && \
+    python3 use_existing_torch.py && \
+    uv pip install -r requirements/build.txt
+
+RUN cd vllm && \
+    VLLM_TARGET_DEVICE=empty uv pip install -v . --no-build-isolation
+
+# Step 5: build the vLLM-metax wheel (-w) without build isolation (-n) and install it
+RUN cd vLLM-metax && \
+    uv pip install -r requirements/build.txt && \
+    python3 -m build -w -n && \
+    uv pip install dist/*.whl
+
+# Step 6: install Megatron-LM (the sed replaces every "nvcc" with "cucc" in the patched file)
+RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py && \
+    cd Megatron-LM && \
+    uv pip install .
+
+# Step 7: install transformer-engine from the index configured in UV_EXTRA_INDEX_URL (Metax index by default)
+RUN uv pip install transformer_engine==${TE_VERSION} -i "${UV_EXTRA_INDEX_URL}" --trusted-host ${UV_TRUSTED_HOST}
+
+# Step 8: insert `import vllm_metax.patch` before the first import of swift/__init__.py, then install ms-swift with its Megatron extra
+RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py && \
+    cd ms-swift && \
+    uv pip install '.[megatron]'
+
+# Step 9: install optional runtime dependencies used by swift 4.1.0 (deepspeed comes from UV_EXTRA_INDEX_URL)
+RUN uv pip install deepspeed -i "${UV_EXTRA_INDEX_URL}" --trusted-host ${UV_TRUSTED_HOST}
+RUN uv pip install pip
+RUN uv pip install -r requirements_extra.txt
+RUN ln -sf ${CUDA_PATH}/bin/nvcc ${CUDA_PATH}/bin/cucc
+
+# vllm installation may bring in incompatible CUDA-only wheels. Remove them here.
+RUN uv pip uninstall flashinfer-python cupy-cuda12x flash-linear-attention fla-core
+
+#################### END OF BASE STAGE (no separate final image stage is defined) ####################
--- a/docker/Metax/4.1/Dockerfile.with_metax_image
+++ b/docker/Metax/4.1/Dockerfile.with_metax_image
@@ -0,0 +1,73 @@
+ARG BUILD_BASE_IMAGE=mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:4.0.4-maca.ai3.5.3.5-torch2.8-py312-ubuntu22.04-amd64
+ARG PYTHON_VERSION=3.12
+
+FROM ${BUILD_BASE_IMAGE} AS base
+
+# NOTE:
+# This fast-build path inherits Python/Torch/TE from a prebuilt Metax release image.
+# We keep the verified base image tag here instead of guessing a newer one.
+# As a result, this path may lag behind the Megatron-SWIFT Quick Start recommendations.
+
+# Git tags/branches of the sources cloned below; they have no defaults and must be passed with --build-arg
+ARG VLLM_VERSION
+ARG VLLM_METAX_VERSION
+ARG MEGATRON_VERSION
+ARG SWIFT_VERSION
+
+ENV MACA_PATH=/opt/maca
+ENV CUCC_CMAKE_ENTRY=2
+ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
+ENV CUCC_PATH=${MACA_PATH}/tools/cu-bridge
+ENV PATH=/opt/conda/bin:/opt/conda/condabin:${CUDA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${MACA_PATH}/bin:${PATH}
+ENV LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}
+
+WORKDIR /workspace
+COPY requirements_extra.txt /workspace/requirements_extra.txt
+
+RUN echo $PATH
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+# Run cu-bridge's pre_make only when /root/cu-bridge does not exist yet (i.e. the base image has not prepared it).
+RUN if [ ! -d /root/cu-bridge ]; then \
+        ${MACA_PATH}/tools/cu-bridge/tools/pre_make; \
+    fi
+
+# Remove any checkouts inherited from the base image, then clone the pinned sources from GitHub.
+RUN rm -rf /workspace/ms-swift /workspace/vLLM-metax /workspace/vllm /workspace/Megatron-LM
+RUN git clone --depth 1 --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git
+RUN git clone --depth 1 --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git
+RUN git clone --depth 1 --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git
+RUN git clone --depth 1 --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git
+
+# install cmake and ninja
+RUN pip install --no-cache-dir cmake ninja
+
+# Step 1: run vLLM's use_existing_torch.py and install its build requirements
+RUN cd vllm && \
+    python3 use_existing_torch.py && \
+    pip install --no-cache-dir -r requirements/build.txt
+
+# Step 2: build vLLM with empty device to avoid CUDA dependency
+RUN cd vllm && \
+    VLLM_TARGET_DEVICE=empty pip install --no-cache-dir -v . --no-build-isolation
+
+# Step 3: build vLLM-metax; the `build` frontend used by `python3 -m build` is installed explicitly
+RUN cd vLLM-metax && \
+    python3 use_existing_metax.py && \
+    pip install --no-cache-dir -r requirements/build.txt build && \
+    python3 -m build -w -n && \
+    pip install --no-cache-dir dist/*.whl
+
+# Step 4: patch (nvcc -> cucc) and install Megatron-LM
+RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py && \
+    cd /workspace/Megatron-LM && \
+    pip install --no-cache-dir .
+
+# Step 5: insert `import vllm_metax.patch` before the first import of swift/__init__.py, then install ms-swift v4.1.0 with its Megatron extra
+RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py && \
+    cd ms-swift && \
+    pip install --no-cache-dir "transformers<5.4.0" && \
+    pip install --no-cache-dir '.[megatron]' && \
+    pip install --no-cache-dir -r /workspace/requirements_extra.txt
+
+CMD ["bash"]
--- a/docker/Metax/4.1/build.sh
+++ b/docker/Metax/4.1/build.sh
@@ -0,0 +1,13 @@
+docker build \
+    --no-cache \
+    --network host \
+    --file Dockerfile.metax \
+    --tag swift:v4.1.0 \
+    --build-arg SWIFT_VERSION=v4.1.0 \
+    --build-arg MEGATRON_VERSION=core_v0.16.0 \
+    --build-arg VLLM_VERSION=v0.17.1 \
+    --build-arg VLLM_METAX_VERSION=v0.17.0 \
+    --build-arg TE_VERSION=2.8.0 \
+    --build-arg MACA_VERSION=3.5.3 \
+    --build-arg CU_BRIDGE_VERSION=3.5.3 \
+    .
--- a/docker/Metax/4.1/build_from_metax_image.sh
+++ b/docker/Metax/4.1/build_from_metax_image.sh
@@ -0,0 +1,10 @@
+docker build \
+    --no-cache \
+    --network host \
+    --file Dockerfile.with_metax_image \
+    --tag swift:v4.1.0 \
+    --build-arg SWIFT_VERSION=v4.1.0 \
+    --build-arg MEGATRON_VERSION=core_v0.16.0 \
+    --build-arg VLLM_VERSION=v0.17.1 \
+    --build-arg VLLM_METAX_VERSION=v0.17.0 \
+    .
--- a/docker/Metax/4.1/override.txt
+++ b/docker/Metax/4.1/override.txt
@@ -0,0 +1,5 @@
+setuptools>=77.0.3,<80
+datasets>=3.0,<4.0
+flash-linear-attention
+mcoplib
+transformers<5.4.0
--- a/docker/Metax/4.1/requirements_extra.txt
+++ b/docker/Metax/4.1/requirements_extra.txt
@@ -0,0 +1,14 @@
+decord
+diffusers==0.35.2
+evalscope>=1.0.0
+evalscope[opencompass]
+evalscope[vlmeval]
+keye_vl_utils>=1.5.2
+librosa
+mpi4py
+optimum==1.27.0
+pytorchvideo
+qwen_omni_utils>=0.0.9
+qwen_vl_utils==0.0.14
+soundfile
+timm
--- a/docker/Metax/4.1/swift_building_instructions.md
+++ b/docker/Metax/4.1/swift_building_instructions.md
@@ -0,0 +1,52 @@
+# 1. Build swift 4.1 image from a UBI9 base image
+Full build from a minimal base image, using a venv virtual environment.
+
+## 1.1. Build
+```bash
+bash build.sh
+```
+
+## 1.2. Run a container
+```bash
+docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
+    --shm-size 100gb --ulimit memlock=-1 \
+    --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
+    --device=/dev/dri --device=/dev/mxcd \
+    --name base_image \
+    ${IMAGE_ID} bash
+```
+
+## 1.3. Activate the venv environment
+```bash
+source /opt/venv/bin/activate
+```
+
+## 1.4. Run swift examples
+```bash
+cd /workspace/ms-swift
+bash examples/train/full/train.sh
+```
+
+# 2. Build swift 4.1 image from a Metax release image
+Faster build based on the pre-built Metax release image.
+
+## 2.1. Build
+```bash
+bash build_from_metax_image.sh
+```
+
+## 2.2. Run a container
+```bash
+docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
+    --shm-size 100gb --ulimit memlock=-1 \
+    --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
+    --device=/dev/dri --device=/dev/mxcd \
+    --name base_image \
+    ${IMAGE_ID} bash
+```
+
+## 2.3. Run swift examples
+```bash
+cd /workspace/ms-swift
+bash examples/train/full/train.sh
+```